Add tests to verify that updating hash and local storage maps decrements
the refcount of the BPF_KPTR_REF object stored in the old map value.
The tests perform the following steps:
1. Call update_elem() to insert an initial value.
2. Use bpf_refcount_acquire() to increment the refcount.
3. Store the node pointer in the map value.
4. Add the node to a linked list.
5. Probe-read the refcount and verify it is *2*.
6. Call update_elem() again to trigger refcount decrement.
7. Probe-read the refcount and verify it is *1*.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
.../bpf/prog_tests/refcounted_kptr.c | 178 +++++++++++++++++-
.../selftests/bpf/progs/refcounted_kptr.c | 160 ++++++++++++++++
2 files changed, 337 insertions(+), 1 deletion(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
index d6bd5e16e6372..0a60330a1f4b3 100644
--- a/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/prog_tests/refcounted_kptr.c
@@ -3,7 +3,7 @@
#include <test_progs.h>
#include <network_helpers.h>
-
+#include "cgroup_helpers.h"
#include "refcounted_kptr.skel.h"
#include "refcounted_kptr_fail.skel.h"
@@ -44,3 +44,179 @@ void test_refcounted_kptr_wrong_owner(void)
ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
refcounted_kptr__destroy(skel);
}
+
+static void test_refcnt_leak(void *values, size_t values_sz, u64 flags, struct bpf_map *map,
+ struct bpf_program *prog_leak, struct bpf_program *prog_check)
+{
+ int ret, fd, key = 0;
+ LIBBPF_OPTS(bpf_test_run_opts, opts,
+ .data_in = &pkt_v4,
+ .data_size_in = sizeof(pkt_v4),
+ .repeat = 1,
+ );
+
+ ret = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, flags);
+ if (!ASSERT_OK(ret, "bpf_map__update_elem init"))
+ return;
+
+ fd = bpf_program__fd(prog_leak);
+ ret = bpf_prog_test_run_opts(fd, &opts);
+ if (!ASSERT_OK(ret, "test_run_opts"))
+ return;
+ if (!ASSERT_EQ(opts.retval, 2, "retval refcount"))
+ return;
+
+ ret = bpf_map__update_elem(map, &key, sizeof(key), values, values_sz, flags);
+ if (!ASSERT_OK(ret, "bpf_map__update_elem dec refcount"))
+ return;
+
+ fd = bpf_program__fd(prog_check);
+ ret = bpf_prog_test_run_opts(fd, &opts);
+ ASSERT_OK(ret, "test_run_opts");
+ ASSERT_EQ(opts.retval, 1, "retval");
+}
+
+static void test_percpu_hash_refcount_leak(void)
+{
+ struct refcounted_kptr *skel;
+ size_t values_sz;
+ u64 *values;
+ int cpu_nr;
+
+ cpu_nr = libbpf_num_possible_cpus();
+ if (!ASSERT_GT(cpu_nr, 0, "libbpf_num_possible_cpus"))
+ return;
+
+ values = calloc(cpu_nr, sizeof(u64));
+ if (!ASSERT_OK_PTR(values, "calloc values"))
+ return;
+
+ skel = refcounted_kptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load")) {
+ free(values);
+ return;
+ }
+
+ values_sz = cpu_nr * sizeof(u64);
+ memset(values, 0, values_sz);
+
+ test_refcnt_leak(values, values_sz, 0, skel->maps.pcpu_hash,
+ skel->progs.pcpu_hash_refcount_leak,
+ skel->progs.check_pcpu_hash_refcount);
+
+ refcounted_kptr__destroy(skel);
+ free(values);
+}
+
+struct lock_map_value {
+ u64 kptr;
+ struct bpf_spin_lock lock;
+ int value;
+};
+
+static void test_hash_lock_refcount_leak(void)
+{
+ struct lock_map_value value = {};
+ struct refcounted_kptr *skel;
+
+ skel = refcounted_kptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
+ return;
+
+ test_refcnt_leak(&value, sizeof(value), BPF_F_LOCK, skel->maps.lock_hash,
+ skel->progs.hash_lock_refcount_leak,
+ skel->progs.check_hash_lock_refcount);
+
+ refcounted_kptr__destroy(skel);
+}
+
+static void test_cgrp_storage_refcount_leak(u64 flags)
+{
+ int server_fd = -1, client_fd = -1;
+ struct lock_map_value value = {};
+ struct refcounted_kptr *skel;
+ struct bpf_link *link;
+ struct bpf_map *map;
+ int cgroup, err;
+
+ cgroup = test__join_cgroup("/cg_refcount_leak");
+ if (!ASSERT_GE(cgroup, 0, "test__join_cgroup"))
+ return;
+
+ skel = refcounted_kptr__open_and_load();
+ if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
+ goto out;
+
+ link = bpf_program__attach_cgroup(skel->progs.cgroup_storage_refcount_leak, cgroup);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach_cgroup"))
+ goto out;
+ skel->links.cgroup_storage_refcount_leak = link;
+
+ server_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0);
+ if (!ASSERT_GE(server_fd, 0, "start_server"))
+ goto out;
+
+ client_fd = connect_to_fd(server_fd, 0);
+ if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+ goto out;
+
+ map = skel->maps.cgrp_strg;
+ err = bpf_map__lookup_elem(map, &cgroup, sizeof(cgroup), &value, sizeof(value), flags);
+ if (!ASSERT_OK(err, "bpf_map__lookup_elem"))
+ goto out;
+
+ ASSERT_EQ(value.value, 2, "refcount");
+
+ err = bpf_map__update_elem(map, &cgroup, sizeof(cgroup), &value, sizeof(value), flags);
+ if (!ASSERT_OK(err, "bpf_map__update_elem"))
+ goto out;
+
+ err = bpf_link__detach(skel->links.cgroup_storage_refcount_leak);
+ if (!ASSERT_OK(err, "bpf_link__detach"))
+ goto out;
+
+ link = bpf_program__attach(skel->progs.check_cgroup_storage_refcount);
+ if (!ASSERT_OK_PTR(link, "bpf_program__attach"))
+ goto out;
+ skel->links.check_cgroup_storage_refcount = link;
+
+ close(client_fd);
+ client_fd = connect_to_fd(server_fd, 0);
+ if (!ASSERT_GE(client_fd, 0, "connect_to_fd"))
+ goto out;
+
+ err = bpf_map__lookup_elem(map, &cgroup, sizeof(cgroup), &value, sizeof(value), flags);
+ if (!ASSERT_OK(err, "bpf_map__lookup_elem"))
+ goto out;
+
+ ASSERT_EQ(value.value, 1, "refcount");
+out:
+ close(cgroup);
+ refcounted_kptr__destroy(skel);
+ if (client_fd >= 0)
+ close(client_fd);
+ if (server_fd >= 0)
+ close(server_fd);
+}
+
+static void test_cgroup_storage_refcount_leak(void)
+{
+ test_cgrp_storage_refcount_leak(0);
+}
+
+static void test_cgroup_storage_lock_refcount_leak(void)
+{
+ test_cgrp_storage_refcount_leak(BPF_F_LOCK);
+}
+
+void test_kptr_refcount_leak(void)
+{
+ if (test__start_subtest("percpu_hash_refcount_leak"))
+ test_percpu_hash_refcount_leak();
+ if (test__start_subtest("hash_lock_refcount_leak"))
+ test_hash_lock_refcount_leak();
+ if (test__start_subtest("cgroup_storage_refcount_leak"))
+ test_cgroup_storage_refcount_leak();
+ if (test__start_subtest("cgroup_storage_lock_refcount_leak"))
+ test_cgroup_storage_lock_refcount_leak();
+}
diff --git a/tools/testing/selftests/bpf/progs/refcounted_kptr.c b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
index 893a4fdb4b6e9..09efae9537c9b 100644
--- a/tools/testing/selftests/bpf/progs/refcounted_kptr.c
+++ b/tools/testing/selftests/bpf/progs/refcounted_kptr.c
@@ -7,6 +7,7 @@
#include <bpf/bpf_core_read.h>
#include "bpf_misc.h"
#include "bpf_experimental.h"
+#include "bpf_tracing_net.h"
extern void bpf_rcu_read_lock(void) __ksym;
extern void bpf_rcu_read_unlock(void) __ksym;
@@ -568,4 +569,163 @@ int BPF_PROG(rbtree_sleepable_rcu_no_explicit_rcu_lock,
return 0;
}
+private(leak) u64 ref;
+
+static u32 probe_read_refcount(void)
+{
+ u32 refcnt;
+
+ bpf_probe_read_kernel(&refcnt, sizeof(refcnt), (void *) ref);
+ return refcnt;
+}
+
+static int __insert_in_list(struct bpf_list_head *head, struct bpf_spin_lock *lock,
+ struct node_data __kptr **node)
+{
+ struct node_data *n, *m;
+
+ n = bpf_obj_new(typeof(*n));
+ if (!n)
+ return -1;
+
+ m = bpf_refcount_acquire(n);
+ n = bpf_kptr_xchg(node, n);
+ if (n) {
+ bpf_obj_drop(n);
+ bpf_obj_drop(m);
+ return -2;
+ }
+
+ bpf_spin_lock(lock);
+ bpf_list_push_front(head, &m->l);
+ ref = (u64)(void *) &m->ref;
+ bpf_spin_unlock(lock);
+ return probe_read_refcount();
+}
+
+static void *__lookup_map(void *map)
+{
+ int key = 0;
+
+ return bpf_map_lookup_elem(map, &key);
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_PERCPU_HASH);
+ __type(key, int);
+ __type(value, struct map_value);
+ __uint(max_entries, 1);
+} pcpu_hash SEC(".maps");
+
+SEC("tc")
+int pcpu_hash_refcount_leak(void *ctx)
+{
+ struct map_value *v;
+
+ v = __lookup_map(&pcpu_hash);
+ if (!v)
+ return 0;
+
+ return __insert_in_list(&head, &lock, &v->node);
+}
+
+SEC("tc")
+int check_pcpu_hash_refcount(void *ctx)
+{
+ return probe_read_refcount();
+}
+
+struct lock_map_value {
+ struct node_data __kptr *node;
+ struct bpf_spin_lock lock;
+ int value;
+};
+
+struct {
+ __uint(type, BPF_MAP_TYPE_HASH);
+ __type(key, int);
+ __type(value, struct lock_map_value);
+ __uint(max_entries, 1);
+} lock_hash SEC(".maps");
+
+SEC("tc")
+int hash_lock_refcount_leak(void *ctx)
+{
+ struct lock_map_value *v;
+
+ v = __lookup_map(&lock_hash);
+ if (!v)
+ return 0;
+
+ bpf_spin_lock(&v->lock);
+ v->value = 42;
+ bpf_spin_unlock(&v->lock);
+ return __insert_in_list(&head, &lock, &v->node);
+}
+
+SEC("tc")
+int check_hash_lock_refcount(void *ctx)
+{
+ return probe_read_refcount();
+}
+
+struct {
+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE);
+ __uint(map_flags, BPF_F_NO_PREALLOC);
+ __type(key, int);
+ __type(value, struct lock_map_value);
+} cgrp_strg SEC(".maps");
+
+SEC("cgroup/connect6")
+int cgroup_storage_refcount_leak(struct bpf_sock_addr *ctx)
+{
+ struct lock_map_value *v;
+ struct tcp_sock *tsk;
+ struct bpf_sock *sk;
+ u32 refcnt;
+
+ if (ctx->family != AF_INET6 || ctx->user_family != AF_INET6)
+ return 1;
+
+ sk = ctx->sk;
+ if (!sk)
+ return 1;
+
+ tsk = bpf_skc_to_tcp_sock(sk);
+ if (!tsk)
+ return 1;
+
+ v = bpf_cgrp_storage_get(&cgrp_strg, tsk->inet_conn.icsk_inet.sk.sk_cgrp_data.cgroup, 0,
+ BPF_LOCAL_STORAGE_GET_F_CREATE);
+ if (!v)
+ return 1;
+
+ refcnt = __insert_in_list(&head, &lock, &v->node);
+ bpf_spin_lock(&v->lock);
+ v->value = refcnt;
+ bpf_spin_unlock(&v->lock);
+ return 1;
+}
+
+SEC("fexit/inet_stream_connect")
+int BPF_PROG(check_cgroup_storage_refcount, struct socket *sock, struct sockaddr *uaddr, int addr_len,
+ int flags)
+{
+ struct lock_map_value *v;
+ u32 refcnt;
+
+ if (uaddr->sa_family != AF_INET6)
+ return 0;
+
+ v = bpf_cgrp_storage_get(&cgrp_strg, sock->sk->sk_cgrp_data.cgroup, 0, 0);
+ if (!v)
+ return 0;
+
+ refcnt = probe_read_refcount();
+ bpf_spin_lock(&v->lock);
+ v->value = refcnt;
+ bpf_spin_unlock(&v->lock);
+ return 0;
+}
+
char _license[] SEC("license") = "GPL";
--
2.51.0
On Sun, Oct 26, 2025 at 8:42 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> Add tests to verify that updating hash and local storage maps decrements
> refcount when BPF_KPTR_REF objects are involved.
>
> The tests perform the following steps:
>
> 1. Call update_elem() to insert an initial value.
> 2. Use bpf_refcount_acquire() to increment the refcount.
> 3. Store the node pointer in the map value.
> 4. Add the node to a linked list.
> 5. Probe-read the refcount and verify it is *2*.
> 6. Call update_elem() again to trigger refcount decrement.
> 7. Probe-read the refcount and verify it is *1*.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> .../bpf/prog_tests/refcounted_kptr.c | 178 +++++++++++++++++-
> .../selftests/bpf/progs/refcounted_kptr.c | 160 ++++++++++++++++
> 2 files changed, 337 insertions(+), 1 deletion(-)
[...]
> @@ -44,3 +44,179 @@ void test_refcounted_kptr_wrong_owner(void)
> ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
> refcounted_kptr__destroy(skel);
> }
> +
> +static void test_refcnt_leak(void *values, size_t values_sz, u64 flags, struct bpf_map *map,
> + struct bpf_program *prog_leak, struct bpf_program *prog_check)
[...]
> +}
Just use syscall BPF programs across the different subtests, and you can
share this test_refcnt_leak() among all of them.
It also saves you the code that sets up bpf_test_run_opts: you can simply
call bpf_prog_test_run_opts(prog_fd, NULL), since you don't pass any input
through ctx.
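For instance, the per-CPU hash leak prog could become something like the
sketch below (untested; the syscall prog name is made up, and it reuses
the __lookup_map()/__insert_in_list() helpers from this patch):

SEC("syscall")
int syscall_pcpu_hash_refcount_leak(void *ctx)
{
	struct map_value *v;

	v = __lookup_map(&pcpu_hash);
	if (!v)
		return 0;

	/* The observed refcount becomes the prog's return value, which
	 * userspace reads via opts.retval; with NULL opts, stash it in a
	 * global and read it through the skeleton instead.
	 */
	return __insert_in_list(&head, &lock, &v->node);
}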
> +
> +static void test_percpu_hash_refcount_leak(void)
> +{
[...]
> +out:
> + close(cgroup);
> + refcounted_kptr__destroy(skel);
> + if (client_fd >= 0)
> + close(client_fd);
> + if (server_fd >= 0)
> + close(server_fd);
> +}
Then you won't need to set up a server and a connection just to
read/write cgroup local storage. Just call test_refcnt_leak(), which then
runs the two BPF syscall programs for cgroup local storage.
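Concretely, the cgroup storage subtest could then shrink to something like
the sketch below (untested; the syscall prog names and the leak_refcount
global are placeholders):

static void test_cgrp_storage_refcount_leak_syscall(void)
{
	struct lock_map_value value = {};
	struct refcounted_kptr *skel;
	int cgroup, err;

	cgroup = test__join_cgroup("/cg_refcount_leak");
	if (!ASSERT_GE(cgroup, 0, "test__join_cgroup"))
		return;

	skel = refcounted_kptr__open_and_load();
	if (!ASSERT_OK_PTR(skel, "refcounted_kptr__open_and_load"))
		goto out;

	/* Hypothetical syscall prog: acquires the extra ref and stores the
	 * node in the current task's cgroup storage.
	 */
	err = bpf_prog_test_run_opts(
		bpf_program__fd(skel->progs.syscall_cgroup_storage_leak), NULL);
	if (!ASSERT_OK(err, "test_run leak"))
		goto out;
	ASSERT_EQ(skel->bss->leak_refcount, 2, "refcount after leak");

	/* update_elem() with the cgroup fd as key replaces the stored kptr
	 * and should drop its refcount.
	 */
	err = bpf_map__update_elem(skel->maps.cgrp_strg, &cgroup, sizeof(cgroup),
				   &value, sizeof(value), 0);
	if (!ASSERT_OK(err, "bpf_map__update_elem"))
		goto out;

	/* Hypothetical syscall prog: re-reads the refcount. */
	err = bpf_prog_test_run_opts(
		bpf_program__fd(skel->progs.syscall_check_cgroup_storage), NULL);
	ASSERT_OK(err, "test_run check");
	ASSERT_EQ(skel->bss->leak_refcount, 1, "refcount after update");
out:
	close(cgroup);
	refcounted_kptr__destroy(skel);
}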
[...]
> +SEC("cgroup/connect6")
> +int cgroup_storage_refcount_leak(struct bpf_sock_addr *ctx)
> +{
> + struct lock_map_value *v;
> + struct tcp_sock *tsk;
> + struct bpf_sock *sk;
> + u32 refcnt;
> +
> + if (ctx->family != AF_INET6 || ctx->user_family != AF_INET6)
> + return 1;
> +
> + sk = ctx->sk;
> + if (!sk)
> + return 1;
> +
> + tsk = bpf_skc_to_tcp_sock(sk);
> + if (!tsk)
> + return 1;
> +
> + v = bpf_cgrp_storage_get(&cgrp_strg, tsk->inet_conn.icsk_inet.sk.sk_cgrp_data.cgroup, 0,
> + BPF_LOCAL_STORAGE_GET_F_CREATE);
> + if (!v)
> + return 1;
> +
> + refcnt = __insert_in_list(&head, &lock, &v->node);
> + bpf_spin_lock(&v->lock);
> + v->value = refcnt;
> + bpf_spin_unlock(&v->lock);
> + return 1;
> +}
And in a syscall BPF program, you can simply get the cgroup through the
current task:
SEC("syscall")
int syscall_prog(void *ctx)
{
struct task_struct *task = bpf_get_current_task_btf();
v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0,
BPF_LOCAL_STORAGE_GET_F_CREATE);
...
}
[...]
On 2025/10/28 00:34, Amery Hung wrote:
> On Sun, Oct 26, 2025 at 8:42 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> Add tests to verify that updating hash and local storage maps decrements
>> refcount when BPF_KPTR_REF objects are involved.
[...]
>> @@ -44,3 +44,179 @@ void test_refcounted_kptr_wrong_owner(void)
>> ASSERT_OK(opts.retval, "rbtree_wrong_owner_remove_fail_a2 retval");
>> refcounted_kptr__destroy(skel);
>> }
>> +
>> +static void test_refcnt_leak(void *values, size_t values_sz, u64 flags, struct bpf_map *map,
>> + struct bpf_program *prog_leak, struct bpf_program *prog_check)
>> +{
[...]
>> +}
>
> Just use syscall BPF programs across different subtests, and you can
> share this test_refcnt_leak() across subtests.
>
> It also saves you some code setting up bpf_test_run_opts. You can just
> call bpf_prog_test_run_opts(prog_fd, NULL) as you don't pass any input
> from ctx.
>
>> +
>> +static void test_percpu_hash_refcount_leak(void)
>> +{
[...]
>> +out:
>> + close(cgroup);
>> + refcounted_kptr__destroy(skel);
>> + if (client_fd >= 0)
>> + close(client_fd);
>> + if (server_fd >= 0)
>> + close(server_fd);
>> +}
>
> Then, you won't need to set up server, connection.... just to
> read/write cgroup local storage. Just call test_refcnt_leak() that
> runs the two BPF syscall programs for cgroup local storage.
>
[...]
>
>
> And in a syscall BPF program, you can simply get the cgroup through the
> current task:
>
> SEC("syscall")
> int syscall_prog(void *ctx)
> {
> struct task_struct *task = bpf_get_current_task_btf();
>
> v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0,
> BPF_LOCAL_STORAGE_GET_F_CREATE);
> ...
> }
>
Hi Amery,
Thanks for the suggestion.
I tried your approach, but the verifier rejected it with the following
error:
0: R1=ctx() R10=fp0
; task = bpf_get_current_task_btf(); @ refcounted_kptr.c:686
0: (85) call bpf_get_current_task_btf#158 ; R0=trusted_ptr_task_struct()
; v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0, @
refcounted_kptr.c:687
1: (79) r1 = *(u64 *)(r0 +4856) ; R0=trusted_ptr_task_struct()
R1=untrusted_ptr_css_set()
2: (79) r2 = *(u64 *)(r1 +96) ; R1=untrusted_ptr_css_set()
R2=untrusted_ptr_cgroup()
3: (18) r1 = 0xffffa1b442a4b800 ; R1=map_ptr(map=cgrp_strg,ks=4,vs=16)
5: (b7) r3 = 0 ; R3=0
6: (b7) r4 = 1 ; R4=1
7: (85) call bpf_cgrp_storage_get#210
R2 type=untrusted_ptr_ expected=ptr_, trusted_ptr_, rcu_ptr_
processed 7 insns (limit 1000000) max_states_per_insn 0 total_states 0
peak_states 0 mark_read 0
The issue is that dereferencing task->cgroups->dfl_cgrp results in an
untrusted pointer, which bpf_cgrp_storage_get() doesn't accept in
syscall programs. I will investigate the issue later.
However, while searching for 'task->cgroups->dfl_cgrp' usage in the
selftests, I found that bpf_cgrp_storage_get() works fine in fentry
programs. This approach also has the benefit of avoiding the cgroup
setup and client/server infrastructure.
I'll respin the selftest using fentry programs instead.
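Roughly, the fentry version would look like the sketch below (untested;
the hooked function is just a placeholder used to trigger the prog):

SEC("fentry/bpf_fentry_test1")
int BPF_PROG(fentry_cgroup_storage_refcount_leak, int a)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct lock_map_value *v;
	u32 refcnt;

	/* Non-sleepable prog, so task->cgroups->dfl_cgrp is acceptable
	 * to bpf_cgrp_storage_get() here.
	 */
	v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (!v)
		return 0;

	refcnt = __insert_in_list(&head, &lock, &v->node);
	bpf_spin_lock(&v->lock);
	v->value = refcnt;
	bpf_spin_unlock(&v->lock);
	return 0;
}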
Thanks,
Leon
[...]
On 2025/10/29 22:58, Leon Hwang wrote:
[...]
> Hi Amery,
>
> Thanks for the suggestion.
>
> I tried your approach, but the verifier rejected it with the following
> error:
>
> 0: R1=ctx() R10=fp0
> ; task = bpf_get_current_task_btf(); @ refcounted_kptr.c:686
> 0: (85) call bpf_get_current_task_btf#158 ; R0=trusted_ptr_task_struct()
> ; v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0, @
> refcounted_kptr.c:687
> 1: (79) r1 = *(u64 *)(r0 +4856) ; R0=trusted_ptr_task_struct()
> R1=untrusted_ptr_css_set()
> 2: (79) r2 = *(u64 *)(r1 +96) ; R1=untrusted_ptr_css_set()
> R2=untrusted_ptr_cgroup()
> 3: (18) r1 = 0xffffa1b442a4b800 ; R1=map_ptr(map=cgrp_strg,ks=4,vs=16)
> 5: (b7) r3 = 0 ; R3=0
> 6: (b7) r4 = 1 ; R4=1
> 7: (85) call bpf_cgrp_storage_get#210
> R2 type=untrusted_ptr_ expected=ptr_, trusted_ptr_, rcu_ptr_
> processed 7 insns (limit 1000000) max_states_per_insn 0 total_states 0
> peak_states 0 mark_read 0
>
After analyzing the verifier log (with a bit of AI help), I found that
'task->cgroups->dfl_cgrp' wasn't protected by an RCU read lock. According
to verifier.c::btf_ld_kptr_type(), such a pointer may only be accessed
within an RCU critical section or from a non-sleepable program.
Adding RCU protection fixed the issue:

	bpf_rcu_read_lock();
	v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE);
	bpf_rcu_read_unlock();
With this change, test_refcnt_leak() can now be used across all subtests.
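For reference, the cgroup storage leak prog as a syscall program now looks
roughly like this (sketch; the prog name is illustrative):

SEC("syscall")
int syscall_cgroup_storage_refcount_leak(void *ctx)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct lock_map_value *v;

	/* Syscall progs are sleepable, so the explicit RCU read lock is
	 * needed to make task->cgroups->dfl_cgrp trusted here.
	 */
	bpf_rcu_read_lock();
	v = bpf_cgrp_storage_get(&cgrp_strg, task->cgroups->dfl_cgrp, 0,
				 BPF_LOCAL_STORAGE_GET_F_CREATE);
	bpf_rcu_read_unlock();
	if (!v)
		return 0;

	return __insert_in_list(&head, &lock, &v->node);
}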
Thanks again for the helpful suggestion.
Thanks,
Leon