[v2] selftests/bpf: Run test_perf_branches more than 1 time

[PATCH bpf-next v2] selftests/bpf: Run test_perf_branches more than 1 time

Posted by Maya Matuszczyk 2 days, 15 hours ago

test_perf_branches bpf program was only being ran once, and
perf_branches test would fail if the only sample that was passed to
test_perf_branches didn't have branch data.
Fix those failures by running it a few more times and waiting until a valid
sample shows up, and leave the perf event only enabled for the duration of
test spins.

Signed-off-by: Maya Matuszczyk <mayamatuszczyk@google.com>
---
Changes from v1:
- Switched from CHECK to ASSERT_OK (Alexei Starovoitov)
- bpf -> bpf-next

v1: https://lore.kernel.org/bpf/20260408095710.4090495-1-mayamatuszczyk@google.com/
---
 .../selftests/bpf/prog_tests/perf_branches.c  | 57 +++++++++++++------
 .../selftests/bpf/progs/test_perf_branches.c  | 11 ++--
 2 files changed, 43 insertions(+), 25 deletions(-)

diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
index 0a7ef770c487..153afcdc082e 100644
--- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c
+++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
@@ -15,10 +15,6 @@ static void check_good_sample(struct test_perf_branches *skel)
 	int pbe_size = sizeof(struct perf_branch_entry);
 	int duration = 0;
 
-	if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
-		  "checked sample validity before prog run"))
-		return;
-
 	if (CHECK(!skel->bss->valid, "output not valid",
 		 "no valid sample from prog"))
 		return;
@@ -49,10 +45,6 @@ static void check_bad_sample(struct test_perf_branches *skel)
 	int written_stack = skel->bss->written_stack_out;
 	int duration = 0;
 
-	if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
-		  "checked sample validity before prog run"))
-		return;
-
 	if (CHECK(!skel->bss->valid, "output not valid",
 		 "no valid sample from prog"))
 		return;
@@ -73,7 +65,6 @@ static void test_perf_branches_common(int perf_fd,
 	bool detached = false;
 	struct bpf_link *link;
 	volatile int j = 0;
-	cpu_set_t cpu_set;
 
 	skel = test_perf_branches__open_and_load();
 	if (CHECK(!skel, "test_perf_branches_load",
@@ -85,19 +76,35 @@ static void test_perf_branches_common(int perf_fd,
 	if (!ASSERT_OK_PTR(link, "attach_perf_event"))
 		goto out_destroy_skel;
 
-	/* generate some branches on cpu 0 */
-	CPU_ZERO(&cpu_set);
-	CPU_SET(0, &cpu_set);
-	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
-	if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
+	err = ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0);
+	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_RESET)"))
 		goto out_destroy;
 
-	/* Spin the loop for a while by using a high iteration count, and by
-	 * checking whether the specific run count marker has been explicitly
-	 * incremented at least once by the backing perf_event BPF program.
+	err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 10);
+	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
+		goto out_destroy;
+
+	/* Make up a bunch of branch events until we time out or bpf prog sets a flag
+	 * that it caught a valid event. To be completely sure that required_size_out,
+	 * written_stack_out or written_global_out writes and reads won't get
+	 * reordered by anyone in a way that would fail this test, use atomic
+	 * operations
 	 */
-	for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i)
+	for (i = 0; i < 100000000; ++i) {
+		/* This technically should be an atomic load with acquire
+		 * ordering, but we don't want to clog up memory bus while
+		 * spinning
+		 */
+		if (READ_ONCE(skel->bss->valid) > 0)
+			break;
 		++j;
+	}
+
+	__atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+	err = ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0);
+	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_DISABLE)"))
+		goto out_destroy;
 
 	test_perf_branches__detach(skel);
 	detached = true;
@@ -121,6 +128,7 @@ static void test_perf_branches_hw(void)
 	attr.size = sizeof(attr);
 	attr.type = PERF_TYPE_HARDWARE;
 	attr.config = PERF_COUNT_HW_CPU_CYCLES;
+	attr.disabled = 1;
 	attr.freq = 1;
 	attr.sample_freq = 1000;
 	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
@@ -162,6 +170,7 @@ static void test_perf_branches_no_hw(void)
 	attr.size = sizeof(attr);
 	attr.type = PERF_TYPE_SOFTWARE;
 	attr.config = PERF_COUNT_SW_CPU_CLOCK;
+	attr.disabled = 1;
 	attr.freq = 1;
 	attr.sample_freq = 1000;
 	pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
@@ -175,6 +184,18 @@ static void test_perf_branches_no_hw(void)
 
 void test_perf_branches(void)
 {
+	int err, duration = 0;
+	cpu_set_t cpu_set;
+
+	/* Pin ourselves to cpu0 so that all branches we generate would be
+	 * also on cpu0
+	 */
+	CPU_ZERO(&cpu_set);
+	CPU_SET(0, &cpu_set);
+	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
+	if (!ASSERT_OK(err, "pthread_set_affinity"))
+		return;
+
 	if (test__start_subtest("perf_branches_hw"))
 		test_perf_branches_hw();
 	if (test__start_subtest("perf_branches_no_hw"))
diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c
index 05ac9410cd68..9806414cebc4 100644
--- a/tools/testing/selftests/bpf/progs/test_perf_branches.c
+++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c
@@ -8,7 +8,6 @@
 #include <bpf/bpf_tracing.h>
 
 int valid = 0;
-int run_cnt = 0;
 int required_size_out = 0;
 int written_stack_out = 0;
 int written_global_out = 0;
@@ -25,13 +24,11 @@ int perf_branches(void *ctx)
 	__u64 entries[4 * 3] = {0};
 	int required_size, written_stack, written_global;
 
-	++run_cnt;
-
 	/* write to stack */
 	written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
 	/* ignore spurious events */
 	if (!written_stack)
-		return 1;
+		return 0;
 
 	/* get required size */
 	required_size = bpf_read_branch_records(ctx, NULL, 0,
@@ -40,14 +37,14 @@ int perf_branches(void *ctx)
 	written_global = bpf_read_branch_records(ctx, fpbe, sizeof(fpbe), 0);
 	/* ignore spurious events */
 	if (!written_global)
-		return 1;
+		return 0;
 
 	required_size_out = required_size;
 	written_stack_out = written_stack;
 	written_global_out = written_global;
-	valid = 1;
+	__atomic_add_fetch(&valid, 1, __ATOMIC_RELEASE);
 
-	return 0;
+	return 1;
 }
 
 char _license[] SEC("license") = "GPL";
-- 
2.54.0.1032.g2f8565e1d1-goog

Re: [PATCH bpf-next v2] selftests/bpf: Run test_perf_branches more than 1 time

Posted by Mykyta Yatsenko 2 days, 14 hours ago

On 6/5/26 2:06 PM, Maya Matuszczyk wrote:
> test_perf_branches bpf program was only being ran once, and
> perf_branches test would fail if the only sample that was passed to
> test_perf_branches didn't have branch data.
> Fix those failures by running it a few more times and waiting until a valid
> sample shows up, and leave the perf event only enabled for the duration of
> test spins.
> 
> Signed-off-by: Maya Matuszczyk <mayamatuszczyk@google.com>
> ---

Change looks good.
minor nit: Maybe it makes sense to migrate all CHECKs in this test to ASSERT, 
though.

Acked-by: Mykyta Yatsenko <yatsenko@meta.com>

> Changes from v1:
> - Switched from CHECK to ASSERT_OK (Alexei Starovoitov)
> - bpf -> bpf-next
> 
> v1: https://lore.kernel.org/bpf/20260408095710.4090495-1-mayamatuszczyk@google.com/
> ---
>  .../selftests/bpf/prog_tests/perf_branches.c  | 57 +++++++++++++------
>  .../selftests/bpf/progs/test_perf_branches.c  | 11 ++--
>  2 files changed, 43 insertions(+), 25 deletions(-)
> 
> diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
> index 0a7ef770c487..153afcdc082e 100644
> --- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c
> +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
> @@ -15,10 +15,6 @@ static void check_good_sample(struct test_perf_branches *skel)
>  	int pbe_size = sizeof(struct perf_branch_entry);
>  	int duration = 0;
>  
> -	if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
> -		  "checked sample validity before prog run"))
> -		return;
> -
>  	if (CHECK(!skel->bss->valid, "output not valid",
>  		 "no valid sample from prog"))
>  		return;
> @@ -49,10 +45,6 @@ static void check_bad_sample(struct test_perf_branches *skel)
>  	int written_stack = skel->bss->written_stack_out;
>  	int duration = 0;
>  
> -	if (CHECK(!skel->bss->run_cnt, "invalid run_cnt",
> -		  "checked sample validity before prog run"))
> -		return;
> -
>  	if (CHECK(!skel->bss->valid, "output not valid",
>  		 "no valid sample from prog"))
>  		return;
> @@ -73,7 +65,6 @@ static void test_perf_branches_common(int perf_fd,
>  	bool detached = false;
>  	struct bpf_link *link;
>  	volatile int j = 0;
> -	cpu_set_t cpu_set;
>  
>  	skel = test_perf_branches__open_and_load();
>  	if (CHECK(!skel, "test_perf_branches_load",
> @@ -85,19 +76,35 @@ static void test_perf_branches_common(int perf_fd,
>  	if (!ASSERT_OK_PTR(link, "attach_perf_event"))
>  		goto out_destroy_skel;
>  
> -	/* generate some branches on cpu 0 */
> -	CPU_ZERO(&cpu_set);
> -	CPU_SET(0, &cpu_set);
> -	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
> -	if (CHECK(err, "set_affinity", "cpu #0, err %d\n", err))
> +	err = ioctl(perf_fd, PERF_EVENT_IOC_RESET, 0);
> +	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_RESET)"))
>  		goto out_destroy;
>  
> -	/* Spin the loop for a while by using a high iteration count, and by
> -	 * checking whether the specific run count marker has been explicitly
> -	 * incremented at least once by the backing perf_event BPF program.
> +	err = ioctl(perf_fd, PERF_EVENT_IOC_REFRESH, 10);
> +	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_REFRESH)"))
> +		goto out_destroy;
> +
> +	/* Make up a bunch of branch events until we time out or bpf prog sets a flag
> +	 * that it caught a valid event. To be completely sure that required_size_out,
> +	 * written_stack_out or written_global_out writes and reads won't get
> +	 * reordered by anyone in a way that would fail this test, use atomic
> +	 * operations
>  	 */
> -	for (i = 0; i < 100000000 && !*(volatile int *)&skel->bss->run_cnt; ++i)
> +	for (i = 0; i < 100000000; ++i) {
> +		/* This technically should be an atomic load with acquire
> +		 * ordering, but we don't want to clog up memory bus while
> +		 * spinning
> +		 */
> +		if (READ_ONCE(skel->bss->valid) > 0)
> +			break;
>  		++j;
> +	}
> +
> +	__atomic_thread_fence(__ATOMIC_ACQUIRE);
> +
> +	err = ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, 0);
> +	if (!ASSERT_OK(err, "ioctl(PERF_EVENT_IOC_DISABLE)"))
> +		goto out_destroy;
>  
>  	test_perf_branches__detach(skel);
>  	detached = true;
> @@ -121,6 +128,7 @@ static void test_perf_branches_hw(void)
>  	attr.size = sizeof(attr);
>  	attr.type = PERF_TYPE_HARDWARE;
>  	attr.config = PERF_COUNT_HW_CPU_CYCLES;
> +	attr.disabled = 1;
>  	attr.freq = 1;
>  	attr.sample_freq = 1000;
>  	attr.sample_type = PERF_SAMPLE_BRANCH_STACK;
> @@ -162,6 +170,7 @@ static void test_perf_branches_no_hw(void)
>  	attr.size = sizeof(attr);
>  	attr.type = PERF_TYPE_SOFTWARE;
>  	attr.config = PERF_COUNT_SW_CPU_CLOCK;
> +	attr.disabled = 1;
>  	attr.freq = 1;
>  	attr.sample_freq = 1000;
>  	pfd = syscall(__NR_perf_event_open, &attr, -1, 0, -1, PERF_FLAG_FD_CLOEXEC);
> @@ -175,6 +184,18 @@ static void test_perf_branches_no_hw(void)
>  
>  void test_perf_branches(void)
>  {
> +	int err, duration = 0;
> +	cpu_set_t cpu_set;
> +
> +	/* Pin ourselves to cpu0 so that all branches we generate would be
> +	 * also on cpu0
> +	 */
> +	CPU_ZERO(&cpu_set);
> +	CPU_SET(0, &cpu_set);
> +	err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set);
> +	if (!ASSERT_OK(err, "pthread_set_affinity"))
> +		return;
> +
>  	if (test__start_subtest("perf_branches_hw"))
>  		test_perf_branches_hw();
>  	if (test__start_subtest("perf_branches_no_hw"))
> diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c
> index 05ac9410cd68..9806414cebc4 100644
> --- a/tools/testing/selftests/bpf/progs/test_perf_branches.c
> +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c
> @@ -8,7 +8,6 @@
>  #include <bpf/bpf_tracing.h>
>  
>  int valid = 0;
> -int run_cnt = 0;
>  int required_size_out = 0;
>  int written_stack_out = 0;
>  int written_global_out = 0;
> @@ -25,13 +24,11 @@ int perf_branches(void *ctx)
>  	__u64 entries[4 * 3] = {0};
>  	int required_size, written_stack, written_global;
>  
> -	++run_cnt;
> -
>  	/* write to stack */
>  	written_stack = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
>  	/* ignore spurious events */
>  	if (!written_stack)
> -		return 1;
> +		return 0;
>  
>  	/* get required size */
>  	required_size = bpf_read_branch_records(ctx, NULL, 0,
> @@ -40,14 +37,14 @@ int perf_branches(void *ctx)
>  	written_global = bpf_read_branch_records(ctx, fpbe, sizeof(fpbe), 0);
>  	/* ignore spurious events */
>  	if (!written_global)
> -		return 1;
> +		return 0;
>  
>  	required_size_out = required_size;
>  	written_stack_out = written_stack;
>  	written_global_out = written_global;
> -	valid = 1;
> +	__atomic_add_fetch(&valid, 1, __ATOMIC_RELEASE);
>  
> -	return 0;
> +	return 1;
>  }
>  
>  char _license[] SEC("license") = "GPL";

Re: [PATCH bpf-next v2] selftests/bpf: Run test_perf_branches more than 1 time

Posted by bot+bpf-ci@kernel.org 2 days, 14 hours ago

> diff --git a/tools/testing/selftests/bpf/prog_tests/perf_branches.c b/tools/testing/selftests/bpf/prog_tests/perf_branches.c
> index 0a7ef770c487..153afcdc082e 100644
> --- a/tools/testing/selftests/bpf/prog_tests/perf_branches.c
> +++ b/tools/testing/selftests/bpf/prog_tests/perf_branches.c

[ ... ]

> @@ -85,19 +76,35 @@ static void test_perf_branches_common(int perf_fd,
> +	/* Make up a bunch of branch events until we time out or bpf prog sets a flag
> +	 * that it caught a valid event. To be completely sure that required_size_out,
> +	 * written_stack_out or written_global_out writes and reads won't get
> +	 * reordered by anyone in a way that would fail this test, use atomic
> +	 * operations
> +	 */

This isn't a bug, but a prior automated review on v2 noted these new
multi-line comments keep text on the same line as the opening /*, while
the BPF subsystem style usually wants the opening /* on its own line.

The same applies to the in-loop comment and the cpu0 pinning comment in
test_perf_branches().

> +	for (i = 0; i < 100000000; ++i) {
> +		/* This technically should be an atomic load with acquire
> +		 * ordering, but we don't want to clog up memory bus while
> +		 * spinning
> +		 */
> +		if (READ_ONCE(skel->bss->valid) > 0)
> +			break;
>  		++j;
> +	}

[ ... ]

> diff --git a/tools/testing/selftests/bpf/progs/test_perf_branches.c b/tools/testing/selftests/bpf/progs/test_perf_branches.c
> index 05ac9410cd68..9806414cebc4 100644
> --- a/tools/testing/selftests/bpf/progs/test_perf_branches.c
> +++ b/tools/testing/selftests/bpf/progs/test_perf_branches.c

[ ... ]

> @@ -40,14 +37,14 @@ int perf_branches(void *ctx)
>  	required_size_out = required_size;
>  	written_stack_out = written_stack;
>  	written_global_out = written_global;
> -	valid = 1;
> +	__atomic_add_fetch(&valid, 1, __ATOMIC_RELEASE);

Does this still emit the intended release barrier in the BPF program?

A prior automated review on v2 raised this and it does not look like it was
answered.  Because the return value of __atomic_add_fetch() is discarded,
the Clang BPF backend can lower this to a relaxed BPF atomic add (without
BPF_FETCH), which on weakly-ordered architectures would carry no release
ordering.

If that happens, could the stores to required_size_out, written_stack_out
and written_global_out be observed by the waiter after the store to valid,
so that the check_good_sample()/check_bad_sample() readers see valid > 0
but read stale outputs?

In this specific test the waiter pins itself to cpu0 and the perf event
follows the task, so the BPF producer and the userspace consumer run on the
same CPU and the reordering cannot actually be observed here.  Is relying on
that co-location intended, or would an explicit fetching/relaxed form make
the ordering contract clearer?


---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/27017486169