We added a memset() operation to the XDP adjust-head path, which may cause
performance regressions.
Therefore, we added a perf test, which found that for common header-length
operations the memset() operation increases the overhead by about 2 ns —
negligible for the net stack.
Before memset:
./test_progs -a xdp_adjust_head_perf -v
run adjust head with size 6 cost 56 ns
run adjust head with size 20 cost 56 ns
run adjust head with size 40 cost 56 ns
run adjust head with size 200 cost 56 ns
After memset:
./test_progs -a xdp_adjust_head_perf -v
run adjust head with size 6 cost 58 ns
run adjust head with size 20 cost 58 ns
run adjust head with size 40 cost 58 ns
run adjust head with size 200 cost 66 ns
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
.../selftests/bpf/prog_tests/xdp_perf.c | 52 ++++++++++++++++---
tools/testing/selftests/bpf/progs/xdp_dummy.c | 14 +++++
2 files changed, 59 insertions(+), 7 deletions(-)
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
index ec5369f247cb..1b4260c6e5d7 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_perf.c
@@ -1,10 +1,11 @@
// SPDX-License-Identifier: GPL-2.0
#include <test_progs.h>
+#include <network_helpers.h>
+#include "xdp_dummy.skel.h"
void test_xdp_perf(void)
{
- const char *file = "./xdp_dummy.bpf.o";
- struct bpf_object *obj;
+ struct xdp_dummy *skel;
char in[128], out[128];
int err, prog_fd;
LIBBPF_OPTS(bpf_test_run_opts, topts,
@@ -15,14 +16,51 @@ void test_xdp_perf(void)
.repeat = 1000000,
);
- err = bpf_prog_test_load(file, BPF_PROG_TYPE_XDP, &obj, &prog_fd);
- if (CHECK_FAIL(err))
- return;
-
+ skel = xdp_dummy__open_and_load();
+ prog_fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
err = bpf_prog_test_run_opts(prog_fd, &topts);
ASSERT_OK(err, "test_run");
ASSERT_EQ(topts.retval, XDP_PASS, "test_run retval");
ASSERT_EQ(topts.data_size_out, 128, "test_run data_size_out");
- bpf_object__close(obj);
+ xdp_dummy__destroy(skel);
+}
+
+void test_xdp_adjust_head_perf(void)
+{
+ struct xdp_dummy *skel;
+ int repeat = 9000000;
+ struct xdp_md ctx_in;
+ char data[100];
+ int err, prog_fd;
+ size_t test_header_size[] = {
+ ETH_ALEN,
+ sizeof(struct iphdr),
+ sizeof(struct ipv6hdr),
+ 200,
+ };
+ DECLARE_LIBBPF_OPTS(bpf_test_run_opts, topts,
+ .data_in = &data,
+ .data_size_in = sizeof(data),
+ .repeat = repeat,
+ );
+
+ topts.ctx_in = &ctx_in;
+ topts.ctx_size_in = sizeof(ctx_in);
+ memset(&ctx_in, 0, sizeof(ctx_in));
+ ctx_in.data_meta = 0;
+ ctx_in.data_end = ctx_in.data + sizeof(data);
+
+ skel = xdp_dummy__open_and_load();
+ prog_fd = bpf_program__fd(skel->progs.xdp_dummy_adjust_head);
+
+ for (int i = 0; i < ARRAY_SIZE(test_header_size); i++) {
+ skel->bss->head_size = test_header_size[i];
+ err = bpf_prog_test_run_opts(prog_fd, &topts);
+ ASSERT_OK(err, "test_run");
+ ASSERT_EQ(topts.retval, XDP_PASS, "test_run retval");
+ fprintf(stdout, "run adjust head with size %zd cost %d ns\n",
+ test_header_size[i], topts.duration);
+ }
+ xdp_dummy__destroy(skel);
}
diff --git a/tools/testing/selftests/bpf/progs/xdp_dummy.c b/tools/testing/selftests/bpf/progs/xdp_dummy.c
index d988b2e0cee8..7bebedbbc949 100644
--- a/tools/testing/selftests/bpf/progs/xdp_dummy.c
+++ b/tools/testing/selftests/bpf/progs/xdp_dummy.c
@@ -4,10 +4,24 @@
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
+int head_size;
+
SEC("xdp")
int xdp_dummy_prog(struct xdp_md *ctx)
{
return XDP_PASS;
}
+SEC("xdp")
+int xdp_dummy_adjust_head(struct xdp_md *ctx)
+{
+ if (bpf_xdp_adjust_head(ctx, -head_size))
+ return XDP_DROP;
+
+ if (bpf_xdp_adjust_head(ctx, head_size))
+ return XDP_DROP;
+
+ return XDP_PASS;
+}
+
char _license[] SEC("license") = "GPL";
--
2.47.1
On Mon, 31 Mar 2025 11:23:45 +0800 Jiayuan Chen wrote: > which is negligible for the net stack. > > Before memset > ./test_progs -a xdp_adjust_head_perf -v > run adjust head with size 6 cost 56 ns > run adjust head with size 20 cost 56 ns > run adjust head with size 40 cost 56 ns > run adjust head with size 200 cost 56 ns > > After memset > ./test_progs -a xdp_adjust_head_perf -v > run adjust head with size 6 cost 58 ns > run adjust head with size 20 cost 58 ns > run adjust head with size 40 cost 58 ns > run adjust head with size 200 cost 66 ns FWIW I'm not sure if this is "negligible" for XDP like you say, but I defer to Jesper :)
On 03/04/2025 02.24, Jakub Kicinski wrote: > On Mon, 31 Mar 2025 11:23:45 +0800 Jiayuan Chen wrote: >> which is negligible for the net stack. >> >> Before memset >> ./test_progs -a xdp_adjust_head_perf -v >> run adjust head with size 6 cost 56 ns >> run adjust head with size 20 cost 56 ns >> run adjust head with size 40 cost 56 ns >> run adjust head with size 200 cost 56 ns >> >> After memset >> ./test_progs -a xdp_adjust_head_perf -v >> run adjust head with size 6 cost 58 ns >> run adjust head with size 20 cost 58 ns >> run adjust head with size 40 cost 58 ns >> run adjust head with size 200 cost 66 ns > > FWIW I'm not sure if this is "negligible" for XDP like you say, > but I defer to Jesper :) It would be too much for the XDP_DROP use-case, e.g. DDoS protection and driver hardware eval. But this is changing a BPF-helper, which means it is opt-in by the BPF-programmer. Thus, we can accept larger perf overhead here. I suspect your 2 nanosec overhead primarily comes from the function call overhead. On my AMD production system with SRSO mitigation enabled I expect to see around 6 ns overhead (5.699 ns), which is sad. I've done a lot of benchmarking of memset (see [1]). One take-away is that memset with small const values will get compiled into very fast code that avoids the function call (basically QWORD MOVs). E.g. memset with const 32 is extremely fast[2], on my system it takes 0.673 ns (and 0.562 ns comes from for-loop overhead). Thus, it is possible to do something faster, as we are clearing very small values. I.e. using a duff's device construct like I did for remainder in [2]. In this case, as this is a BPF-helper, I am uncertain if it is worth the complexity to add such optimizations... I guess not. This turned into a long way of saying, I'm okay with this change. 
[1] https://github.com/netoptimizer/prototype-kernel/blob/master/kernel/lib/time_bench_memset.c [2] https://github.com/netoptimizer/prototype-kernel/blob/35b1716d0c300e7fa2c8b6d8cfed2ec81df8f3a4/kernel/lib/time_bench_memset.c#L520-L521 --Jesper
© 2016 - 2025 Red Hat, Inc.