From nobody Fri Apr 10 12:50:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 218CDECAAA1 for ; Tue, 6 Sep 2022 20:10:45 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231230AbiIFUKm (ORCPT ); Tue, 6 Sep 2022 16:10:42 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:55358 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230087AbiIFUKP (ORCPT ); Tue, 6 Sep 2022 16:10:15 -0400 Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 35E3DBC80F; Tue, 6 Sep 2022 13:05:14 -0700 (PDT) Received: from pwmachine.numericable.fr (85-170-34-72.rev.numericable.fr [85.170.34.72]) by linux.microsoft.com (Postfix) with ESMTPSA id C7ACA20B929C; Tue, 6 Sep 2022 12:58:21 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com C7ACA20B929C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.microsoft.com; s=default; t=1662494306; bh=Y3tZA2UfXIUdO651eZpQHa6bop0UrWXdjXGmdZD7XUw=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=I0ImiMk2OSzmHAF9EcIea/Iv6FlA7LB/hWbPy9LpmWBxA/OWCrauZk3R6t9VWJC0j 3itEgHNH7uQHYpyP7WU01yZH1+GyGnywJA1qfj5ePpesxIZc6DIPD0paS1A2iERyfe wJx8sQQLjg4QvCTa3T51mNdXaCtjSca+V+FF1wrE= From: Francis Laniel To: bpf@vger.kernel.org Cc: Francis Laniel , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Jonathan Corbet , Mykola Lysenko , Shuah Khan , Joanne Koong , Dave Marchevsky , Lorenzo Bianconi , Maxim Mikityanskiy , Geliang Tang , "Naveen N. 
Rao" , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH v2 1/5] bpf: Make ring buffer overwritable. Date: Tue, 6 Sep 2022 21:56:42 +0200 Message-Id: <20220906195656.33021-2-flaniel@linux.microsoft.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com> References: <20220906195656.33021-1-flaniel@linux.microsoft.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" By default, BPF ring buffer are size bounded, when producers already filled= the buffer, they need to wait for the consumer to get those data before adding = new ones. In terms of API, bpf_ringbuf_reserve() returns NULL if the buffer is full. This patch permits making BPF ring buffer overwritable. When producers already wrote as many data as the buffer size, they will beg= in to over write existing data, so the oldest will be replaced. As a result, bpf_ringbuf_reserve() never returns NULL. To avoid memory consumption, this patch writes data backward like overwrita= ble perf ring buffer added in commit 9ecda41acb97 ("perf/core: Add ::write_backward attribute to perf eve= nt"). Signed-off-by: Francis Laniel --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/ringbuf.c | 43 ++++++++++++++++++++++++++++++---------- 2 files changed, 36 insertions(+), 10 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 59a217ca2dfd..c87a667649ab 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1227,6 +1227,9 @@ enum { =20 /* Create a map that is suitable to be an inner map with dynamic max entri= es */ BPF_F_INNER_MAP =3D (1U << 12), + +/* Create an overwritable BPF_RINGBUF */ + BFP_F_RB_OVERWRITABLE =3D (1U << 13), }; =20 /* Flags for BPF_PROG_QUERY. 
*/ diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index ded4faeca192..369c61cfe8aa 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -12,7 +12,7 @@ #include #include =20 -#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE) +#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE | BFP_F_RB_OVERWRITABLE) =20 /* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */ #define RINGBUF_PGOFF \ @@ -37,6 +37,8 @@ struct bpf_ringbuf { u64 mask; struct page **pages; int nr_pages; + __u8 overwritable: 1, + __reserved: 7; spinlock_t spinlock ____cacheline_aligned_in_smp; /* Consumer and producer counters are put into separate pages to allow * mapping consumer page as r/w, but restrict producer page to r/o. @@ -127,7 +129,12 @@ static void bpf_ringbuf_notify(struct irq_work *work) wake_up_all(&rb->waitq); } =20 -static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) +static inline bool is_overwritable(struct bpf_ringbuf *rb) +{ + return !!rb->overwritable; +} + +static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node= , __u32 flags) { struct bpf_ringbuf *rb; =20 @@ -142,6 +149,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t dat= a_sz, int numa_node) rb->mask =3D data_sz - 1; rb->consumer_pos =3D 0; rb->producer_pos =3D 0; + rb->overwritable =3D !!(flags & BFP_F_RB_OVERWRITABLE); =20 return rb; } @@ -170,7 +178,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr= *attr) =20 bpf_map_init_from_attr(&rb_map->map, attr); =20 - rb_map->rb =3D bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node= ); + rb_map->rb =3D bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node= , attr->map_flags); if (!rb_map->rb) { kfree(rb_map); return ERR_PTR(-ENOMEM); @@ -248,6 +256,7 @@ static unsigned long ringbuf_avail_data_sz(struct bpf_r= ingbuf *rb) =20 cons_pos =3D smp_load_acquire(&rb->consumer_pos); prod_pos =3D smp_load_acquire(&rb->producer_pos); + return prod_pos - cons_pos; } =20 @@ -325,14 
+334,24 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf= *rb, u64 size) } =20 prod_pos =3D rb->producer_pos; - new_prod_pos =3D prod_pos + len; =20 - /* check for out of ringbuf space by ensuring producer position - * doesn't advance more than (ringbuf_size - 1) ahead - */ - if (new_prod_pos - cons_pos > rb->mask) { - spin_unlock_irqrestore(&rb->spinlock, flags); - return NULL; + if (!is_overwritable(rb)) { + new_prod_pos =3D prod_pos + len; + + /* check for out of ringbuf space by ensuring producer position + * doesn't advance more than (ringbuf_size - 1) ahead + */ + if (new_prod_pos - cons_pos > rb->mask) { + spin_unlock_irqrestore(&rb->spinlock, flags); + return NULL; + } + } else { + /* + * With overwritable ring buffer we go from the end toward the + * beginning. + */ + prod_pos -=3D len; + new_prod_pos =3D prod_pos; } =20 hdr =3D (void *)rb->data + (prod_pos & rb->mask); @@ -457,10 +476,14 @@ BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, = u64, flags) =20 switch (flags) { case BPF_RB_AVAIL_DATA: + if (is_overwritable(rb)) + return -EINVAL; return ringbuf_avail_data_sz(rb); case BPF_RB_RING_SIZE: return rb->mask + 1; case BPF_RB_CONS_POS: + if (is_overwritable(rb)) + return -EINVAL; return smp_load_acquire(&rb->consumer_pos); case BPF_RB_PROD_POS: return smp_load_acquire(&rb->producer_pos); --=20 2.25.1 From nobody Fri Apr 10 12:50:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 3FA7DECAAA1 for ; Tue, 6 Sep 2022 20:10:42 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230389AbiIFUKk (ORCPT ); Tue, 6 Sep 2022 16:10:40 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:57268 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S230332AbiIFUKO (ORCPT ); Tue, 6 Sep 2022 
16:10:14 -0400 Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 010781083; Tue, 6 Sep 2022 13:05:13 -0700 (PDT) Received: from pwmachine.numericable.fr (85-170-34-72.rev.numericable.fr [85.170.34.72]) by linux.microsoft.com (Postfix) with ESMTPSA id 9EE5D2049BAF; Tue, 6 Sep 2022 12:58:36 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com 9EE5D2049BAF DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.microsoft.com; s=default; t=1662494321; bh=5lDK7It259RN4ymtCiet+4wq5JdclQetrfA0fw0WV64=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=n4pfBsr8YWPyT8jTqgIQ0DuqftZrl5YeBbvTauX+bmGr6Oi5Og9mJq+BFVDiKjYRZ +28qWwbzGIlpkEiLF16UdzsP9MLefcnk+gN0iBPjiPS68hMTZENletySVI9taNLRSI zMDXto3nHyBx3sfB03NjnyOBkFLegNVD8mPIMfA8= From: Francis Laniel To: bpf@vger.kernel.org Cc: Francis Laniel , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Jonathan Corbet , Mykola Lysenko , Shuah Khan , Joanne Koong , Dave Marchevsky , Lorenzo Bianconi , Maxim Mikityanskiy , Geliang Tang , "Naveen N. Rao" , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH v2 2/5] selftests: Add BPF overwritable ring buffer self tests. Date: Tue, 6 Sep 2022 21:56:43 +0200 Message-Id: <20220906195656.33021-3-flaniel@linux.microsoft.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com> References: <20220906195656.33021-1-flaniel@linux.microsoft.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Add tests to confirm behavior of overwritable BPF ring buffer, particularly= the oldest data being overwritten by newest ones. 
Signed-off-by: Francis Laniel --- tools/testing/selftests/bpf/Makefile | 5 +- .../bpf/prog_tests/ringbuf_overwritable.c | 158 ++++++++++++++++++ .../bpf/progs/test_ringbuf_overwritable.c | 61 +++++++ 3 files changed, 222 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/ringbuf_overwrit= able.c create mode 100644 tools/testing/selftests/bpf/progs/test_ringbuf_overwrit= able.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests= /bpf/Makefile index 8d59ec7f4c2d..96e95dcfc982 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -351,8 +351,9 @@ LINKED_SKELS :=3D test_static_linked.skel.h linked_func= s.skel.h \ test_usdt.skel.h =20 LSKELS :=3D kfunc_call_test.c fentry_test.c fexit_test.c fexit_sleep.c \ - test_ringbuf.c atomics.c trace_printk.c trace_vprintk.c \ - map_ptr_kern.c core_kern.c core_kern_overflow.c + test_ringbuf.c test_ringbuf_overwritable.c atomics.c \ + trace_printk.c trace_vprintk.c map_ptr_kern.c \ + core_kern.c core_kern_overflow.c # Generate both light skeleton and libbpf skeleton for these LSKELS_EXTRA :=3D test_ksyms_module.c test_ksyms_weak.c kfunc_call_test_su= bprog.c SKEL_BLACKLIST +=3D $$(LSKELS) diff --git a/tools/testing/selftests/bpf/prog_tests/ringbuf_overwritable.c = b/tools/testing/selftests/bpf/prog_tests/ringbuf_overwritable.c new file mode 100644 index 000000000000..b5ec1e62f761 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/ringbuf_overwritable.c @@ -0,0 +1,158 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "test_ringbuf_overwritable.lskel.h" + +struct sample { + int count; + /* + * filler size will be computed to have 8 samples in a 4096 bytes long + * buffer. 
+ */ + char filler[4096 / 8 - sizeof(int) - 8]; +}; + +struct ring { + ring_buffer_sample_fn sample_cb; + __u8 overwritable: 1, + __reserved: 7; + void *ctx; + void *data; + unsigned long *consumer_pos; + unsigned long *producer_pos; + unsigned long mask; + int map_fd; +}; + +struct ring_buffer { + struct epoll_event *events; + struct ring *rings; + size_t page_size; + int epoll_fd; + int ring_cnt; +}; + +static int duration; +static struct test_ringbuf_overwritable_lskel *skel; + +void test_ringbuf_overwritable(void) +{ + const size_t rec_sz =3D BPF_RINGBUF_HDR_SZ + sizeof(struct sample); + int page_size =3D getpagesize(); + int sample_cnt =3D 0, sample_read =3D 0; + unsigned long mask =3D page_size - 1; + struct ring_buffer *ringbuf; + int err, *len_ptr, len; + struct sample *sample; + long read_pos; + void *data_ptr; + + skel =3D test_ringbuf_overwritable_lskel__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + return; + + skel->maps.ringbuf.max_entries =3D page_size; + + err =3D test_ringbuf_overwritable_lskel__load(skel); + if (CHECK(err !=3D 0, "skel_load", "skeleton load failed\n")) + goto cleanup; + + /* only trigger BPF program for current process */ + skel->bss->pid =3D getpid(); + + ringbuf =3D ring_buffer__new(skel->maps.ringbuf.map_fd, NULL, NULL, NULL); + if (CHECK(!ringbuf, "ringbuf_create", "failed to create ringbuf\n")) + goto cleanup; + + /* There is only one ring in this ringbuf. */ + data_ptr =3D ringbuf->rings[0].data; + + err =3D test_ringbuf_overwritable_lskel__attach(skel); + if (CHECK(err, "skel_attach", "skeleton attachment failed: %d\n", err)) + goto cleanup; + + /* Trigger one sample. 
*/ + syscall(__NR_getpgid); + sample_cnt++; + + CHECK(skel->bss->avail_data !=3D -EINVAL, + "err_avail_size", "exp %d, got %ld\n", + -EINVAL, skel->bss->avail_data); + CHECK(skel->bss->ring_size !=3D page_size, + "err_ring_size", "exp %ld, got %ld\n", + (long)page_size, skel->bss->ring_size); + CHECK(skel->bss->cons_pos !=3D -EINVAL, + "err_cons_pos", "exp %d, got %ld\n", + -EINVAL, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos !=3D sample_cnt * -rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + sample_cnt * -rec_sz, skel->bss->prod_pos); + + len_ptr =3D data_ptr + (skel->bss->prod_pos & mask); + len =3D smp_load_acquire(len_ptr); + + CHECK(len !=3D sizeof(struct sample), + "err_sample_len", "exp %ld, got %d\n", + sizeof(struct sample), len); + + sample =3D (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + + CHECK(sample->count !=3D sample_cnt, + "err_sample_cnt", "exp %d, got %d", + sample_cnt, sample->count); + + /* Trigger many samples, so we overwrite data */ + for (int i =3D 0; i < 16; i++) { + syscall(__NR_getpgid); + sample_cnt++; + } + + CHECK(skel->bss->avail_data !=3D -EINVAL, + "err_avail_size", "exp %d, got %ld\n", + -EINVAL, skel->bss->avail_data); + CHECK(skel->bss->ring_size !=3D page_size, + "err_ring_size", "exp %ld, got %ld\n", + (long)page_size, skel->bss->ring_size); + CHECK(skel->bss->cons_pos !=3D -EINVAL, + "err_cons_pos", "exp %d, got %ld\n", + -EINVAL, skel->bss->cons_pos); + CHECK(skel->bss->prod_pos !=3D sample_cnt * -rec_sz, + "err_prod_pos", "exp %ld, got %ld\n", + sample_cnt * -rec_sz, skel->bss->prod_pos); + + read_pos =3D skel->bss->prod_pos; + sample_read =3D 0; + while (read_pos - skel->bss->prod_pos < mask) { + len_ptr =3D data_ptr + (read_pos & mask); + len =3D smp_load_acquire(len_ptr); + + sample =3D (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + + CHECK(sample->count !=3D sample_cnt - sample_read, + "err_sample_cnt", "exp %d, got %d", + sample_cnt - sample_read, sample->count); + + sample_read++; + read_pos +=3D round_up(len + 
BPF_RINGBUF_HDR_SZ, 8); + } + + CHECK(sample_read !=3D page_size / rec_sz, + "err_sample_read", "exp %ld, got %d", + page_size / rec_sz, sample_read); + + test_ringbuf_overwritable_lskel__detach(skel); +cleanup: + ring_buffer__free(ringbuf); + test_ringbuf_overwritable_lskel__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_ringbuf_overwritable.c = b/tools/testing/selftests/bpf/progs/test_ringbuf_overwritable.c new file mode 100644 index 000000000000..e28be35059b7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_ringbuf_overwritable.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook + +#include +#include +#include "bpf_misc.h" + +char _license[] SEC("license") =3D "GPL"; + +struct sample { + int count; + /* + * filler size will be computed to have 8 samples in a 4096 bytes long + * buffer. + */ + char filler[4096 / 8 - sizeof(int) - BPF_RINGBUF_HDR_SZ]; +}; + +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(map_flags, BFP_F_RB_OVERWRITABLE); +} ringbuf SEC(".maps"); + +/* inputs */ +int pid =3D 0; + +/* outputs */ +long avail_data =3D 0; +long ring_size =3D 0; +long cons_pos =3D 0; +long prod_pos =3D 0; + +static int count; + +SEC("fentry/" SYS_PREFIX "sys_getpgid") +int test_ringbuf_overwritable(void *ctx) +{ + int cur_pid =3D bpf_get_current_pid_tgid() >> 32; + struct sample *sample; + + if (cur_pid !=3D pid) + return 0; + + sample =3D bpf_ringbuf_reserve(&ringbuf, sizeof(*sample), 0); + if (!sample) + return 0; + + __sync_fetch_and_add(&count, 1); + sample->count =3D count; + + bpf_printk("count: %d\n", count); + + bpf_ringbuf_submit(sample, 0); + + avail_data =3D bpf_ringbuf_query(&ringbuf, BPF_RB_AVAIL_DATA); + ring_size =3D bpf_ringbuf_query(&ringbuf, BPF_RB_RING_SIZE); + cons_pos =3D bpf_ringbuf_query(&ringbuf, BPF_RB_CONS_POS); + prod_pos =3D bpf_ringbuf_query(&ringbuf, BPF_RB_PROD_POS); + + return 0; +} --=20 2.25.1 From nobody Fri Apr 10 12:50:51 2026 Return-Path: 
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id ED0F4C38145 for ; Tue, 6 Sep 2022 20:05:27 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231192AbiIFUF0 (ORCPT ); Tue, 6 Sep 2022 16:05:26 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:42820 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231166AbiIFUE4 (ORCPT ); Tue, 6 Sep 2022 16:04:56 -0400 Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id B18CCB5E79; Tue, 6 Sep 2022 13:00:32 -0700 (PDT) Received: from pwmachine.numericable.fr (85-170-34-72.rev.numericable.fr [85.170.34.72]) by linux.microsoft.com (Postfix) with ESMTPSA id 36F4E204A062; Tue, 6 Sep 2022 12:58:55 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com 36F4E204A062 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.microsoft.com; s=default; t=1662494339; bh=uFGRrOq/51NXIcdfjbdqGyuEN8JjUWMtBLgpVH9tTbM=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=s0ip9PP/e7tNFhTzW5Qi5hmgISsGF5ZqFozbagJLd9QyP2nb8fQwXdiRQ3lp6yhyo SYD9DSLv8MQk1PstZ+J+YXCj5NpaCpaBeetft2N5Pr3g5S/kp4+m2LBnYNKboOS+bO C7/FC7B8CQ4ZEjDDb3eC26nfWjqax71Es0yYlFR8= From: Francis Laniel To: bpf@vger.kernel.org Cc: Francis Laniel , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Jonathan Corbet , Mykola Lysenko , Shuah Khan , Joanne Koong , Dave Marchevsky , Lorenzo Bianconi , Maxim Mikityanskiy , Geliang Tang , "Naveen N. Rao" , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH v2 3/5] docs/bpf: Add documentation for overwritable ring buffer. 
Date: Tue, 6 Sep 2022 21:56:44 +0200 Message-Id: <20220906195656.33021-4-flaniel@linux.microsoft.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com> References: <20220906195656.33021-1-flaniel@linux.microsoft.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Add documentation to describe the behavior of overwritable BPF ring buffers compa= red to conventional ones. Signed-off-by: Francis Laniel --- Documentation/bpf/ringbuf.rst | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/Documentation/bpf/ringbuf.rst b/Documentation/bpf/ringbuf.rst index 6a615cd62bda..e062381ff604 100644 --- a/Documentation/bpf/ringbuf.rst +++ b/Documentation/bpf/ringbuf.rst @@ -124,7 +124,7 @@ buffer. Currently 4 are supported: =20 - ``BPF_RB_AVAIL_DATA`` returns amount of unconsumed data in ring buffer; - ``BPF_RB_RING_SIZE`` returns the size of ring buffer; -- ``BPF_RB_CONS_POS``/``BPF_RB_PROD_POS`` returns current logical possition +- ``BPF_RB_CONS_POS``/``BPF_RB_PROD_POS`` returns current logical position of consumer/producer, respectively. =20 Returned values are momentarily snapshots of ring buffer state and could be @@ -204,3 +204,19 @@ buffer. For extreme cases, when BPF program wants more= manual control of notifications, commit/discard/output helpers accept ``BPF_RB_NO_WAKEUP`` a= nd ``BPF_RB_FORCE_WAKEUP`` flags, which give full control over notifications = of data availability, but require extra caution and diligence in using this A= PI. + +Specific case of overwritable ring buffer +----------------------------------------- + +Using ``BFP_F_RB_OVERWRITABLE`` when creating the ring buffer will make it +overwritable.
+As a consequence, the producers will never be stopped from writing data, *= i.e.* +in this mode ``bpf_ringbuf_reserve()`` never blocks nor returns NULL, but = oldest +events will be replaced by newest ones. + +In terms of implementation, this feature uses the same logic as overwrit= able +perf ring buffer. +The ring buffer is written backward, while it should be read forward from = the +producer position. +As a consequence, in this mode, the consumer position has no meaning and c= an be +used freely by userspace implementation. --=20 2.25.1 From nobody Fri Apr 10 12:50:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 9F5AFC38145 for ; Tue, 6 Sep 2022 20:06:08 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229779AbiIFUGH (ORCPT ); Tue, 6 Sep 2022 16:06:07 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:44220 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231166AbiIFUFf (ORCPT ); Tue, 6 Sep 2022 16:05:35 -0400 Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 71F82BD296; Tue, 6 Sep 2022 13:01:11 -0700 (PDT) Received: from pwmachine.numericable.fr (85-170-34-72.rev.numericable.fr [85.170.34.72]) by linux.microsoft.com (Postfix) with ESMTPSA id A4272204A0FA; Tue, 6 Sep 2022 12:59:13 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com A4272204A0FA DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.microsoft.com; s=default; t=1662494358; bh=nWaEkFlsTxoT2qCkLeEAAJWZstzNd2OmaAn1xDktTx8=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=fXuGmpIIuRaK5ubeN/epmC7TP0PdLKlpXY6ps9ZJc116zqiws8ot0O0RNd5tCNba3 nTYBgwtzJqVdSySys+DMBGlf7Ye4e0bvbxvoIvh34iROJgnSn0hI8Xs6QI1KFL697Q
I/d/d2hdG5YcMHwma37GhE4IYAs24Y5qdcdCgrAY= From: Francis Laniel To: bpf@vger.kernel.org Cc: Francis Laniel , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Jonathan Corbet , Mykola Lysenko , Shuah Khan , Joanne Koong , Dave Marchevsky , Lorenzo Bianconi , Maxim Mikityanskiy , Geliang Tang , "Naveen N. Rao" , linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH v2 4/5] libbpf: Add implementation to consume overwritable BPF ring buffer. Date: Tue, 6 Sep 2022 21:56:45 +0200 Message-Id: <20220906195656.33021-5-flaniel@linux.microsoft.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com> References: <20220906195656.33021-1-flaniel@linux.microsoft.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" If the BPF ring buffer is overwritable, ringbuf_process_overwritable_ring()= will be called to handle the data consumption. All the available data will be consumed but some checks will be performed: * check we do not read data we already read, if there is no new data, nothi= ng happens. * check we do not read more than the buffer size. * check we do not read invalid data by checking they fit the buffer size. 
Signed-off-by: Francis Laniel --- tools/include/uapi/linux/bpf.h | 3 + tools/lib/bpf/ringbuf.c | 106 +++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 59a217ca2dfd..cd73a89e8ead 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1227,6 +1227,9 @@ enum { =20 /* Create a map that is suitable to be an inner map with dynamic max entri= es */ BPF_F_INNER_MAP =3D (1U << 12), + +/* Create an over writable BPF_RINGBUF */ + BFP_F_RB_OVERWRITABLE =3D (1U << 13), }; =20 /* Flags for BPF_PROG_QUERY. */ diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c index 8bc117bcc7bc..2362a6280fc5 100644 --- a/tools/lib/bpf/ringbuf.c +++ b/tools/lib/bpf/ringbuf.c @@ -23,6 +23,8 @@ =20 struct ring { ring_buffer_sample_fn sample_cb; + __u8 overwritable: 1, + __reserved: 7; void *ctx; void *data; unsigned long *consumer_pos; @@ -51,6 +53,11 @@ static void ringbuf_unmap_ring(struct ring_buffer *rb, s= truct ring *r) } } =20 +static inline bool is_overwritable(struct ring *r) +{ + return !!r->overwritable; +} + /* Add extra RINGBUF maps to this ring buffer manager */ int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx) @@ -95,6 +102,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, r->sample_cb =3D sample_cb; r->ctx =3D ctx; r->mask =3D info.max_entries - 1; + r->overwritable =3D !!(info.map_flags & BFP_F_RB_OVERWRITABLE); =20 /* Map writable consumer page */ tmp =3D mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, @@ -202,6 +210,101 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } =20 + +static int64_t ringbuf_process_overwritable_ring(struct ring *r) +{ + /* 64-bit to avoid overflow in case of extreme application behavior */ + int64_t cnt =3D 0; + unsigned long read_pos, prod_pos, previous_prod_pos; + + prod_pos =3D smp_load_acquire(r->producer_pos); + 
previous_prod_pos =3D smp_load_acquire(r->consumer_pos); + + /* + * For overwritable ring buffer, we use consumer_pos as the previous + * producer_pos. + * So, if between two calls to this function, the prod_pos did not move, + * it means there is no new data, so we can return right now rather than + * dealing with data we already proceeded. + * NOTE the kernel space does not care about consumer_pos to reserve() + * in overwritable ring buffers, hence we can hijack this field. + */ + if (previous_prod_pos =3D=3D prod_pos) + return 0; + + /* + * BPF ring buffer is over writable, we start reading from + * producer position. + */ + read_pos =3D prod_pos; + while (read_pos - prod_pos < r->mask) { + int *len_ptr, len; + + len_ptr =3D r->data + (read_pos & r->mask); + len =3D smp_load_acquire(len_ptr); + + /* sample not committed yet, bail out for now */ + if (len & BPF_RINGBUF_BUSY_BIT) + break; + + /* + * If len is 0, it means we read all the data + * available in the buffer and jump on 0 data: + * + * prod_pos read_pos + * | | + * V V + * +---+------+----------+-------+------+ + * | |D....D|C........C|B.....B|A....A| + * +---+------+----------+-------+------+ + */ + if (!len) + break; + + /* + * If adding the event len to the current + * consumer position makes us wrap the buffer, + * it means we already did "one loop" around the + * buffer. 
+ * So, the pointed data would not be usable: + * + * prod_pos + * read_pos----+ | + * | | + * V V + * +---+------+----------+-------+---+--+ + * |..E|D....D|C........C|B.....B|A..|E.| + * +---+------+----------+-------+---+--+ + */ + if (read_pos - prod_pos + len > r->mask) + break; + + read_pos +=3D roundup_len(len); + + if ((len & BPF_RINGBUF_DISCARD_BIT) =3D=3D 0) { + void *sample; + int err; + + sample =3D (void *)len_ptr + BPF_RINGBUF_HDR_SZ; + err =3D r->sample_cb(r->ctx, sample, len); + if (err < 0) { + /* update consumer pos and bail out */ + smp_store_release(r->consumer_pos, + prod_pos); + return err; + } + cnt++; + } + + /* This prevents reading data we already processed. */ + if (previous_prod_pos && read_pos >=3D previous_prod_pos) + break; + } + + smp_store_release(r->consumer_pos, prod_pos); + return cnt; +} + static int64_t ringbuf_process_ring(struct ring* r) { int *len_ptr, len, err; @@ -211,6 +314,9 @@ static int64_t ringbuf_process_ring(struct ring* r) bool got_new_data; void *sample; =20 + if (is_overwritable(r)) + return ringbuf_process_overwritable_ring(r); + cons_pos =3D smp_load_acquire(r->consumer_pos); do { got_new_data =3D false; --=20 2.25.1 From nobody Fri Apr 10 12:50:51 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id EC215C38145 for ; Tue, 6 Sep 2022 20:06:02 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S229816AbiIFUGB (ORCPT ); Tue, 6 Sep 2022 16:06:01 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:54912 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S229956AbiIFUFe (ORCPT ); Tue, 6 Sep 2022 16:05:34 -0400 Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id DAB97B602B; Tue, 6 
Sep 2022 13:01:10 -0700 (PDT) Received: from pwmachine.numericable.fr (85-170-34-72.rev.numericable.fr [85.170.34.72]) by linux.microsoft.com (Postfix) with ESMTPSA id EAFD4204A580; Tue, 6 Sep 2022 12:59:36 -0700 (PDT) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com EAFD4204A580 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.microsoft.com; s=default; t=1662494381; bh=N4qKGNRCArorGaqJPvl3arifkj3tGRGaxyc5hkP2FRA=; h=From:To:Cc:Subject:Date:In-Reply-To:References:From; b=YHDuOBp3oEAQ+LnOL1ippAIf7DJGdmUY1DDeO2SFn6XY2urLU+ZehCkolr/XgcX+t v1k3Pkj1O9uk1MP4cRwir7XBkmbkke1JOvRWMhZrHi5sdj3dHN1aM/lcT4XeHqRtrA 0tiIgnXQqhhkb/ZRVWw839cEBZZVp3aeC5DThNEM= From: Francis Laniel To: bpf@vger.kernel.org Cc: Francis Laniel , Alexei Starovoitov , Daniel Borkmann , Andrii Nakryiko , Martin KaFai Lau , Song Liu , Yonghong Song , John Fastabend , KP Singh , Stanislav Fomichev , Hao Luo , Jiri Olsa , Jonathan Corbet , Mykola Lysenko , Shuah Khan , Joanne Koong , Dave Marchevsky , Lorenzo Bianconi , Maxim Mikityanskiy , Geliang Tang , "Naveen N. Rao" , linux-kernel@vger.kernel.org, linux-doc@vger.kernel.org, linux-kselftest@vger.kernel.org Subject: [RFC PATCH v2 5/5] for test purpose only: Add toy to play with BPF ring. Date: Tue, 6 Sep 2022 21:56:46 +0200 Message-Id: <20220906195656.33021-6-flaniel@linux.microsoft.com> X-Mailer: git-send-email 2.25.1 In-Reply-To: <20220906195656.33021-1-flaniel@linux.microsoft.com> References: <20220906195656.33021-1-flaniel@linux.microsoft.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" This patch should be applied on iovisor/bcc. 
Signed-off-by: Francis Laniel --- ...-only-Add-toy-to-play-with-BPF-ring-.patch | 147 ++++++++++++++++++ 1 file changed, 147 insertions(+) create mode 100644 0001-for-test-purpose-only-Add-toy-to-play-with-BPF-rin= g-.patch diff --git a/0001-for-test-purpose-only-Add-toy-to-play-with-BPF-ring-.patc= h b/0001-for-test-purpose-only-Add-toy-to-play-with-BPF-ring-.patch new file mode 100644 index 000000000000..37d08cc08a88 --- /dev/null +++ b/0001-for-test-purpose-only-Add-toy-to-play-with-BPF-ring-.patch @@ -0,0 +1,147 @@ +From e4b95b1f9625f62d0978173973070dce38bd7210 Mon Sep 17 00:00:00 2001 +From: Francis Laniel +Date: Tue, 9 Aug 2022 18:18:53 +0200 +Subject: [PATCH] for test purpose only: Add toy to play with BPF ring buff= er. + +Signed-off-by: Francis Laniel +--- + libbpf-tools/Makefile | 1 + + libbpf-tools/toy.bpf.c | 29 +++++++++++++++++++ + libbpf-tools/toy.c | 65 ++++++++++++++++++++++++++++++++++++++++++ + libbpf-tools/toy.h | 4 +++ + 4 files changed, 99 insertions(+) + create mode 100644 libbpf-tools/toy.bpf.c + create mode 100644 libbpf-tools/toy.c + create mode 100644 libbpf-tools/toy.h + +diff --git a/libbpf-tools/Makefile b/libbpf-tools/Makefile +index 3e40f6e5..0d81d3b7 100644 +--- a/libbpf-tools/Makefile ++++ b/libbpf-tools/Makefile +@@ -68,6 +68,7 @@ APPS =3D \ + tcplife \ + tcprtt \ + tcpsynbl \ ++ toy \ + vfsstat \ + # +=20 +diff --git a/libbpf-tools/toy.bpf.c b/libbpf-tools/toy.bpf.c +new file mode 100644 +index 00000000..3c28a20b +--- /dev/null ++++ b/libbpf-tools/toy.bpf.c +@@ -0,0 +1,29 @@ ++#include ++#include ++#include ++#include "toy.h" ++ ++ ++struct { ++ __uint(type, BPF_MAP_TYPE_RINGBUF); ++ __uint(max_entries, 4096); ++ __uint(map_flags, 1U << 13); ++} buffer SEC(".maps"); ++ ++static __u32 count =3D 0; ++ ++SEC("tracepoint/syscalls/sys_enter_execve") ++int sys_enter_execve(void) { ++ count++; ++ struct event *event =3D bpf_ringbuf_reserve(&buffer, sizeof(struct event= ), 0); ++ if (!event) { ++ return 1; ++ } ++ ++ event->count =3D 
count; ++ bpf_ringbuf_submit(event, 0); ++ ++ return 0; ++} ++ ++char _license[] SEC("license") =3D "GPL"; +diff --git a/libbpf-tools/toy.c b/libbpf-tools/toy.c +new file mode 100644 +index 00000000..4cd8b588 +--- /dev/null ++++ b/libbpf-tools/toy.c +@@ -0,0 +1,65 @@ ++#include ++#include ++#include ++#include "toy.h" ++#include "toy.skel.h" ++#include "btf_helpers.h" ++ ++ ++static int buf_process_sample(void *ctx, void *data, size_t len) { ++ struct event *evt =3D (struct event *)data; ++ ++ printf("%d\n", evt->count); ++ ++ return 0; ++} ++ ++int main(void) { ++ LIBBPF_OPTS(bpf_object_open_opts, open_opts); ++ int buffer_map_fd =3D -1; ++ struct toy_bpf *obj; ++ int err; ++ ++ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); ++ ++ err =3D ensure_core_btf(&open_opts); ++ if (err) { ++ fprintf(stderr, "failed to fetch necessary BTF for CO-RE: %s\n", strerr= or(-err)); ++ return 1; ++ } ++ ++ obj =3D toy_bpf__open_opts(&open_opts); ++ if (!obj) { ++ fprintf(stderr, "failed to open BPF object\n"); ++ return 1; ++ } ++ ++ err =3D toy_bpf__load(obj); ++ if (err) { ++ fprintf(stderr, "failed to load BPF object: %d\n", err); ++ return 1; ++ } ++ ++ struct ring_buffer *ring_buffer; ++ ++ buffer_map_fd =3D bpf_object__find_map_fd_by_name(obj->obj, "buffer"); ++ ring_buffer =3D ring_buffer__new(buffer_map_fd, buf_process_sample, NULL= , NULL); ++ ++ if(!ring_buffer) { ++ fprintf(stderr, "failed to create ring buffer\n"); ++ return 1; ++ } ++ ++ err =3D toy_bpf__attach(obj); ++ if (err) { ++ fprintf(stderr, "failed to attach BPF programs\n"); ++ return 1; ++ } ++ ++ for (;;) { ++ ring_buffer__consume(ring_buffer); ++ sleep(1); ++ } ++ ++ return 0; ++} +diff --git a/libbpf-tools/toy.h b/libbpf-tools/toy.h +new file mode 100644 +index 00000000..ebfedf06 +--- /dev/null ++++ b/libbpf-tools/toy.h +@@ -0,0 +1,4 @@ ++struct event { ++ __u32 count; ++ char filler[4096 / 8 - sizeof(__u32) - 8]; ++}; +--=20 +2.25.1 + --=20 2.25.1