[PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process

Xu Kuohai posted 4 patches 2 months ago
There is a newer version of this series
[PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Xu Kuohai 2 months ago
From: Xu Kuohai <xukuohai@huawei.com>

In overwrite mode, the producer does not wait for the consumer, so the
consumer is responsible for handling conflicts. An optimistic method
is used to resolve the conflicts: the consumer first reads consumer_pos,
producer_pos and overwrite_pos, then calculates a read window and copies
data in the window from the ring buffer. After copying, it checks the
positions to decide if the data in the copy window have been overwritten
by be the producer. If so, it discards the copy and tries again. Once
success, the consumer processes the events in the copy.

Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
---
 tools/lib/bpf/ringbuf.c | 103 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 102 insertions(+), 1 deletion(-)

diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
index 9702b70da444..9c072af675ff 100644
--- a/tools/lib/bpf/ringbuf.c
+++ b/tools/lib/bpf/ringbuf.c
@@ -27,10 +27,13 @@ struct ring {
 	ring_buffer_sample_fn sample_cb;
 	void *ctx;
 	void *data;
+	void *read_buffer;
 	unsigned long *consumer_pos;
 	unsigned long *producer_pos;
+	unsigned long *overwrite_pos;
 	unsigned long mask;
 	int map_fd;
+	bool overwrite_mode;
 };
 
 struct ring_buffer {
@@ -69,6 +72,9 @@ static void ringbuf_free_ring(struct ring_buffer *rb, struct ring *r)
 		r->producer_pos = NULL;
 	}
 
+	if (r->read_buffer)
+		free(r->read_buffer);
+
 	free(r);
 }
 
@@ -119,6 +125,14 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 	r->sample_cb = sample_cb;
 	r->ctx = ctx;
 	r->mask = info.max_entries - 1;
+	r->overwrite_mode = info.map_flags & BPF_F_OVERWRITE;
+	if (unlikely(r->overwrite_mode)) {
+		r->read_buffer = malloc(info.max_entries);
+		if (!r->read_buffer) {
+			err = -ENOMEM;
+			goto err_out;
+		}
+	}
 
 	/* Map writable consumer page */
 	tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
@@ -148,6 +162,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
 		goto err_out;
 	}
 	r->producer_pos = tmp;
+	r->overwrite_pos = r->producer_pos + 1; /* overwrite_pos is next to producer_pos */
 	r->data = tmp + rb->page_size;
 
 	e = &rb->events[rb->ring_cnt];
@@ -232,7 +247,7 @@ static inline int roundup_len(__u32 len)
 	return (len + 7) / 8 * 8;
 }
 
-static int64_t ringbuf_process_ring(struct ring *r, size_t n)
+static int64_t ringbuf_process_normal_ring(struct ring *r, size_t n)
 {
 	int *len_ptr, len, err;
 	/* 64-bit to avoid overflow in case of extreme application behavior */
@@ -278,6 +293,92 @@ static int64_t ringbuf_process_ring(struct ring *r, size_t n)
 	return cnt;
 }
 
+static int64_t ringbuf_process_overwrite_ring(struct ring *r, size_t n)
+{
+
+	int err;
+	uint32_t *len_ptr, len;
+	/* 64-bit to avoid overflow in case of extreme application behavior */
+	int64_t cnt = 0;
+	size_t size, offset;
+	unsigned long cons_pos, prod_pos, over_pos, tmp_pos;
+	bool got_new_data;
+	void *sample;
+	bool copied;
+
+	size = r->mask + 1;
+
+	cons_pos = smp_load_acquire(r->consumer_pos);
+	do {
+		got_new_data = false;
+
+		/* grab a copy of data */
+		prod_pos = smp_load_acquire(r->producer_pos);
+		do {
+			over_pos = READ_ONCE(*r->overwrite_pos);
+			/* prod_pos may be outdated now */
+			if (over_pos < prod_pos) {
+				tmp_pos = max(cons_pos, over_pos);
+				/* smp_load_acquire(r->producer_pos) before
+				 * READ_ONCE(*r->overwrite_pos) ensures that
+				 * over_pos + r->mask < prod_pos never occurs,
+				 * so size is never larger than r->mask
+				 */
+				size = prod_pos - tmp_pos;
+				if (!size)
+					goto done;
+				memcpy(r->read_buffer,
+				       r->data + (tmp_pos & r->mask), size);
+				copied = true;
+			} else {
+				copied = false;
+			}
+			prod_pos = smp_load_acquire(r->producer_pos);
+		/* retry if data is overwritten by producer */
+		} while (!copied || prod_pos - tmp_pos > r->mask);
+
+		cons_pos = tmp_pos;
+
+		for (offset = 0; offset < size; offset += roundup_len(len)) {
+			len_ptr = r->read_buffer + (offset & r->mask);
+			len = *len_ptr;
+
+			if (len & BPF_RINGBUF_BUSY_BIT)
+				goto done;
+
+			got_new_data = true;
+			cons_pos += roundup_len(len);
+
+			if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
+				sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
+				err = r->sample_cb(r->ctx, sample, len);
+				if (err < 0) {
+					/* update consumer pos and bail out */
+					smp_store_release(r->consumer_pos,
+							  cons_pos);
+					return err;
+				}
+				cnt++;
+			}
+
+			if (cnt >= n)
+				goto done;
+		}
+	} while (got_new_data);
+
+done:
+	smp_store_release(r->consumer_pos, cons_pos);
+	return cnt;
+}
+
+static int64_t ringbuf_process_ring(struct ring *r, size_t n)
+{
+	if (likely(!r->overwrite_mode))
+		return ringbuf_process_normal_ring(r, n);
+	else
+		return ringbuf_process_overwrite_ring(r, n);
+}
+
 /* Consume available ring buffer(s) data without event polling, up to n
  * records.
  *
-- 
2.43.0
Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Andrii Nakryiko 1 month, 1 week ago
On Sun, Aug 3, 2025 at 7:27 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> From: Xu Kuohai <xukuohai@huawei.com>
>
> In overwrite mode, the producer does not wait for the consumer, so the
> consumer is responsible for handling conflicts. An optimistic method
> is used to resolve the conflicts: the consumer first reads consumer_pos,
> producer_pos and overwrite_pos, then calculates a read window and copies
> data in the window from the ring buffer. After copying, it checks the
> positions to decide if the data in the copy window have been overwritten
> by be the producer. If so, it discards the copy and tries again. Once
> success, the consumer processes the events in the copy.
>

I don't mind adding BPF_F_OVERWRITE mode to BPF ringbuf (it seems like
this will work fine) itself, but I don't think retrofitting it to this
callback-based libbpf-side API is a good fit.

For one, I don't like that extra memory copy and potentially a huge
allocation that you do. I think for some use cases user logic might be
totally fine with using ringbuf memory directly, even if it can be
overwritten at any point. So it would be unfair to penalize
sophisticated users for such cases. Even if not, I'd say allocating
just enough to hold the record would be a better approach.

Another downside is that the user doesn't really have much visibility
right now into whether any samples were overwritten.

I've been mulling over the idea of adding an iterator-like API for BPF
ringbuf on the libbpf side for a while now. I'm still debating some
API nuances with Eduard, but I think we'll end up adding something
pretty soon. Iterator-based API seems like a much better fit for
overwritable mode here.

But all that is not really overwrite-specific and is broader, so I
think we can proceed with finalizing kernel-side details of overwrite
and not block on libbpf side of things for now, though.

> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> ---
>  tools/lib/bpf/ringbuf.c | 103 +++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 102 insertions(+), 1 deletion(-)
>
> diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
> index 9702b70da444..9c072af675ff 100644
> --- a/tools/lib/bpf/ringbuf.c
> +++ b/tools/lib/bpf/ringbuf.c
> @@ -27,10 +27,13 @@ struct ring {
>         ring_buffer_sample_fn sample_cb;
>         void *ctx;
>         void *data;
> +       void *read_buffer;
>         unsigned long *consumer_pos;
>         unsigned long *producer_pos;
> +       unsigned long *overwrite_pos;
>         unsigned long mask;
>         int map_fd;
> +       bool overwrite_mode;
>  };

[...]
Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Xu Kuohai 1 month, 1 week ago
On 8/23/2025 5:23 AM, Andrii Nakryiko wrote:
> On Sun, Aug 3, 2025 at 7:27 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>
>> From: Xu Kuohai <xukuohai@huawei.com>
>>
>> In overwrite mode, the producer does not wait for the consumer, so the
>> consumer is responsible for handling conflicts. An optimistic method
>> is used to resolve the conflicts: the consumer first reads consumer_pos,
>> producer_pos and overwrite_pos, then calculates a read window and copies
>> data in the window from the ring buffer. After copying, it checks the
>> positions to decide if the data in the copy window have been overwritten
>> by be the producer. If so, it discards the copy and tries again. Once
>> success, the consumer processes the events in the copy.
>>
> 
> I don't mind adding BPF_F_OVERWRITE mode to BPF ringbuf (it seems like
> this will work fine) itself, but I don't think retrofitting it to this
> callback-based libbpf-side API is a good fit.
> 
> For one, I don't like that extra memory copy and potentially a huge
> allocation that you do. I think for some use cases user logic might be
> totally fine with using ringbuf memory directly, even if it can be
> overwritten at any point. So it would be unfair to penalize
> sophisticated users for such cases. Even if not, I'd say allocating
> just enough to hold the record would be a better approach.
> 
> Another downside is that the user doesn't really have much visibility
> right now into whether any samples were overwritten.
> 
> I've been mulling over the idea of adding an iterator-like API for BPF
> ringbuf on the libbpf side for a while now. I'm still debating some
> API nuances with Eduard, but I think we'll end up adding something
> pretty soon. Iterator-based API seems like a much better fit for
> overwritable mode here.
> 
> But all that is not really overwrite-specific and is broader, so I
> think we can proceed with finalizing kernel-side details of overwrite
> and not block on libbpf side of things for now, though.
>

Sounds great! Looking forward to the new iterator-based API. Clearly, no
memory allocation or a samll allocation is better than a huge allocation.
I'll focus on the kernel side before the new API is introduced.

>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> ---
>>   tools/lib/bpf/ringbuf.c | 103 +++++++++++++++++++++++++++++++++++++++-
>>   1 file changed, 102 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
>> index 9702b70da444..9c072af675ff 100644
>> --- a/tools/lib/bpf/ringbuf.c
>> +++ b/tools/lib/bpf/ringbuf.c
>> @@ -27,10 +27,13 @@ struct ring {
>>          ring_buffer_sample_fn sample_cb;
>>          void *ctx;
>>          void *data;
>> +       void *read_buffer;
>>          unsigned long *consumer_pos;
>>          unsigned long *producer_pos;
>> +       unsigned long *overwrite_pos;
>>          unsigned long mask;
>>          int map_fd;
>> +       bool overwrite_mode;
>>   };
> 
> [...]

Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Eduard Zingerman 1 month, 3 weeks ago
On Mon, 2025-08-04 at 10:20 +0800, Xu Kuohai wrote:

[...]

> @@ -278,6 +293,92 @@ static int64_t ringbuf_process_ring(struct ring *r, size_t n)
>  	return cnt;
>  }
>  
> +static int64_t ringbuf_process_overwrite_ring(struct ring *r, size_t n)
> +{
> +
> +	int err;
> +	uint32_t *len_ptr, len;
> +	/* 64-bit to avoid overflow in case of extreme application behavior */
> +	int64_t cnt = 0;
> +	size_t size, offset;
> +	unsigned long cons_pos, prod_pos, over_pos, tmp_pos;
> +	bool got_new_data;
> +	void *sample;
> +	bool copied;
> +
> +	size = r->mask + 1;
> +
> +	cons_pos = smp_load_acquire(r->consumer_pos);
> +	do {
> +		got_new_data = false;
> +
> +		/* grab a copy of data */
> +		prod_pos = smp_load_acquire(r->producer_pos);
> +		do {
> +			over_pos = READ_ONCE(*r->overwrite_pos);
> +			/* prod_pos may be outdated now */
> +			if (over_pos < prod_pos) {
> +				tmp_pos = max(cons_pos, over_pos);
> +				/* smp_load_acquire(r->producer_pos) before
> +				 * READ_ONCE(*r->overwrite_pos) ensures that
> +				 * over_pos + r->mask < prod_pos never occurs,
> +				 * so size is never larger than r->mask
> +				 */
> +				size = prod_pos - tmp_pos;
> +				if (!size)
> +					goto done;
> +				memcpy(r->read_buffer,
> +				       r->data + (tmp_pos & r->mask), size);
> +				copied = true;
> +			} else {
> +				copied = false;
> +			}
> +			prod_pos = smp_load_acquire(r->producer_pos);
> +		/* retry if data is overwritten by producer */
> +		} while (!copied || prod_pos - tmp_pos > r->mask);

Could you please elaborate a bit, why this condition is sufficient to
guarantee that r->overwrite_pos had not changed while memcpy() was
executing?

> +
> +		cons_pos = tmp_pos;
> +
> +		for (offset = 0; offset < size; offset += roundup_len(len)) {
> +			len_ptr = r->read_buffer + (offset & r->mask);
> +			len = *len_ptr;
> +
> +			if (len & BPF_RINGBUF_BUSY_BIT)
> +				goto done;
> +
> +			got_new_data = true;
> +			cons_pos += roundup_len(len);
> +
> +			if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
> +				sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
> +				err = r->sample_cb(r->ctx, sample, len);
> +				if (err < 0) {
> +					/* update consumer pos and bail out */
> +					smp_store_release(r->consumer_pos,
> +							  cons_pos);
> +					return err;
> +				}
> +				cnt++;
> +			}
> +
> +			if (cnt >= n)
> +				goto done;
> +		}
> +	} while (got_new_data);
> +
> +done:
> +	smp_store_release(r->consumer_pos, cons_pos);
> +	return cnt;
> +}

[...]
Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Zvi Effron 1 month, 3 weeks ago
On Thu, Aug 14, 2025 at 12:34 PM Eduard Zingerman <eddyz87@gmail.com> wrote:
>
> On Mon, 2025-08-04 at 10:20 +0800, Xu Kuohai wrote:
>
> [...]
>
> > @@ -278,6 +293,92 @@ static int64_t ringbuf_process_ring(struct ring *r, size_t n)
> >       return cnt;
> >  }
> >
> > +static int64_t ringbuf_process_overwrite_ring(struct ring *r, size_t n)
> > +{
> > +
> > +     int err;
> > +     uint32_t *len_ptr, len;
> > +     /* 64-bit to avoid overflow in case of extreme application behavior */
> > +     int64_t cnt = 0;
> > +     size_t size, offset;
> > +     unsigned long cons_pos, prod_pos, over_pos, tmp_pos;
> > +     bool got_new_data;
> > +     void *sample;
> > +     bool copied;
> > +
> > +     size = r->mask + 1;
> > +
> > +     cons_pos = smp_load_acquire(r->consumer_pos);
> > +     do {
> > +             got_new_data = false;
> > +
> > +             /* grab a copy of data */
> > +             prod_pos = smp_load_acquire(r->producer_pos);
> > +             do {
> > +                     over_pos = READ_ONCE(*r->overwrite_pos);
> > +                     /* prod_pos may be outdated now */
> > +                     if (over_pos < prod_pos) {
> > +                             tmp_pos = max(cons_pos, over_pos);
> > +                             /* smp_load_acquire(r->producer_pos) before
> > +                              * READ_ONCE(*r->overwrite_pos) ensures that
> > +                              * over_pos + r->mask < prod_pos never occurs,
> > +                              * so size is never larger than r->mask
> > +                              */
> > +                             size = prod_pos - tmp_pos;
> > +                             if (!size)
> > +                                     goto done;
> > +                             memcpy(r->read_buffer,
> > +                                    r->data + (tmp_pos & r->mask), size);
> > +                             copied = true;
> > +                     } else {
> > +                             copied = false;
> > +                     }
> > +                     prod_pos = smp_load_acquire(r->producer_pos);
> > +             /* retry if data is overwritten by producer */
> > +             } while (!copied || prod_pos - tmp_pos > r->mask);
>
> Could you please elaborate a bit, why this condition is sufficient to
> guarantee that r->overwrite_pos had not changed while memcpy() was
> executing?
>

It isn't sufficient to guarantee that, but does it need tobe ? The concern is
that the data being memcpy-ed might have been overwritten, right? This
condition is sufficient to guarantee that can't happen without forcing another
loop iteration.

For the producer to overwrite a memcpy-ed byte, it must have looped around the
entire buffer, so r->producer_pos will be at least r->mask + 1 more than
tmp_pos. The +1 is because r->producer_pos first had to produce the byte
that got overwritten for it to be included in the memcpy, then produce it a
second time to overwrite it.

Since the code rereads r->producer_pos before making the check, if any bytes
have been overwritten, prod_pos - tmp_pos will be at least r->mask + 1, so the
check will return true and the loop will iterate again, and a new memcpy will
be performed.

> > +
> > +             cons_pos = tmp_pos;
> > +
> > +             for (offset = 0; offset < size; offset += roundup_len(len)) {
> > +                     len_ptr = r->read_buffer + (offset & r->mask);
> > +                     len = *len_ptr;
> > +
> > +                     if (len & BPF_RINGBUF_BUSY_BIT)
> > +                             goto done;
> > +
> > +                     got_new_data = true;
> > +                     cons_pos += roundup_len(len);
> > +
> > +                     if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
> > +                             sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
> > +                             err = r->sample_cb(r->ctx, sample, len);
> > +                             if (err < 0) {
> > +                                     /* update consumer pos and bail out */
> > +                                     smp_store_release(r->consumer_pos,
> > +                                                       cons_pos);
> > +                                     return err;
> > +                             }
> > +                             cnt++;
> > +                     }
> > +
> > +                     if (cnt >= n)
> > +                             goto done;
> > +             }
> > +     } while (got_new_data);
> > +
> > +done:
> > +     smp_store_release(r->consumer_pos, cons_pos);
> > +     return cnt;
> > +}
>
> [...]
>
Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Zvi Effron 1 month, 3 weeks ago
On Sun, Aug 3, 2025 at 7:27 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>
> From: Xu Kuohai <xukuohai@huawei.com>
>
> In overwrite mode, the producer does not wait for the consumer, so the
> consumer is responsible for handling conflicts. An optimistic method
> is used to resolve the conflicts: the consumer first reads consumer_pos,
> producer_pos and overwrite_pos, then calculates a read window and copies
> data in the window from the ring buffer. After copying, it checks the
> positions to decide if the data in the copy window have been overwritten
> by be the producer. If so, it discards the copy and tries again. Once
> success, the consumer processes the events in the copy.
>
> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
> ---
> tools/lib/bpf/ringbuf.c | 103 +++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 102 insertions(+), 1 deletion(-)
>
> diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
> index 9702b70da444..9c072af675ff 100644
> --- a/tools/lib/bpf/ringbuf.c
> +++ b/tools/lib/bpf/ringbuf.c
> @@ -27,10 +27,13 @@ struct ring {
> ring_buffer_sample_fn sample_cb;
> void *ctx;
> void *data;
> + void *read_buffer;
> unsigned long *consumer_pos;
> unsigned long *producer_pos;
> + unsigned long *overwrite_pos;
> unsigned long mask;
> int map_fd;
> + bool overwrite_mode;
> };
>
> struct ring_buffer {
> @@ -69,6 +72,9 @@ static void ringbuf_free_ring(struct ring_buffer *rb, struct ring *r)
> r->producer_pos = NULL;
> }
>
> + if (r->read_buffer)
> + free(r->read_buffer);
> +
> free(r);
> }
>
> @@ -119,6 +125,14 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
> r->sample_cb = sample_cb;
> r->ctx = ctx;
> r->mask = info.max_entries - 1;
> + r->overwrite_mode = info.map_flags & BPF_F_OVERWRITE;
> + if (unlikely(r->overwrite_mode)) {
> + r->read_buffer = malloc(info.max_entries);
> + if (!r->read_buffer) {
> + err = -ENOMEM;
> + goto err_out;
> + }
> + }
>
> /* Map writable consumer page */
> tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
> @@ -148,6 +162,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
> goto err_out;
> }
> r->producer_pos = tmp;
> + r->overwrite_pos = r->producer_pos + 1; /* overwrite_pos is next to producer_pos */
> r->data = tmp + rb->page_size;
>
> e = &rb->events[rb->ring_cnt];
> @@ -232,7 +247,7 @@ static inline int roundup_len(__u32 len)
> return (len + 7) / 8 * 8;
> }
>
> -static int64_t ringbuf_process_ring(struct ring *r, size_t n)
> +static int64_t ringbuf_process_normal_ring(struct ring *r, size_t n)
> {
> int *len_ptr, len, err;
> /* 64-bit to avoid overflow in case of extreme application behavior */
> @@ -278,6 +293,92 @@ static int64_t ringbuf_process_ring(struct ring *r, size_t n)
> return cnt;
> }
>
> +static int64_t ringbuf_process_overwrite_ring(struct ring *r, size_t n)
> +{
> +
> + int err;
> + uint32_t *len_ptr, len;
> + /* 64-bit to avoid overflow in case of extreme application behavior */
> + int64_t cnt = 0;
> + size_t size, offset;
> + unsigned long cons_pos, prod_pos, over_pos, tmp_pos;
> + bool got_new_data;
> + void *sample;
> + bool copied;
> +
> + size = r->mask + 1;
> +
> + cons_pos = smp_load_acquire(r->consumer_pos);
> + do {
> + got_new_data = false;
> +
> + /* grab a copy of data */
> + prod_pos = smp_load_acquire(r->producer_pos);
> + do {
> + over_pos = READ_ONCE(*r->overwrite_pos);
> + /* prod_pos may be outdated now */
> + if (over_pos < prod_pos) {
> + tmp_pos = max(cons_pos, over_pos);
> + /* smp_load_acquire(r->producer_pos) before
> + * READ_ONCE(*r->overwrite_pos) ensures that
> + * over_pos + r->mask < prod_pos never occurs,
> + * so size is never larger than r->mask
> + */
> + size = prod_pos - tmp_pos;
> + if (!size)
> + goto done;
> + memcpy(r->read_buffer,
> + r->data + (tmp_pos & r->mask), size);
> + copied = true;
> + } else {
> + copied = false;
> + }
> + prod_pos = smp_load_acquire(r->producer_pos);
> + /* retry if data is overwritten by producer */
> + } while (!copied || prod_pos - tmp_pos > r->mask);

This seems to allow for a situation where a call to process the ring can
infinite loop if the producers are producing and overwriting fast enough. That
seems suboptimal to me?

Should there be a timeout or maximum number of attempts or something that
returns -EBUSY or another error to the user?

> +
> + cons_pos = tmp_pos;
> +
> + for (offset = 0; offset < size; offset += roundup_len(len)) {
> + len_ptr = r->read_buffer + (offset & r->mask);
> + len = *len_ptr;
> +
> + if (len & BPF_RINGBUF_BUSY_BIT)
> + goto done;
> +
> + got_new_data = true;
> + cons_pos += roundup_len(len);
> +
> + if ((len & BPF_RINGBUF_DISCARD_BIT) == 0) {
> + sample = (void *)len_ptr + BPF_RINGBUF_HDR_SZ;
> + err = r->sample_cb(r->ctx, sample, len);
> + if (err < 0) {
> + /* update consumer pos and bail out */
> + smp_store_release(r->consumer_pos,
> + cons_pos);
> + return err;
> + }
> + cnt++;
> + }
> +
> + if (cnt >= n)
> + goto done;
> + }
> + } while (got_new_data);
> +
> +done:
> + smp_store_release(r->consumer_pos, cons_pos);
> + return cnt;
> +}
> +
> +static int64_t ringbuf_process_ring(struct ring *r, size_t n)
> +{
> + if (likely(!r->overwrite_mode))
> + return ringbuf_process_normal_ring(r, n);
> + else
> + return ringbuf_process_overwrite_ring(r, n);
> +}
> +
> /* Consume available ring buffer(s) data without event polling, up to n
> * records.
> *
> --
> 2.43.0
>
>
Re: [PATCH bpf-next 2/4] libbpf: ringbuf: Add overwrite ring buffer process
Posted by Xu Kuohai 1 month, 3 weeks ago
On 8/14/2025 2:21 AM, Zvi Effron wrote:
> On Sun, Aug 3, 2025 at 7:27 PM Xu Kuohai <xukuohai@huaweicloud.com> wrote:
>>
>> From: Xu Kuohai <xukuohai@huawei.com>
>>
>> In overwrite mode, the producer does not wait for the consumer, so the
>> consumer is responsible for handling conflicts. An optimistic method
>> is used to resolve the conflicts: the consumer first reads consumer_pos,
>> producer_pos and overwrite_pos, then calculates a read window and copies
>> data in the window from the ring buffer. After copying, it checks the
>> positions to decide if the data in the copy window have been overwritten
>> by be the producer. If so, it discards the copy and tries again. Once
>> success, the consumer processes the events in the copy.
>>
>> Signed-off-by: Xu Kuohai <xukuohai@huawei.com>
>> ---
>> tools/lib/bpf/ringbuf.c | 103 +++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 102 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/lib/bpf/ringbuf.c b/tools/lib/bpf/ringbuf.c
>> index 9702b70da444..9c072af675ff 100644
>> --- a/tools/lib/bpf/ringbuf.c
>> +++ b/tools/lib/bpf/ringbuf.c
>> @@ -27,10 +27,13 @@ struct ring {
>> ring_buffer_sample_fn sample_cb;
>> void *ctx;
>> void *data;
>> + void *read_buffer;
>> unsigned long *consumer_pos;
>> unsigned long *producer_pos;
>> + unsigned long *overwrite_pos;
>> unsigned long mask;
>> int map_fd;
>> + bool overwrite_mode;
>> };
>>
>> struct ring_buffer {
>> @@ -69,6 +72,9 @@ static void ringbuf_free_ring(struct ring_buffer *rb, struct ring *r)
>> r->producer_pos = NULL;
>> }
>>
>> + if (r->read_buffer)
>> + free(r->read_buffer);
>> +
>> free(r);
>> }
>>
>> @@ -119,6 +125,14 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
>> r->sample_cb = sample_cb;
>> r->ctx = ctx;
>> r->mask = info.max_entries - 1;
>> + r->overwrite_mode = info.map_flags & BPF_F_OVERWRITE;
>> + if (unlikely(r->overwrite_mode)) {
>> + r->read_buffer = malloc(info.max_entries);
>> + if (!r->read_buffer) {
>> + err = -ENOMEM;
>> + goto err_out;
>> + }
>> + }
>>
>> /* Map writable consumer page */
>> tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0);
>> @@ -148,6 +162,7 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd,
>> goto err_out;
>> }
>> r->producer_pos = tmp;
>> + r->overwrite_pos = r->producer_pos + 1; /* overwrite_pos is next to producer_pos */
>> r->data = tmp + rb->page_size;
>>
>> e = &rb->events[rb->ring_cnt];
>> @@ -232,7 +247,7 @@ static inline int roundup_len(__u32 len)
>> return (len + 7) / 8 * 8;
>> }
>>
>> -static int64_t ringbuf_process_ring(struct ring *r, size_t n)
>> +static int64_t ringbuf_process_normal_ring(struct ring *r, size_t n)
>> {
>> int *len_ptr, len, err;
>> /* 64-bit to avoid overflow in case of extreme application behavior */
>> @@ -278,6 +293,92 @@ static int64_t ringbuf_process_ring(struct ring *r, size_t n)
>> return cnt;
>> }
>>
>> +static int64_t ringbuf_process_overwrite_ring(struct ring *r, size_t n)
>> +{
>> +
>> + int err;
>> + uint32_t *len_ptr, len;
>> + /* 64-bit to avoid overflow in case of extreme application behavior */
>> + int64_t cnt = 0;
>> + size_t size, offset;
>> + unsigned long cons_pos, prod_pos, over_pos, tmp_pos;
>> + bool got_new_data;
>> + void *sample;
>> + bool copied;
>> +
>> + size = r->mask + 1;
>> +
>> + cons_pos = smp_load_acquire(r->consumer_pos);
>> + do {
>> + got_new_data = false;
>> +
>> + /* grab a copy of data */
>> + prod_pos = smp_load_acquire(r->producer_pos);
>> + do {
>> + over_pos = READ_ONCE(*r->overwrite_pos);
>> + /* prod_pos may be outdated now */
>> + if (over_pos < prod_pos) {
>> + tmp_pos = max(cons_pos, over_pos);
>> + /* smp_load_acquire(r->producer_pos) before
>> + * READ_ONCE(*r->overwrite_pos) ensures that
>> + * over_pos + r->mask < prod_pos never occurs,
>> + * so size is never larger than r->mask
>> + */
>> + size = prod_pos - tmp_pos;
>> + if (!size)
>> + goto done;
>> + memcpy(r->read_buffer,
>> + r->data + (tmp_pos & r->mask), size);
>> + copied = true;
>> + } else {
>> + copied = false;
>> + }
>> + prod_pos = smp_load_acquire(r->producer_pos);
>> + /* retry if data is overwritten by producer */
>> + } while (!copied || prod_pos - tmp_pos > r->mask);
> 
> This seems to allow for a situation where a call to process the ring can
> infinite loop if the producers are producing and overwriting fast enough. That
> seems suboptimal to me?
> 
> Should there be a timeout or maximum number of attempts or something that
> returns -EBUSY or another error to the user?
> 

Yeah, infinite loop is a bit unsettling, will return -EBUSY after some
tries.