Add a struct_ops callback called handle_events, which is invoked from
the CQ waiting loop every time there is an event that might be of
interest to the program. The callback is passed the io_uring ctx and a
loop state, which the program can use to set the number of events it
wants to wait for as well as the timeout value.
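
For illustration only, a rough sketch of what a program using this hook
could look like on the BPF side. The iou_loop_state field semantics,
the locally mirrored enum values and the libbpf struct_ops annotations
are assumptions of this sketch, not something this patch defines for
userspace:

/* Hypothetical sketch, not part of this patch. Assumes iou_loop_state
 * and io_uring_ops are visible through vmlinux.h. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

/* mirrors the kernel-internal return codes in io_uring/bpf.h */
enum { IOU_EVENTS_WAIT, IOU_EVENTS_STOP };

static __u32 invocations;

SEC("struct_ops/handle_events")
int BPF_PROG(handle_events, struct io_ring_ctx *ctx,
	     struct iou_loop_state *state)
{
	/* Keep the task waiting for a few wakeups, then let it return.
	 * Treating a target_cq_tail bump as "one more CQE, please" is an
	 * assumed semantic for the purpose of this sketch. */
	if (invocations++ < 8) {
		state->target_cq_tail += 1;
		return IOU_EVENTS_WAIT;
	}
	return IOU_EVENTS_STOP;
}

SEC(".struct_ops")
struct io_uring_ops wait_ops = {
	.handle_events = (void *)handle_events,
};

char LICENSE[] SEC("license") = "GPL";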
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/bpf.c | 33 +++++++++++++++++++++++++++++++++
io_uring/bpf.h | 16 ++++++++++++++++
io_uring/io_uring.c | 22 +++++++++++++++++++++-
3 files changed, 70 insertions(+), 1 deletion(-)
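
For completeness, a hypothetical sketch of the user space side, under
the assumption that the ops are registered through the usual libbpf
struct_ops attach flow and that the skeleton is generated from the
program sketched in the commit message (as wait.bpf.c). The series may
well end up registering through a dedicated io_uring_register() opcode
instead, in which case only the ring_fd handoff below carries over:

/* Hypothetical loader sketch; skeleton name, map name and the ring_fd
 * handoff are assumptions, not part of the patch below. */
#include <bpf/libbpf.h>
#include <liburing.h>
#include "wait.skel.h"

static int attach_wait_ops(struct io_uring *ring)
{
	struct wait_bpf *skel;
	struct bpf_link *link;

	skel = wait_bpf__open();
	if (!skel)
		return -1;

	/* struct io_uring_ops carries the fd of the ring it attaches to */
	skel->struct_ops.wait_ops->ring_fd = ring->ring_fd;

	if (wait_bpf__load(skel))
		goto err;

	/* registers the ops; keep the link alive for the ring's lifetime */
	link = bpf_map__attach_struct_ops(skel->maps.wait_ops);
	if (!link)
		goto err;
	return 0;
err:
	wait_bpf__destroy(skel);
	return -1;
}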
diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 0f82acf09959..f86b12f280e8 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -1,11 +1,20 @@
#include <linux/mutex.h>
+#include <linux/bpf_verifier.h>
#include "bpf.h"
#include "register.h"
+static const struct btf_type *loop_state_type;
DEFINE_MUTEX(io_bpf_ctrl_mutex);
+static int io_bpf_ops__handle_events(struct io_ring_ctx *ctx,
+ struct iou_loop_state *state)
+{
+ return IOU_EVENTS_STOP;
+}
+
static struct io_uring_ops io_bpf_ops_stubs = {
+ .handle_events = io_bpf_ops__handle_events,
};
static bool bpf_io_is_valid_access(int off, int size,
@@ -27,6 +36,16 @@ static int bpf_io_btf_struct_access(struct bpf_verifier_log *log,
const struct bpf_reg_state *reg, int off,
int size)
{
+ const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id);
+
+ if (t == loop_state_type) {
+ if (off >= offsetof(struct iou_loop_state, target_cq_tail) &&
+ off + size <= offsetofend(struct iou_loop_state, target_cq_tail))
+ return SCALAR_VALUE;
+ if (off >= offsetof(struct iou_loop_state, timeout) &&
+ off + size <= offsetofend(struct iou_loop_state, timeout))
+ return SCALAR_VALUE;
+ }
return -EACCES;
}
@@ -36,8 +55,22 @@ static const struct bpf_verifier_ops bpf_io_verifier_ops = {
.btf_struct_access = bpf_io_btf_struct_access,
};
+static const struct btf_type *
+io_lookup_struct_type(struct btf *btf, const char *name)
+{
+ s32 type_id;
+
+ type_id = btf_find_by_name_kind(btf, name, BTF_KIND_STRUCT);
+ if (type_id < 0)
+ return NULL;
+ return btf_type_by_id(btf, type_id);
+}
+
static int bpf_io_init(struct btf *btf)
{
+ loop_state_type = io_lookup_struct_type(btf, "iou_loop_state");
+ if (!loop_state_type)
+ return -EINVAL;
return 0;
}
diff --git a/io_uring/bpf.h b/io_uring/bpf.h
index 4b147540d006..ac4a9361f9c7 100644
--- a/io_uring/bpf.h
+++ b/io_uring/bpf.h
@@ -7,12 +7,28 @@
#include "io_uring.h"
+enum {
+ IOU_EVENTS_WAIT,
+ IOU_EVENTS_STOP,
+};
+
struct io_uring_ops {
__u32 ring_fd;
+ int (*handle_events)(struct io_ring_ctx *ctx, struct iou_loop_state *state);
+
struct io_ring_ctx *ctx;
};
+static inline int io_run_bpf(struct io_ring_ctx *ctx, struct iou_loop_state *state)
+{
+ scoped_guard(mutex, &ctx->uring_lock) {
+ if (!ctx->bpf_ops)
+ return IOU_EVENTS_STOP;
+ return ctx->bpf_ops->handle_events(ctx, state);
+ }
+}
+
static inline bool io_bpf_attached(struct io_ring_ctx *ctx)
{
return IS_ENABLED(CONFIG_BPF) && ctx->bpf_ops != NULL;
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 8f68e898d60c..bf245be0844b 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -2540,8 +2540,13 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
if (unlikely(test_bit(IO_CHECK_CQ_OVERFLOW_BIT, &ctx->check_cq)))
io_cqring_do_overflow_flush(ctx);
- if (__io_cqring_events_user(ctx) >= min_events)
+
+ if (io_bpf_attached(ctx)) {
+ if (ext_arg->min_time)
+ return -EINVAL;
+ } else if (__io_cqring_events_user(ctx) >= min_events) {
return 0;
+ }
init_waitqueue_func_entry(&iowq.wq, io_wake_function);
iowq.wq.private = current;
@@ -2621,6 +2626,21 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events, u32 flags,
if (ret < 0)
break;
+ if (io_bpf_attached(ctx)) {
+ ret = io_run_bpf(ctx, &iowq.state);
+ if (ret != IOU_EVENTS_WAIT)
+ break;
+
+ if (unlikely(read_thread_flags())) {
+ if (task_sigpending(current)) {
+ ret = -EINTR;
+ break;
+ }
+ cond_resched();
+ }
+ continue;
+ }
+
check_cq = READ_ONCE(ctx->check_cq);
if (unlikely(check_cq)) {
/* let the caller flush overflows, retry */
--
2.49.0
On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> +static inline int io_run_bpf(struct io_ring_ctx *ctx, struct iou_loop_state *state)
> +{
> + scoped_guard(mutex, &ctx->uring_lock) {
> + if (!ctx->bpf_ops)
> + return IOU_EVENTS_STOP;
> + return ctx->bpf_ops->handle_events(ctx, state);
> + }
> +}
you're grabbing the mutex before calling the bpf prog and doing
it in a loop a million times a second?
Looks like massive overhead for program invocation.
I'm surprised it's fast.
On 6/11/25 8:28 PM, Alexei Starovoitov wrote:
> On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> +static inline int io_run_bpf(struct io_ring_ctx *ctx, struct iou_loop_state *state)
>> +{
>> + scoped_guard(mutex, &ctx->uring_lock) {
>> + if (!ctx->bpf_ops)
>> + return IOU_EVENTS_STOP;
>> + return ctx->bpf_ops->handle_events(ctx, state);
>> + }
>> +}
>
> you're grabbing the mutex before calling the bpf prog and doing
> it in a loop a million times a second?
> Looks like massive overhead for program invocation.
> I'm surprised it's fast.
Grabbing a mutex is only expensive if it's contended, or obviously
if it's already held. Repeatedly grabbing it on the submission side,
where submission is the only path expected to take it (or paths off
it, at least), means it should be very cheap.
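
For reference, the uncontended fast path boils down to one atomic
cmpxchg on the mutex owner word; the sleeping/queueing slowpath only
runs when someone else already holds it. A conceptual sketch, not the
actual kernel/locking/mutex.c implementation:

/* Conceptual sketch only; see kernel/locking/mutex.c for the real thing. */
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/sched.h>

static inline bool sketch_mutex_trylock_fast(struct mutex *lock)
{
	unsigned long zero = 0UL;

	/* claim the lock iff nobody owns it, with acquire ordering */
	return atomic_long_try_cmpxchg_acquire(&lock->owner, &zero,
					       (unsigned long)current);
}

static void sketch_mutex_lock(struct mutex *lock)
{
	if (likely(sketch_mutex_trylock_fast(lock)))
		return;		/* uncontended: no waiters, no scheduling */
	mutex_lock(lock);	/* contended: fall back to the real slowpath */
}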
--
Jens Axboe
On 6/12/25 03:28, Alexei Starovoitov wrote:
> On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>>
>> +static inline int io_run_bpf(struct io_ring_ctx *ctx, struct iou_loop_state *state)
>> +{
>> + scoped_guard(mutex, &ctx->uring_lock) {
>> + if (!ctx->bpf_ops)
>> + return IOU_EVENTS_STOP;
>> + return ctx->bpf_ops->handle_events(ctx, state);
>> + }
>> +}
>
> you're grabbing the mutex before calling the bpf prog and doing
> it in a loop a million times a second?
> Looks like massive overhead for program invocation.
> I'm surprised it's fast.
You need the lock to submit anything with io_uring, so this is on par
with how it already works. And the program is just a test and pretty
silly in nature; normally you'd either get higher batching (and the
user, incl bpf, can explicitly ask to wait for more), or it'll be
intermingled with sleeping, at which point the mutex is not a problem.
I'll write a storage IO example next time.

If there is a good use case, I can try to relax it for programs that
don't issue requests, but that might make synchronisation more
complicated, especially on the reg/unreg side.
--
Pavel Begunkov