[RFC v2 3/5] io_uring/bpf: implement struct_ops registration

Posted by Pavel Begunkov 6 months, 2 weeks ago
Add ring_fd to the struct_ops and implement [un]registration.

Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
 io_uring/bpf.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++-
 io_uring/bpf.h |  3 +++
 2 files changed, 69 insertions(+), 1 deletion(-)

diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 3096c54e4fb3..0f82acf09959 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -3,6 +3,8 @@
 #include "bpf.h"
 #include "register.h"
 
+DEFINE_MUTEX(io_bpf_ctrl_mutex);
+
 static struct io_uring_ops io_bpf_ops_stubs = {
 };
 
@@ -50,20 +52,83 @@ static int bpf_io_init_member(const struct btf_type *t,
 			       const struct btf_member *member,
 			       void *kdata, const void *udata)
 {
+	u32 moff = __btf_member_bit_offset(t, member) / 8;
+	const struct io_uring_ops *uops = udata;
+	struct io_uring_ops *ops = kdata;
+
+	switch (moff) {
+	case offsetof(struct io_uring_ops, ring_fd):
+		ops->ring_fd = uops->ring_fd;
+		return 1;
+	}
+	return 0;
+}
+
+static int io_register_bpf_ops(struct io_ring_ctx *ctx, struct io_uring_ops *ops)
+{
+	if (ctx->bpf_ops)
+		return -EBUSY;
+	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
+		return -EOPNOTSUPP;
+
+	percpu_ref_get(&ctx->refs);
+	ops->ctx = ctx;
+	ctx->bpf_ops = ops;
 	return 0;
 }
 
 static int bpf_io_reg(void *kdata, struct bpf_link *link)
 {
-	return -EOPNOTSUPP;
+	struct io_uring_ops *ops = kdata;
+	struct io_ring_ctx *ctx;
+	struct file *file;
+	int ret;
+
+	file = io_uring_register_get_file(ops->ring_fd, false);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ctx = file->private_data;
+	scoped_guard(mutex, &ctx->uring_lock)
+		ret = io_register_bpf_ops(ctx, ops);
+
+	fput(file);
+	return ret;
 }
 
 static void bpf_io_unreg(void *kdata, struct bpf_link *link)
 {
+	struct io_uring_ops *ops = kdata;
+	struct io_ring_ctx *ctx;
+
+	guard(mutex)(&io_bpf_ctrl_mutex);
+
+	ctx = ops->ctx;
+	ops->ctx = NULL;
+
+	if (ctx) {
+		scoped_guard(mutex, &ctx->uring_lock) {
+			if (ctx->bpf_ops == ops)
+				ctx->bpf_ops = NULL;
+		}
+		percpu_ref_put(&ctx->refs);
+	}
 }
 
 void io_unregister_bpf_ops(struct io_ring_ctx *ctx)
 {
+	struct io_uring_ops *ops;
+
+	guard(mutex)(&io_bpf_ctrl_mutex);
+	guard(mutex)(&ctx->uring_lock);
+
+	ops = ctx->bpf_ops;
+	ctx->bpf_ops = NULL;
+
+	if (ops && ops->ctx) {
+		percpu_ref_put(&ctx->refs);
+		ops->ctx = NULL;
+	}
 }
 
 static struct bpf_struct_ops bpf_io_uring_ops = {
diff --git a/io_uring/bpf.h b/io_uring/bpf.h
index a61c489d306b..4b147540d006 100644
--- a/io_uring/bpf.h
+++ b/io_uring/bpf.h
@@ -8,6 +8,9 @@
 #include "io_uring.h"
 
 struct io_uring_ops {
+	__u32 ring_fd;
+
+	struct io_ring_ctx *ctx;
 };
 
 static inline bool io_bpf_attached(struct io_ring_ctx *ctx)
-- 
2.49.0
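
For reference, a rough sketch of how an ops instance could be attached
from userspace with libbpf. The object and skeleton names (io_ops.bpf.c,
io_ops_bpf, io_ops) are made up, and the struct io_uring_ops layout is
assumed to come from vmlinux.h; this is only an illustration of the
registration flow, not part of the series.

/* io_ops.bpf.c (hypothetical): ring_fd is filled in by userspace before load */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

SEC(".struct_ops.link")
struct io_uring_ops io_ops = {};

/* userspace side, using a bpftool-generated skeleton (name made up) */
#include "io_ops.skel.h"

static struct bpf_link *attach_io_uring_ops(int ring_fd)
{
	struct io_ops_bpf *skel;
	struct bpf_link *link;

	skel = io_ops_bpf__open();
	if (!skel)
		return NULL;
	/* the ring fd is only known at runtime, set it before load */
	skel->struct_ops.io_ops->ring_fd = ring_fd;
	if (io_ops_bpf__load(skel))
		goto err;
	/* attaching the struct_ops map is what ends up in bpf_io_reg() */
	link = bpf_map__attach_struct_ops(skel->maps.io_ops);
	if (!link)
		goto err;
	/* the link (or a pin of it) must stay alive to keep the ops attached */
	return link;
err:
	io_ops_bpf__destroy(skel);
	return NULL;
}
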
Re: [RFC v2 3/5] io_uring/bpf: implement struct_ops registration
Posted by Jens Axboe 6 months, 2 weeks ago
On 6/6/25 7:58 AM, Pavel Begunkov wrote:
> Add ring_fd to the struct_ops and implement [un]registration.
> 
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
>  io_uring/bpf.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++-
>  io_uring/bpf.h |  3 +++
>  2 files changed, 69 insertions(+), 1 deletion(-)
> 
> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
> index 3096c54e4fb3..0f82acf09959 100644
> --- a/io_uring/bpf.c
> +++ b/io_uring/bpf.c
> @@ -3,6 +3,8 @@
>  #include "bpf.h"
>  #include "register.h"
>  
> +DEFINE_MUTEX(io_bpf_ctrl_mutex);
> +
>  static struct io_uring_ops io_bpf_ops_stubs = {
>  };
>  
> @@ -50,20 +52,83 @@ static int bpf_io_init_member(const struct btf_type *t,
>  			       const struct btf_member *member,
>  			       void *kdata, const void *udata)
>  {
> +	u32 moff = __btf_member_bit_offset(t, member) / 8;
> +	const struct io_uring_ops *uops = udata;
> +	struct io_uring_ops *ops = kdata;
> +
> +	switch (moff) {
> +	case offsetof(struct io_uring_ops, ring_fd):
> +		ops->ring_fd = uops->ring_fd;
> +		return 1;
> +	}
> +	return 0;

Possible to pass in here whether the ring fd is registered or not? Such
that it can be used in bpf_io_reg() as well.

> +static int io_register_bpf_ops(struct io_ring_ctx *ctx, struct io_uring_ops *ops)
> +{
> +	if (ctx->bpf_ops)
> +		return -EBUSY;
> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
> +		return -EOPNOTSUPP;
> +
> +	percpu_ref_get(&ctx->refs);
> +	ops->ctx = ctx;
> +	ctx->bpf_ops = ops;
>  	return 0;
>  }

Haven't looked too deeply yet, but what's the dependency with
DEFER_TASKRUN?

-- 
Jens Axboe
Re: [RFC v2 3/5] io_uring/bpf: implement struct_ops registration
Posted by Pavel Begunkov 6 months, 2 weeks ago
On 6/6/25 15:57, Jens Axboe wrote:
...>> @@ -50,20 +52,83 @@ static int bpf_io_init_member(const struct btf_type *t,
>>   			       const struct btf_member *member,
>>   			       void *kdata, const void *udata)
>>   {
>> +	u32 moff = __btf_member_bit_offset(t, member) / 8;
>> +	const struct io_uring_ops *uops = udata;
>> +	struct io_uring_ops *ops = kdata;
>> +
>> +	switch (moff) {
>> +	case offsetof(struct io_uring_ops, ring_fd):
>> +		ops->ring_fd = uops->ring_fd;
>> +		return 1;
>> +	}
>> +	return 0;
> 
> Possible to pass in here whether the ring fd is registered or not? Such
> that it can be used in bpf_io_reg() as well.

That requires the struct_ops registration to run in the registering
task's syscall context (i.e. not punted off to workers), which is a
low-risk assumption and I'm pretty sure that's how it's done, but
either way that's not up to io_uring and should be vetted on the bpf
side. It's not performance critical, and leaking registered-fd
handling into other syscalls is a bad idea as well, so for now it's
simply left unsupported.
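
If that ever gets vetted, the kernel side would be a small change. As
a sketch only, assuming a hypothetical flags field in struct
io_uring_ops and a made-up flag name (neither exists in this series),
bpf_io_reg() would mostly just forward the bit:

/* hypothetical UAPI flag, not part of this series */
#define IORING_BPF_OPS_REG_RING		(1U << 0)

static int bpf_io_reg(void *kdata, struct bpf_link *link)
{
	struct io_uring_ops *ops = kdata;
	/* assumes ->flags gets copied in bpf_io_init_member() as well */
	bool registered = ops->flags & IORING_BPF_OPS_REG_RING;
	struct io_ring_ctx *ctx;
	struct file *file;
	int ret;

	/*
	 * Only valid if this runs in the registering task's context,
	 * which is exactly the part that needs vetting.
	 */
	file = io_uring_register_get_file(ops->ring_fd, registered);
	if (IS_ERR(file))
		return PTR_ERR(file);

	ctx = file->private_data;
	scoped_guard(mutex, &ctx->uring_lock)
		ret = io_register_bpf_ops(ctx, ops);

	/* registered ring fds don't hold an extra file reference here */
	if (!registered)
		fput(file);
	return ret;
}
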

>> +static int io_register_bpf_ops(struct io_ring_ctx *ctx, struct io_uring_ops *ops)
>> +{
>> +	if (ctx->bpf_ops)
>> +		return -EBUSY;
>> +	if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
>> +		return -EOPNOTSUPP;
>> +
>> +	percpu_ref_get(&ctx->refs);
>> +	ops->ctx = ctx;
>> +	ctx->bpf_ops = ops;
>>   	return 0;
>>   }
> 
> Haven't looked too deeply yet, but what's the dependency with
> DEFER_TASKRUN?

Unregistration needs to be synchronised with waiters, and that can
easily become a problem. Taking the lock like this series does is not
necessarily the right solution, so I plan to wait and see where it
goes rather than shoot myself in the foot right away.
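
To give a rough idea of what the current scheme amounts to, a
wait-side sketch; ->wait_cqe and the helper name are made up, no such
callback exists in this series yet:

/*
 * Hypothetical hook run by the task waiting for completions. With
 * IORING_SETUP_DEFER_TASKRUN that task is the only one processing
 * task work, so holding ctx->uring_lock around the callback is enough
 * to serialise against bpf_io_unreg() / io_unregister_bpf_ops()
 * clearing ctx->bpf_ops.
 */
static void io_bpf_run_wait_cb(struct io_ring_ctx *ctx)
{
	guard(mutex)(&ctx->uring_lock);

	if (io_bpf_attached(ctx) && ctx->bpf_ops->wait_cqe)
		ctx->bpf_ops->wait_cqe(ctx);
}
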

-- 
Pavel Begunkov
Re: [RFC v2 3/5] io_uring/bpf: implement struct_ops registration
Posted by Jens Axboe 6 months, 2 weeks ago
On 6/6/25 2:00 PM, Pavel Begunkov wrote:
> On 6/6/25 15:57, Jens Axboe wrote:
> ...>> @@ -50,20 +52,83 @@ static int bpf_io_init_member(const struct btf_type *t,
>>>                      const struct btf_member *member,
>>>                      void *kdata, const void *udata)
>>>   {
>>> +    u32 moff = __btf_member_bit_offset(t, member) / 8;
>>> +    const struct io_uring_ops *uops = udata;
>>> +    struct io_uring_ops *ops = kdata;
>>> +
>>> +    switch (moff) {
>>> +    case offsetof(struct io_uring_ops, ring_fd):
>>> +        ops->ring_fd = uops->ring_fd;
>>> +        return 1;
>>> +    }
>>> +    return 0;
>>
>> Possible to pass in here whether the ring fd is registered or not? Such
>> that it can be used in bpf_io_reg() as well.
> 
> That requires the struct_ops registration to run in the registering
> task's syscall context (i.e. not punted off to workers), which is a
> low-risk assumption and I'm pretty sure that's how it's done, but
> either way that's not up to io_uring and should be vetted on the bpf
> side. It's not performance critical, and leaking registered-fd
> handling into other syscalls is a bad idea as well, so for now it's
> simply left unsupported.

I don't care about the performance so much as it being a weird
wrinkle. Obviously not a huge deal, and it can always get sorted out
down the line.

>>> +static int io_register_bpf_ops(struct io_ring_ctx *ctx, struct io_uring_ops *ops)
>>> +{
>>> +    if (ctx->bpf_ops)
>>> +        return -EBUSY;
>>> +    if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN))
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    percpu_ref_get(&ctx->refs);
>>> +    ops->ctx = ctx;
>>> +    ctx->bpf_ops = ops;
>>>       return 0;
>>>   }
>>
>> Haven't looked too deeply yet, but what's the dependency with
>> DEFER_TASKRUN?
> 
> Unregistration needs to be synchronised with waiters, and that can
> easily become a problem. Taking the lock like this series does is not
> necessarily the right solution, so I plan to wait and see where it
> goes rather than shoot myself in the foot right away.

That's fine, but it would be nice to have a comment or a note in the
commit message to that effect, at least for the time being.

-- 
Jens Axboe