A handle_events program should be able to parse the CQ and submit new
requests, so add kfuncs to cover that. The only essential kfunc here is
bpf_io_uring_submit_sqes; the rest are likely to be removed in a
non-RFC version in favour of a more general approach.
Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
---
io_uring/bpf.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 86 insertions(+)
diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index f86b12f280e8..9494e4289605 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -1,12 +1,92 @@
#include <linux/mutex.h>
#include <linux/bpf_verifier.h>
+#include "io_uring.h"
#include "bpf.h"
#include "register.h"
static const struct btf_type *loop_state_type;
DEFINE_MUTEX(io_bpf_ctrl_mutex);
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx,
+ unsigned nr)
+{
+ return io_submit_sqes(ctx, nr);
+}
+
+__bpf_kfunc int bpf_io_uring_post_cqe(struct io_ring_ctx *ctx,
+ u64 data, u32 res, u32 cflags)
+{
+ bool posted;
+
+ posted = io_post_aux_cqe(ctx, data, res, cflags);
+ return posted ? 0 : -ENOMEM;
+}
+
+__bpf_kfunc int bpf_io_uring_queue_sqe(struct io_ring_ctx *ctx,
+ void *bpf_sqe, int mem__sz)
+{
+ unsigned tail = ctx->rings->sq.tail;
+ struct io_uring_sqe *sqe;
+
+ if (mem__sz != sizeof(*sqe))
+ return -EINVAL;
+
+ ctx->rings->sq.tail++;
+ tail &= (ctx->sq_entries - 1);
+ /* double index for 128-byte SQEs, twice as long */
+ if (ctx->flags & IORING_SETUP_SQE128)
+ tail <<= 1;
+ sqe = &ctx->sq_sqes[tail];
+ memcpy(sqe, bpf_sqe, sizeof(*sqe));
+ return 0;
+}
+
+__bpf_kfunc
+struct io_uring_cqe *bpf_io_uring_get_cqe(struct io_ring_ctx *ctx, u32 idx)
+{
+ unsigned max_entries = ctx->cq_entries;
+ struct io_uring_cqe *cqe_array = ctx->rings->cqes;
+
+ if (ctx->flags & IORING_SETUP_CQE32)
+ max_entries *= 2;
+ return &cqe_array[idx & (max_entries - 1)];
+}
+
+__bpf_kfunc
+struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
+{
+ struct io_rings *rings = ctx->rings;
+ unsigned int mask = ctx->cq_entries - 1;
+ unsigned head = rings->cq.head;
+ struct io_uring_cqe *cqe;
+
+ /* TODO CQE32 */
+ if (head == rings->cq.tail)
+ return NULL;
+
+ cqe = &rings->cqes[head & mask];
+ rings->cq.head++;
+ return cqe;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(io_uring_kfunc_set)
+BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
+BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
+BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
+BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
+BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
+BTF_KFUNCS_END(io_uring_kfunc_set)
+
+static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &io_uring_kfunc_set,
+};
+
static int io_bpf_ops__handle_events(struct io_ring_ctx *ctx,
struct iou_loop_state *state)
{
@@ -186,6 +266,12 @@ static int __init io_uring_bpf_init(void)
return ret;
}
+ ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS,
+ &bpf_io_uring_kfunc_set);
+ if (ret) {
+ pr_err("io_uring: Failed to register kfuncs (%d)\n", ret);
+ return ret;
+ }
return 0;
}
__initcall(io_uring_bpf_init);
--
2.49.0
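For reference, a handle_events program driving these kfuncs could look
roughly like the sketch below. The struct_ops section name, the BPF-side
extern declarations and the return convention are assumptions rather than
something taken from the posted series; only the kfunc names and the
handle_events(ctx, state) signature come from this patch, and the
struct_ops map registration is omitted since the ops struct name is not
visible in this hunk.

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	extern int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx,
					    unsigned int nr) __ksym;
	extern int bpf_io_uring_queue_sqe(struct io_ring_ctx *ctx,
					  void *bpf_sqe, int mem__sz) __ksym;
	extern struct io_uring_cqe *
	bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx) __ksym;

	SEC("struct_ops.s/handle_events")
	int BPF_PROG(handle_events, struct io_ring_ctx *ctx,
		     struct iou_loop_state *state)
	{
		struct io_uring_cqe *cqe;
		struct io_uring_sqe sqe = {};
		int seen = 0;

		/* drain whatever is currently sitting in the CQ (bounded loop) */
		while (seen < 128 && (cqe = bpf_io_uring_extract_next_cqe(ctx)))
			seen++;

		/* queue and submit a single NOP as a placeholder for real work */
		sqe.opcode = IORING_OP_NOP;
		bpf_io_uring_queue_sqe(ctx, &sqe, sizeof(sqe));
		bpf_io_uring_submit_sqes(ctx, 1);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";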
On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> A handle_events program should be able to parse the CQ and submit new
> requests, so add kfuncs to cover that. The only essential kfunc here is
> bpf_io_uring_submit_sqes; the rest are likely to be removed in a
> non-RFC version in favour of a more general approach.
>
> Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
> ---
> io_uring/bpf.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 86 insertions(+)
>
> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
> index f86b12f280e8..9494e4289605 100644
> --- a/io_uring/bpf.c
> +++ b/io_uring/bpf.c
> @@ -1,12 +1,92 @@
> #include <linux/mutex.h>
> #include <linux/bpf_verifier.h>
>
> +#include "io_uring.h"
> #include "bpf.h"
> #include "register.h"
>
> static const struct btf_type *loop_state_type;
> DEFINE_MUTEX(io_bpf_ctrl_mutex);
>
> +__bpf_kfunc_start_defs();
> +
> +__bpf_kfunc int bpf_io_uring_submit_sqes(struct io_ring_ctx *ctx,
> + unsigned nr)
> +{
> + return io_submit_sqes(ctx, nr);
> +}
> +
> +__bpf_kfunc int bpf_io_uring_post_cqe(struct io_ring_ctx *ctx,
> + u64 data, u32 res, u32 cflags)
> +{
> + bool posted;
> +
> + posted = io_post_aux_cqe(ctx, data, res, cflags);
> + return posted ? 0 : -ENOMEM;
> +}
> +
> +__bpf_kfunc int bpf_io_uring_queue_sqe(struct io_ring_ctx *ctx,
> + void *bpf_sqe, int mem__sz)
> +{
> + unsigned tail = ctx->rings->sq.tail;
> + struct io_uring_sqe *sqe;
> +
> + if (mem__sz != sizeof(*sqe))
> + return -EINVAL;
> +
> + ctx->rings->sq.tail++;
> + tail &= (ctx->sq_entries - 1);
> + /* double index for 128-byte SQEs, twice as long */
> + if (ctx->flags & IORING_SETUP_SQE128)
> + tail <<= 1;
> + sqe = &ctx->sq_sqes[tail];
> + memcpy(sqe, bpf_sqe, sizeof(*sqe));
> + return 0;
> +}
> +
> +__bpf_kfunc
> +struct io_uring_cqe *bpf_io_uring_get_cqe(struct io_ring_ctx *ctx, u32 idx)
> +{
> + unsigned max_entries = ctx->cq_entries;
> + struct io_uring_cqe *cqe_array = ctx->rings->cqes;
> +
> + if (ctx->flags & IORING_SETUP_CQE32)
> + max_entries *= 2;
> + return &cqe_array[idx & (max_entries - 1)];
> +}
> +
> +__bpf_kfunc
> +struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
> +{
> + struct io_rings *rings = ctx->rings;
> + unsigned int mask = ctx->cq_entries - 1;
> + unsigned head = rings->cq.head;
> + struct io_uring_cqe *cqe;
> +
> + /* TODO CQE32 */
> + if (head == rings->cq.tail)
> + return NULL;
> +
> + cqe = &rings->cqes[head & mask];
> + rings->cq.head++;
> + return cqe;
> +}
> +
> +__bpf_kfunc_end_defs();
> +
> +BTF_KFUNCS_START(io_uring_kfunc_set)
> +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
> +BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
> +BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
> +BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
> +BTF_KFUNCS_END(io_uring_kfunc_set)
This is not safe in general.
The verifier doesn't enforce argument safety here.
As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
And once you do that you'll see that the verifier
doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
as trusted.
Looking at your example:
https://github.com/axboe/liburing/commit/706237127f03e15b4cc9c7c31c16d34dbff37cdc
it doesn't care about contents of cqe and doesn't pass it further.
So sort-of ok-ish right now,
but if you need to pass cqe to another kfunc
you would need to add an open coded iterator for cqe-s
with appropriate KF_ITER* flags
or maybe add acquire/release semantics for cqe.
Like, get_cqe will be KF_ACQUIRE, and you'd need
matching KF_RELEASE kfunc,
so that 'cqe' is not lost.
Then 'cqe' will be trusted and you can pass it as actual 'cqe'
into another kfunc.
Without KF_ACQUIRE the verifier sees that get_cqe*() kfuncs
return 'struct io_uring_cqe *' and it's ok for tracing
or passing into kfuncs like bpf_io_uring_queue_sqe()
that don't care about a particular type,
but not ok for full tracking of objects.
For the next revision please post all selftests, examples,
and bpf progs on the list,
so people don't need to search github.
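Concretely, the registration changes being suggested here might look
something like the sketch below; bpf_io_uring_put_cqe() is a hypothetical
release kfunc added only to pair with the acquire, it is not part of the
posted patch.

	BTF_KFUNCS_START(io_uring_kfunc_set)
	BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_TRUSTED_ARGS | KF_SLEEPABLE)
	BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_TRUSTED_ARGS | KF_SLEEPABLE)
	BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_TRUSTED_ARGS | KF_SLEEPABLE)
	/* acquire/release pairing so the returned cqe becomes a trusted, tracked object */
	BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, KF_TRUSTED_ARGS | KF_ACQUIRE | KF_RET_NULL)
	BTF_ID_FLAGS(func, bpf_io_uring_put_cqe, KF_TRUSTED_ARGS | KF_RELEASE)	/* hypothetical */
	BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_TRUSTED_ARGS | KF_RET_NULL)
	BTF_KFUNCS_END(io_uring_kfunc_set)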
On 6/12/25 03:47, Alexei Starovoitov wrote:
> On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
...>> +__bpf_kfunc
>> +struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
>> +{
>> + struct io_rings *rings = ctx->rings;
>> + unsigned int mask = ctx->cq_entries - 1;
>> + unsigned head = rings->cq.head;
>> + struct io_uring_cqe *cqe;
>> +
>> + /* TODO CQE32 */
>> + if (head == rings->cq.tail)
>> + return NULL;
>> +
>> + cqe = &rings->cqes[head & mask];
>> + rings->cq.head++;
>> + return cqe;
>> +}
>> +
>> +__bpf_kfunc_end_defs();
>> +
>> +BTF_KFUNCS_START(io_uring_kfunc_set)
>> +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
>> +BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
>> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
>> +BTF_KFUNCS_END(io_uring_kfunc_set)
>
> This is not safe in general.
> The verifier doesn't enforce argument safety here.
> As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
> And once you do that you'll see that the verifier
> doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
> as trusted.
Thanks, will add it. If I read it right, without the flag the
program can, for example, create a struct io_ring_ctx on stack,
fill it with nonsense and pass to kfuncs. Is that right?
> Looking at your example:
> https://github.com/axboe/liburing/commit/706237127f03e15b4cc9c7c31c16d34dbff37cdc
> it doesn't care about contents of cqe and doesn't pass it further.
> So sort-of ok-ish right now,
> but if you need to pass cqe to another kfunc
> you would need to add an open coded iterator for cqe-s
> with appropriate KF_ITER* flags
> or maybe add acquire/release semantics for cqe.
> Like, get_cqe will be KF_ACQUIRE, and you'd need
> matching KF_RELEASE kfunc,
> so that 'cqe' is not lost.
> Then 'cqe' will be trusted and you can pass it as actual 'cqe'
> into another kfunc.
> Without KF_ACQUIRE the verifier sees that get_cqe*() kfuncs
> return 'struct io_uring_cqe *' and it's ok for tracing
> or passing into kfuncs like bpf_io_uring_queue_sqe()
> that don't care about a particular type,
> but not ok for full tracking of objects.
I don't need type safety for SQEs / CQEs, they're supposed to be simple
memory blobs containing userspace data only. SQ / CQ are shared with
userspace, and the kfuncs can leak the content of passed CQE / SQE to
userspace. But I'd like to find a way to reject programs stashing
kernel pointers / data into them.
BPF_PROG(name, struct io_ring_ctx *io_ring)
{
struct io_uring_cqe *cqe = ...;
cqe->user_data = io_ring;
cqe->res = io_ring->private_field;
}
And I mentioned in the message, I rather want to get rid of half of the
kfuncs, and give BPF direct access to the SQ/CQ instead. Schematically
it should look like this:
BPF_PROG(name, struct io_ring_ctx *ring)
{
struct io_uring_sqe *sqes = get_SQ(ring);
sqes[ring->sq_tail]->opcode = OP_NOP;
bpf_kfunc_submit_sqes(ring, 1);
struct io_uring_cqe *cqes = get_CQ(ring);
print_cqe(&cqes[ring->cq_head]);
}
I hacked up RET_PTR_TO_MEM for kfuncs, the diff is below, but it'd be
great to get rid of the constness of the size argument. I need to
digest arenas first as conceptually they look very close.
> > For the next revision please post all selftests, examples,
> and bpf progs on the list,
> so people don't need to search github.
Did the link in the cover letter not work for you? I'm confused
since it's all in a branch in my tree, but you linked to the same
patches but in Jens' tree, and I have zero clue what they're
doing there or how you found them.
diff --git a/io_uring/bpf.c b/io_uring/bpf.c
index 9494e4289605..400a06a74b5d 100644
--- a/io_uring/bpf.c
+++ b/io_uring/bpf.c
@@ -2,6 +2,7 @@
#include <linux/bpf_verifier.h>
#include "io_uring.h"
+#include "memmap.h"
#include "bpf.h"
#include "register.h"
@@ -72,6 +73,14 @@ struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
return cqe;
}
+__bpf_kfunc
+void *bpf_io_uring_get_region(struct io_ring_ctx *ctx, u64 size__retsz)
+{
+ if (size__retsz > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
+ return NULL;
+ return io_region_get_ptr(&ctx->ring_region);
+}
+
__bpf_kfunc_end_defs();
BTF_KFUNCS_START(io_uring_kfunc_set)
@@ -80,6 +89,7 @@ BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
+BTF_ID_FLAGS(func, bpf_io_uring_get_region, KF_RET_NULL);
BTF_KFUNCS_END(io_uring_kfunc_set)
static const struct btf_kfunc_id_set bpf_io_uring_kfunc_set = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 54c6953a8b84..ac4803b5933c 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -343,6 +343,7 @@ struct bpf_kfunc_call_arg_meta {
int uid;
} map;
u64 mem_size;
+ bool mem_size_found;
};
struct btf *btf_vmlinux;
@@ -11862,6 +11863,11 @@ static bool is_kfunc_arg_ignore(const struct btf *btf, const struct btf_param *a
return btf_param_match_suffix(btf, arg, "__ign");
}
+static bool is_kfunc_arg_ret_size(const struct btf *btf, const struct btf_param *arg)
+{
+ return btf_param_match_suffix(btf, arg, "__retsz");
+}
+
static bool is_kfunc_arg_map(const struct btf *btf, const struct btf_param *arg)
{
return btf_param_match_suffix(btf, arg, "__map");
@@ -12912,7 +12918,21 @@ static int check_kfunc_args(struct bpf_verifier_env *env, struct bpf_kfunc_call_
return -EINVAL;
}
- if (is_kfunc_arg_constant(meta->btf, &args[i])) {
+ if (is_kfunc_arg_ret_size(btf, &args[i])) {
+ if (!tnum_is_const(reg->var_off)) {
+ verbose(env, "R%d must be a known constant\n", regno);
+ return -EINVAL;
+ }
+ if (meta->mem_size_found) {
+ verbose(env, "Only one return size argument is permitted\n");
+ return -EINVAL;
+ }
+ meta->mem_size = reg->var_off.value;
+ meta->mem_size_found = true;
+ ret = mark_chain_precision(env, regno);
+ if (ret)
+ return ret;
+ } else if (is_kfunc_arg_constant(meta->btf, &args[i])) {
if (meta->arg_constant.found) {
verbose(env, "verifier internal error: only one constant argument permitted\n");
return -EFAULT;
@@ -13816,6 +13836,12 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
} else if (btf_type_is_void(ptr_type)) {
/* kfunc returning 'void *' is equivalent to returning scalar */
mark_reg_unknown(env, regs, BPF_REG_0);
+
+ if (meta.mem_size_found) {
+ mark_reg_known_zero(env, regs, BPF_REG_0);
+ regs[BPF_REG_0].type = PTR_TO_MEM;
+ regs[BPF_REG_0].mem_size = meta.mem_size;
+ }
} else if (!__btf_type_is_struct(ptr_type)) {
if (!meta.r0_size) {
__u32 sz;
--
Pavel Begunkov
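From the program side, the __retsz experiment above would be used roughly
as in the sketch below; the extern declaration and the constant size are
illustrative assumptions, not part of the diff.

	extern void *bpf_io_uring_get_region(struct io_ring_ctx *ctx,
					     __u64 size__retsz) __ksym;

	SEC("struct_ops.s/handle_events")
	int BPF_PROG(handle_events, struct io_ring_ctx *ctx,
		     struct iou_loop_state *state)
	{
		/* the size must be a verifier-known constant for the hack to apply */
		char *region = bpf_io_uring_get_region(ctx, 4096);

		if (!region)
			return 0;
		/* region[0..4095] is now a readable/writeable PTR_TO_MEM range */
		region[0] = 0;
		return 0;
	}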
On Thu, Jun 12, 2025 at 6:25 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/12/25 03:47, Alexei Starovoitov wrote:
> > On Fri, Jun 6, 2025 at 6:58 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> ...>> +__bpf_kfunc
> >> +struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
> >> +{
> >> + struct io_rings *rings = ctx->rings;
> >> + unsigned int mask = ctx->cq_entries - 1;
> >> + unsigned head = rings->cq.head;
> >> + struct io_uring_cqe *cqe;
> >> +
> >> + /* TODO CQE32 */
> >> + if (head == rings->cq.tail)
> >> + return NULL;
> >> +
> >> + cqe = &rings->cqes[head & mask];
> >> + rings->cq.head++;
> >> + return cqe;
> >> +}
> >> +
> >> +__bpf_kfunc_end_defs();
> >> +
> >> +BTF_KFUNCS_START(io_uring_kfunc_set)
> >> +BTF_ID_FLAGS(func, bpf_io_uring_submit_sqes, KF_SLEEPABLE);
> >> +BTF_ID_FLAGS(func, bpf_io_uring_post_cqe, KF_SLEEPABLE);
> >> +BTF_ID_FLAGS(func, bpf_io_uring_queue_sqe, KF_SLEEPABLE);
> >> +BTF_ID_FLAGS(func, bpf_io_uring_get_cqe, 0);
> >> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
> >> +BTF_KFUNCS_END(io_uring_kfunc_set)
> >
> > This is not safe in general.
> > The verifier doesn't enforce argument safety here.
> > As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
> > And once you do that you'll see that the verifier
> > doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
> > as trusted.
>
> Thanks, will add it. If I read it right, without the flag the
> program can, for example, create a struct io_ring_ctx on stack,
> fill it with nonsense and pass to kfuncs. Is that right?
No. The verifier will only allow a pointer to struct io_ring_ctx
to be passed, but it may not be fully trusted.
The verifier has 3 types of pointers to kernel structures:
1. ptr_to_btf_id
2. ptr_to_btf_id | trusted
3. ptr_to_btf_id | untrusted
1st was added long ago for tracing and gradually got adopted
for non-tracing needs, but it has a foot gun, since
all pointer walks keep ptr_to_btf_id type.
It's fine in some cases to follow pointers, but not in all.
Hence 2nd variant was added and there
foo->bar dereference needs to be explicitly allowed
instead of allowed by default like for 1st kind.
All loads through 1 and 3 are implemented as probe_read_kernel.
while loads from 2 are direct loads.
So kfuncs without KF_TRUSTED_ARGS with struct io_ring_ctx *ctx
argument are likely fine and safe, since it's impossible
to get this io_ring_ctx pointer by dereferencing some other pointer.
But better to tighten safety from the start.
We recommend KF_TRUSTED_ARGS for all kfuncs and
eventually it will be the default.
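In program terms, the distinction reads roughly like this (a sketch
assuming the kfuncs end up with KF_TRUSTED_ARGS as recommended):

	/* inside the struct_ops callback: */
	int BPF_PROG(handle_events, struct io_ring_ctx *ctx,
		     struct iou_loop_state *state)
	{
		/* 'ctx' is the callback argument itself: ptr_to_btf_id | trusted,
		 * so a KF_TRUSTED_ARGS kfunc will accept it */
		bpf_io_uring_submit_sqes(ctx, 1);

		/* a pointer reached by walking another pointer does not stay
		 * trusted; a KF_TRUSTED_ARGS kfunc taking 'struct io_rings *'
		 * would reject it */
		struct io_rings *rings = ctx->rings;

		(void)rings;
		return 0;
	}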
> > Looking at your example:
> > https://github.com/axboe/liburing/commit/706237127f03e15b4cc9c7c31c16d34dbff37cdc
> > it doesn't care about contents of cqe and doesn't pass it further.
> > So sort-of ok-ish right now,
> > but if you need to pass cqe to another kfunc
> > you would need to add an open coded iterator for cqe-s
> > with appropriate KF_ITER* flags
> > or maybe add acquire/release semantics for cqe.
> > Like, get_cqe will be KF_ACQUIRE, and you'd need
> > matching KF_RELEASE kfunc,
> > so that 'cqe' is not lost.
> > Then 'cqe' will be trusted and you can pass it as actual 'cqe'
> > into another kfunc.
> > Without KF_ACQUIRE the verifier sees that get_cqe*() kfuncs
> > return 'struct io_uring_cqe *' and it's ok for tracing
> > or passing into kfuncs like bpf_io_uring_queue_sqe()
> > that don't care about a particular type,
> > but not ok for full tracking of objects.
>
> I don't need type safety for SQEs / CQEs, they're supposed to be simple
> memory blobs containing userspace data only. SQ / CQ are shared with
> userspace, and the kfuncs can leak the content of passed CQE / SQE to
> userspace. But I'd like to find a way to reject programs stashing
> kernel pointers / data into them.
That's impossible.
If you're worried about bpf prog exposing kernel addresses
to user space then abort the whole thing.
CAP_PERFMON is required for the majority of bpf progs.
>
> BPF_PROG(name, struct io_ring_ctx *io_ring)
> {
> struct io_uring_cqe *cqe = ...;
> cqe->user_data = io_ring;
> cqe->res = io_ring->private_field;
> }
>
> And I mentioned in the message, I rather want to get rid of half of the
> kfuncs, and give BPF direct access to the SQ/CQ instead. Schematically
> it should look like this:
>
> BPF_PROG(name, struct io_ring_ctx *ring)
> {
> struct io_uring_sqe *sqes = get_SQ(ring);
>
> sqes[ring->sq_tail]->opcode = OP_NOP;
> bpf_kfunc_submit_sqes(ring, 1);
>
> struct io_uring_cqe *cqes = get_CQ(ring);
> print_cqe(&cqes[ring->cq_head]);
> }
>
> I hacked up RET_PTR_TO_MEM for kfuncs, the diff is below, but it'd be
> great to get rid of the constness of the size argument. I need to
> digest arenas first as conceptually they look very close.
arena is a special memory region where every byte is writeable
by user space.
>
> > For the next revision please post all selftests, examples,
> > and bpf progs on the list,
> > so people don't need to search github.
>
> Did the link in the cover letter not work for you? I'm confused
> since it's all in a branch in my tree, but you linked to the same
> patches but in Jens' tree, and I have zero clue what they're
> doing there or how you found them.
External links can disappear. It's not good for reviewers and
for keeping the history of conversation.
>
> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
> index 9494e4289605..400a06a74b5d 100644
> --- a/io_uring/bpf.c
> +++ b/io_uring/bpf.c
> @@ -2,6 +2,7 @@
> #include <linux/bpf_verifier.h>
>
> #include "io_uring.h"
> +#include "memmap.h"
> #include "bpf.h"
> #include "register.h"
>
> @@ -72,6 +73,14 @@ struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
> return cqe;
> }
>
> +__bpf_kfunc
> +void *bpf_io_uring_get_region(struct io_ring_ctx *ctx, u64 size__retsz)
> +{
> + if (size__retsz > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
> + return NULL;
> + return io_region_get_ptr(&ctx->ring_region);
> +}
and bpf prog should be able to read/write anything in
[ctx->ring_region->ptr, ..ptr + size] region ?
Populating (creating) dynptr is probably better.
See bpf_dynptr_from*()
but what is the lifetime of that memory ?
On 6/13/25 01:25, Alexei Starovoitov wrote:
> On Thu, Jun 12, 2025 at 6:25 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
...>>>> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
>>>> +BTF_KFUNCS_END(io_uring_kfunc_set)
>>>
>>> This is not safe in general.
>>> The verifier doesn't enforce argument safety here.
>>> As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
>>> And once you do that you'll see that the verifier
>>> doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
>>> as trusted.
>>
>> Thanks, will add it. If I read it right, without the flag the
>> program can, for example, create a struct io_ring_ctx on stack,
>> fill it with nonsense and pass to kfuncs. Is that right?
>
> No. The verifier will only allow a pointer to struct io_ring_ctx
> to be passed, but it may not be fully trusted.
>
> The verifier has 3 types of pointers to kernel structures:
> 1. ptr_to_btf_id
> 2. ptr_to_btf_id | trusted
> 3. ptr_to_btf_id | untrusted
>
> 1st was added long ago for tracing and gradually got adopted
> for non-tracing needs, but it has a foot gun, since
> all pointer walks keep ptr_to_btf_id type.
> It's fine in some cases to follow pointers, but not in all.
> Hence 2nd variant was added and there
> foo->bar dereference needs to be explicitly allowed
> instead of allowed by default like for 1st kind.
>
> All loads through 1 and 3 are implemented as probe_read_kernel.
> while loads from 2 are direct loads.
>
> So kfuncs without KF_TRUSTED_ARGS with struct io_ring_ctx *ctx
> argument are likely fine and safe, since it's impossible
> to get this io_ring_ctx pointer by dereferencing some other pointer.
> But better to tighten safety from the start.
> We recommend KF_TRUSTED_ARGS for all kfuncs and
> eventually it will be the default.
Sure, I'll add it, thanks for the explanation
...>> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
>> index 9494e4289605..400a06a74b5d 100644
>> --- a/io_uring/bpf.c
>> +++ b/io_uring/bpf.c
>> @@ -2,6 +2,7 @@
>> #include <linux/bpf_verifier.h>
>>
>> #include "io_uring.h"
>> +#include "memmap.h"
>> #include "bpf.h"
>> #include "register.h"
>>
>> @@ -72,6 +73,14 @@ struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
>> return cqe;
>> }
>>
>> +__bpf_kfunc
>> +void *bpf_io_uring_get_region(struct io_ring_ctx *ctx, u64 size__retsz)
>> +{
>> + if (size__retsz > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
>> + return NULL;
>> + return io_region_get_ptr(&ctx->ring_region);
>> +}
>
> and bpf prog should be able to read/write anything in
> [ctx->ring_region->ptr, ..ptr + size] region ?
Right, and it's already rw mmap'ed into the user space.
> Populating (creating) dynptr is probably better.
> See bpf_dynptr_from*()
>
> but what is the lifetime of that memory ?
It's valid within a single run of the callback but shouldn't cross
into another invocation. Specifically, it's protected by the lock,
but that can be tuned. Does that match with what PTR_TO_MEM expects?
I can add refcounting for longer term pinning, maybe to store it
as a bpf map or whatever is the right way, but I'd rather avoid
anything expensive in the kfunc as that'll likely be called on
every program run.
--
Pavel Begunkov
On Fri, Jun 13, 2025 at 9:11 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
>
> On 6/13/25 01:25, Alexei Starovoitov wrote:
> > On Thu, Jun 12, 2025 at 6:25 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
> ...>>>> +BTF_ID_FLAGS(func, bpf_io_uring_extract_next_cqe, KF_RET_NULL);
> >>>> +BTF_KFUNCS_END(io_uring_kfunc_set)
> >>>
> >>> This is not safe in general.
> >>> The verifier doesn't enforce argument safety here.
> >>> As a minimum you need to add KF_TRUSTED_ARGS flag to all kfunc.
> >>> And once you do that you'll see that the verifier
> >>> doesn't recognize the cqe returned from bpf_io_uring_get_cqe*()
> >>> as trusted.
> >>
> >> Thanks, will add it. If I read it right, without the flag the
> >> program can, for example, create a struct io_ring_ctx on stack,
> >> fill it with nonsense and pass to kfuncs. Is that right?
> >
> > No. The verifier will only allow a pointer to struct io_ring_ctx
> > to be passed, but it may not be fully trusted.
> >
> > The verifier has 3 types of pointers to kernel structures:
> > 1. ptr_to_btf_id
> > 2. ptr_to_btf_id | trusted
> > 3. ptr_to_btf_id | untrusted
> >
> > 1st was added long ago for tracing and gradually got adopted
> > for non-tracing needs, but it has a foot gun, since
> > all pointer walks keep ptr_to_btf_id type.
> > It's fine in some cases to follow pointers, but not in all.
> > Hence 2nd variant was added and there
> > foo->bar dereference needs to be explicitly allowed
> > instead of allowed by default like for 1st kind.
> >
> > All loads through 1 and 3 are implemented as probe_read_kernel.
> > while loads from 2 are direct loads.
> >
> > So kfuncs without KF_TRUSTED_ARGS with struct io_ring_ctx *ctx
> > argument are likely fine and safe, since it's impossible
> > to get this io_ring_ctx pointer by dereferencing some other pointer.
> > But better to tighten safety from the start.
> > We recommend KF_TRUSTED_ARGS for all kfuncs and
> > eventually it will be the default.
>
> Sure, I'll add it, thanks for the explanation
>
> ...>> diff --git a/io_uring/bpf.c b/io_uring/bpf.c
> >> index 9494e4289605..400a06a74b5d 100644
> >> --- a/io_uring/bpf.c
> >> +++ b/io_uring/bpf.c
> >> @@ -2,6 +2,7 @@
> >> #include <linux/bpf_verifier.h>
> >>
> >> #include "io_uring.h"
> >> +#include "memmap.h"
> >> #include "bpf.h"
> >> #include "register.h"
> >>
> >> @@ -72,6 +73,14 @@ struct io_uring_cqe *bpf_io_uring_extract_next_cqe(struct io_ring_ctx *ctx)
> >> return cqe;
> >> }
> >>
> >> +__bpf_kfunc
> >> +void *bpf_io_uring_get_region(struct io_ring_ctx *ctx, u64 size__retsz)
> >> +{
> >> + if (size__retsz > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
> >> + return NULL;
> >> + return io_region_get_ptr(&ctx->ring_region);
> >> +}
> >
> > and bpf prog should be able to read/write anything in
> > [ctx->ring_region->ptr, ..ptr + size] region ?
>
> Right, and it's already rw mmap'ed into the user space.
>
> > Populating (creating) dynptr is probably better.
> > See bpf_dynptr_from*()
> >
> > but what is the lifetime of that memory ?
>
> It's valid within a single run of the callback but shouldn't cross
> into another invocation. Specifically, it's protected by the lock,
> but that can be tuned. Does that match with what PTR_TO_MEM expects?
yes. PTR_TO_MEM lasts for the duration of the prog.
> I can add refcounting for longer term pinning, maybe to store it
> as a bpf map or whatever is the right way, but I'd rather avoid
> anything expensive in the kfunc as that'll likely be called on
> every program run.
yeah. let's not add any refcounting.
It sounds like you want something similar to
__bpf_kfunc __u8 *
hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const
size_t rdwr_buf_size)
we have a special hack for it already in the verifier.
The argument needs to be called rdwr_buf_size,
then it will be used to establish the range of PTR_TO_MEM.
It has to be run-time constant.
What you're proposing with "__retsz" is a cleaner version of the same.
But consider bpf_dynptr_from_io_uring(struct io_ring_ctx *ctx)
it can create a dynamically sized region,
and later use bpf_dynptr_slice_rdwr() to get writeable chunk of it.
I feel that __retsz approach may actually be a better fit at the end,
if you're ok with constant arg.
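For comparison, the dynptr route would look roughly like the sketch below
on the program side. bpf_dynptr_from_io_uring() does not exist and is
purely hypothetical here; bpf_dynptr_slice_rdwr() is the existing kfunc,
and for a non-contiguous region the slice may come back through the bounce
buffer rather than pointing into the ring.

	extern int bpf_dynptr_from_io_uring(struct io_ring_ctx *ctx,
					    struct bpf_dynptr *ptr) __ksym;	/* hypothetical */
	extern void *bpf_dynptr_slice_rdwr(const struct bpf_dynptr *p, __u32 offset,
					   void *buffer__opt, __u32 buffer__szk) __ksym;

	SEC("struct_ops.s/handle_events")
	int BPF_PROG(handle_events, struct io_ring_ctx *ctx,
		     struct iou_loop_state *state)
	{
		struct io_uring_sqe buf, *sqe;
		struct bpf_dynptr rd;

		if (bpf_dynptr_from_io_uring(ctx, &rd))		/* hypothetical */
			return 0;

		/* writeable view of one SQE-sized chunk at the start of the region */
		sqe = bpf_dynptr_slice_rdwr(&rd, 0, &buf, sizeof(buf));
		if (sqe)
			sqe->opcode = IORING_OP_NOP;
		return 0;
	}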
On 6/13/25 20:51, Alexei Starovoitov wrote:
> On Fri, Jun 13, 2025 at 9:11 AM Pavel Begunkov <asml.silence@gmail.com> wrote:
...>>
>> It's valid within a single run of the callback but shouldn't cross
>> into another invocation. Specifically, it's protected by the lock,
>> but that can be tuned. Does that match with what PTR_TO_MEM expects?
>
> yes. PTR_TO_MEM lasts for the duration of the prog.
>
>> I can add refcounting for longer term pinning, maybe to store it
>> as a bpf map or whatever is the right way, but I'd rather avoid
>> anything expensive in the kfunc as that'll likely be called on
>> every program run.
>
> yeah. let's not add any refcounting.
>
> It sounds like you want something similar to
> __bpf_kfunc __u8 *
> hid_bpf_get_data(struct hid_bpf_ctx *ctx, unsigned int offset, const
> size_t rdwr_buf_size)
>
> we have a special hack for it already in the verifier.
> The argument needs to be called rdwr_buf_size,
> then it will be used to establish the range of PTR_TO_MEM.
> It has to be run-time constant.

Great, I can just use that.

> What you're proposing with "__retsz" is a cleaner version of the same.
> But consider bpf_dynptr_from_io_uring(struct io_ring_ctx *ctx)
> it can create a dynamically sized region,
> and later use bpf_dynptr_slice_rdwr() to get writeable chunk of it.
>
> I feel that __retsz approach may actually be a better fit at the end,
> if you're ok with constant arg.

I took a quick look; 16MB sounds a bit restrictive long term. I'll just
go for rdwr_buf_size while experimenting and hopefully will be able to
make a more educated choice later.

--
Pavel Begunkov
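If the existing rdwr_buf_size special case can indeed be reused as-is
(an assumption), the kfunc from the earlier diff would reduce to something
like this sketch:

	__bpf_kfunc
	void *bpf_io_uring_get_region(struct io_ring_ctx *ctx,
				      const size_t rdwr_buf_size)
	{
		/* constant size checked against the mapped ring region */
		if (rdwr_buf_size > ((u64)ctx->ring_region.nr_pages << PAGE_SHIFT))
			return NULL;
		return io_region_get_ptr(&ctx->ring_region);
	}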
On 6/12/25 7:26 AM, Pavel Begunkov wrote:
>> For the next revision please post all selftests, examples,
>> and bpf progs on the list,
>> so people don't need to search github.
>
> Did the link in the cover letter not work for you? I'm confused
> since it's all in a branch in my tree, but you linked to the same
> patches but in Jens' tree, and I have zero clue what they're
> doing there or how you found them.

Puzzled me too, but if you go there, github will say:

"This commit does not belong to any branch on this repository, and may
belong to a fork outside of the repository."

which is exactly because it's not in my tree, but in your fork of my
tree. Pretty wonky GH behavior if you ask me, but there it is.

--
Jens Axboe