In the security-sensitive kexec_file_load case, the buffer holding the
kernel image must not be accessible from userspace.
Typically, BPF data flow occurs between user space and kernel space in
either direction. However, the above case presents a unique scenario
where the kernel, instead of a user task, reads data from a file, passes
it to a BPF program for parsing, and finally stores the parsed result.
This requires a mechanism to channel the intermediate data from the BPF
program directly to the kernel. BPF buffer parser kfuncs are introduced
to serve this purpose:
BTF_ID_FLAGS(func, bpf_get_parser_context, KF_ACQUIRE | KF_RET_NULL)
BTF_ID_FLAGS(func, bpf_put_parser_context, KF_RELEASE)
BTF_ID_FLAGS(func, bpf_buffer_parser, KF_SLEEPABLE)
where bpf_get_parser_context() and bpf_put_parser_context() manage the
trusted argument, and bpf_buffer_parser() forwards data to a callback
that processes the structured buffer constructed by the BPF program.
Signed-off-by: Pingfan Liu <piliu@redhat.com>
Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: David S. Miller <davem@davemloft.net>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Eduard Zingerman <eddyz87@gmail.com>
Cc: Song Liu <song@kernel.org>
Cc: Yonghong Song <yonghong.song@linux.dev>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Stanislav Fomichev <sdf@fomichev.me>
Cc: Hao Luo <haoluo@google.com>
Cc: Jiri Olsa <jolsa@kernel.org>
To: bpf@vger.kernel.org
---
include/linux/bpf.h | 20 ++++
kernel/bpf/Makefile | 3 +
kernel/bpf/bpf_buffer_parser.c | 186 +++++++++++++++++++++++++++++++++
3 files changed, 209 insertions(+)
create mode 100644 kernel/bpf/bpf_buffer_parser.c
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 05b34a6355b03..93a1c9163685f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -4005,4 +4005,24 @@ static inline int bpf_map_check_op_flags(struct bpf_map *map, u64 flags, u64 all
return 0;
}
+/*
+ * Kernel-private copy of a buffer produced by a BPF parser program.
+ * @buf:  kernel allocation holding the data (never exposed to userspace)
+ * @size: number of valid bytes in @buf
+ */
+struct bpf_parser_buf {
+	char *buf;
+	int size;
+};
+
+struct bpf_parser_context;
+typedef int (*bpf_parser_handler_t)(struct bpf_parser_context *ctx);
+
+/*
+ * Per-invocation context shared between the kernel caller and the BPF
+ * buffer-parser kfuncs.  Contexts are refcounted and looked up by their
+ * pointer value, which the kernel hands to the BPF program as an id.
+ * @ref:       lifetime refcount; dropped via put_bpf_parser_context()
+ * @hash_node: membership in the context lookup hash table
+ * @func:      callback that consumes the parsed buffer
+ * @buf:       transient parsed data, valid only while @func runs
+ * @data:      opaque cookie owned by the kernel caller
+ */
+struct bpf_parser_context {
+	struct kref ref;
+	struct hlist_node hash_node;
+	/* This callback should be sync so that @buf can be freed */
+	bpf_parser_handler_t func;
+	struct bpf_parser_buf *buf;
+	void *data;
+};
+
+struct bpf_parser_context *alloc_bpf_parser_context(bpf_parser_handler_t func,
+						    void *data);
+void put_bpf_parser_context(struct bpf_parser_context *ctx);
#endif /* _LINUX_BPF_H */
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 79cf22860a99b..cceff165b4037 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -66,6 +66,9 @@ obj-$(CONFIG_BPF_SYSCALL) += kmem_cache_iter.o
ifeq ($(CONFIG_DMA_SHARED_BUFFER),y)
obj-$(CONFIG_BPF_SYSCALL) += dmabuf_iter.o
endif
+ifeq ($(CONFIG_KEXEC_BPF),y)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_buffer_parser.o
+endif
CFLAGS_REMOVE_percpu_freelist.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_bpf_lru_list.o = $(CC_FLAGS_FTRACE)
diff --git a/kernel/bpf/bpf_buffer_parser.c b/kernel/bpf/bpf_buffer_parser.c
new file mode 100644
index 0000000000000..5d5c068330791
--- /dev/null
+++ b/kernel/bpf/bpf_buffer_parser.c
@@ -0,0 +1,186 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/hashtable.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/bpf.h>
+
+#define BPF_CONTEXT_HASH_BITS 10
+
+static DEFINE_SPINLOCK(bpf_parser_context_lock);
+static DEFINE_HASHTABLE(bpf_parser_context_map, BPF_CONTEXT_HASH_BITS);
+
+/* Generate a simple hash key from pointer address */
+static inline unsigned int bpf_parser_context_hash_key(struct bpf_parser_context *ctx)
+{
+	/* hash_ptr() folds the pointer value into BPF_CONTEXT_HASH_BITS bits */
+	return hash_ptr(ctx, BPF_CONTEXT_HASH_BITS);
+}
+
+/*
+ * kref release callback: runs when the last reference to @ctx is dropped.
+ * Frees any parsed buffer still attached, unlinks the context from the
+ * lookup hash so stale ids can no longer resolve to it, then frees it.
+ */
+static void release_bpf_parser_context(struct kref *kref)
+{
+	struct bpf_parser_context *ctx = container_of(kref, struct bpf_parser_context, ref);
+
+	/*
+	 * bpf_buffer_parser() temporarily stores the (void *)1 sentinel in
+	 * ctx->buf while the callback runs; never dereference it here.
+	 */
+	if (ctx->buf && ctx->buf != (struct bpf_parser_buf *)1) {
+		vfree(ctx->buf->buf);
+		kfree(ctx->buf);
+	}
+	spin_lock(&bpf_parser_context_lock);
+	hash_del(&ctx->hash_node);
+	spin_unlock(&bpf_parser_context_lock);
+	kfree(ctx);
+}
+
+/*
+ * Allocate a refcounted parser context and publish it in the lookup hash.
+ * The returned pointer doubles as the id handed to the BPF program, which
+ * redeems it through bpf_get_parser_context().
+ *
+ * @func: synchronous callback invoked by bpf_buffer_parser()
+ * @data: opaque cookie stored for the caller's use
+ *
+ * Returns the new context, or NULL on allocation failure.  The caller
+ * owns the initial reference and must drop it with
+ * put_bpf_parser_context().
+ */
+struct bpf_parser_context *alloc_bpf_parser_context(bpf_parser_handler_t func,
+						    void *data)
+{
+	struct bpf_parser_context *ctx;
+	unsigned int key;
+
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return NULL;
+	ctx->func = func;
+	ctx->data = data;
+	kref_init(&ctx->ref);
+	/* key is derived from the context's own address */
+	key = bpf_parser_context_hash_key(ctx);
+	spin_lock(&bpf_parser_context_lock);
+	hash_add(bpf_parser_context_map, &ctx->hash_node, key);
+	spin_unlock(&bpf_parser_context_lock);
+
+	return ctx;
+}
+
+/*
+ * Drop one reference on @ctx; the final put triggers
+ * release_bpf_parser_context().  NULL is tolerated as a no-op.
+ */
+void put_bpf_parser_context(struct bpf_parser_context *ctx)
+{
+	if (!ctx)
+		return;
+	kref_put(&ctx->ref, release_bpf_parser_context);
+}
+
+/*
+ * Resolve an id (a context's pointer value) back to a live context.
+ * Walking the hash table validates the id: an arbitrary value from the
+ * BPF program that was never published simply fails the lookup.
+ *
+ * Returns the context with a reference taken, or NULL if the id is
+ * unknown or the context is already on its way out.
+ */
+static struct bpf_parser_context *find_bpf_parser_context(unsigned long id)
+{
+	struct bpf_parser_context *ctx;
+	unsigned int key;
+	int cnt;
+
+	key = bpf_parser_context_hash_key((struct bpf_parser_context *)id);
+	spin_lock(&bpf_parser_context_lock);
+	hash_for_each_possible(bpf_parser_context_map, ctx, hash_node, key) {
+		if (ctx == (struct bpf_parser_context *)id) {
+			/*
+			 * The refcount may already have hit zero while the
+			 * release path waits for the lock; only hand out
+			 * the context if a reference can still be taken.
+			 */
+			cnt = kref_get_unless_zero(&ctx->ref);
+			if (!cnt)
+				ctx = NULL;
+			spin_unlock(&bpf_parser_context_lock);
+			return ctx;
+		}
+	}
+	spin_unlock(&bpf_parser_context_lock);
+
+	return NULL;
+}
+
+__bpf_kfunc_start_defs()
+
+/*
+ * kfunc: redeem an id for a referenced parser context.
+ * Returns NULL when @id does not name a live context; the verifier
+ * enforces the acquire/release pairing (KF_ACQUIRE | KF_RET_NULL).
+ */
+__bpf_kfunc struct bpf_parser_context *bpf_get_parser_context(unsigned long id)
+{
+	return find_bpf_parser_context(id);
+}
+
+/* kfunc: release the reference taken by bpf_get_parser_context() */
+__bpf_kfunc void bpf_put_parser_context(struct bpf_parser_context *ctx)
+{
+	put_bpf_parser_context(ctx);
+}
+
+/*
+ * Destructor registered with the verifier so references the BPF program
+ * failed to release are dropped automatically on program exit.
+ */
+__bpf_kfunc void bpf_parser_context_release_dtor(void *ctx)
+{
+	put_bpf_parser_context(ctx);
+}
+CFI_NOSEAL(bpf_parser_context_release_dtor);
+
+/*
+ * kfunc: hand a BPF-constructed buffer to the kernel-side callback.
+ *
+ * Copies @buf_sz bytes from @buf into a fresh kernel allocation, attaches
+ * it to @context, and invokes @context->func synchronously.  The copy and
+ * the wrapper are freed before returning, so the callback must fully
+ * consume the data.
+ *
+ * Returns the callback's result, or a negative errno:
+ * -EINVAL for a NULL/empty buffer or missing callback,
+ * -EBUSY  if another invocation is in flight on @context,
+ * -ENOMEM / copy error on allocation or copy failure.
+ */
+__bpf_kfunc int bpf_buffer_parser(char *buf, int buf_sz,
+				  struct bpf_parser_context *context)
+{
+	struct bpf_parser_buf *parser_buf;
+	void *old_val;
+	char *b;
+	int ret;
+
+	if (buf == NULL || buf_sz <= 0)
+		return -EINVAL;
+
+	if (unlikely(context->func == NULL))
+		return -EINVAL;
+
+	/*
+	 * Claim context->buf with a sentinel so a concurrent invocation
+	 * bails out with -EBUSY.  Every exit below restores it to NULL.
+	 */
+	old_val = cmpxchg(&context->buf, NULL, (void *)1);
+	if (old_val != NULL)
+		return -EBUSY;
+
+	ret = -ENOMEM;
+	b = __vmalloc(buf_sz, GFP_KERNEL_ACCOUNT | __GFP_ZERO);
+	if (!b)
+		goto out_unlock;
+
+	/* @buf is BPF-program memory; tolerate faults during the copy */
+	ret = copy_from_kernel_nofault(b, buf, buf_sz);
+	if (ret)
+		goto out_free_buf;
+
+	ret = -ENOMEM;
+	parser_buf = kmalloc(sizeof(*parser_buf), GFP_KERNEL);
+	if (!parser_buf)
+		goto out_free_buf;
+
+	parser_buf->buf = b;
+	parser_buf->size = buf_sz;
+	context->buf = parser_buf;
+	/* @func should be a sync call */
+	ret = context->func(context);
+	kfree(parser_buf);
+out_free_buf:
+	vfree(b);
+out_unlock:
+	context->buf = NULL;
+	return ret;
+}
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(buffer_parser_ids)
+BTF_ID_FLAGS(func, bpf_get_parser_context, KF_ACQUIRE | KF_RET_NULL)
+BTF_ID_FLAGS(func, bpf_put_parser_context, KF_RELEASE)
+BTF_ID_FLAGS(func, bpf_buffer_parser, KF_SLEEPABLE)
+BTF_KFUNCS_END(buffer_parser_ids)
+
+static const struct btf_kfunc_id_set buffer_parser_kfunc_set = {
+	.owner = THIS_MODULE,
+	.set = &buffer_parser_ids,
+};
+
+/* [0] = struct to attach the dtor to, [1] = the dtor kfunc itself */
+BTF_ID_LIST(buffer_parser_dtor_ids)
+BTF_ID(struct, bpf_parser_context)
+BTF_ID(func, bpf_parser_context_release_dtor)
+
+/*
+ * Register the buffer-parser kfuncs for tracing programs and hook the
+ * bpf_parser_context destructor into the verifier's dtor table.
+ */
+static int __init buffer_parser_kfunc_init(void)
+{
+	int ret;
+	const struct btf_id_dtor_kfunc buffer_parser_dtors[] = {
+		{
+			.btf_id = buffer_parser_dtor_ids[0],
+			.kfunc_btf_id = buffer_parser_dtor_ids[1]
+		},
+	};
+
+	ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &buffer_parser_kfunc_set);
+	return ret ?: register_btf_id_dtor_kfuncs(buffer_parser_dtors,
+						  ARRAY_SIZE(buffer_parser_dtors),
+						  THIS_MODULE);
+}
+
+late_initcall(buffer_parser_kfunc_init);
--
2.49.0
> diff --git a/kernel/bpf/bpf_buffer_parser.c b/kernel/bpf/bpf_buffer_parser.c
> new file mode 100644
> index 000000000000..5d5c06833079
> --- /dev/null
> +++ b/kernel/bpf/bpf_buffer_parser.c
[ ... ]
> +static void release_bpf_parser_context(struct kref *kref)
> +{
> + struct bpf_parser_context *ctx = container_of(kref, struct bpf_parser_context, ref);
> +
> + if (ctx->buf) {
> + vfree(ctx->buf->buf);
> + kfree(ctx->buf);
> + }
If release_bpf_parser_context() runs while bpf_buffer_parser() has
stored the sentinel (void *)1 in ctx->buf via:
old_val = cmpxchg(&context->buf, NULL, (void *)1);
then ctx->buf is non-NULL but not a valid struct bpf_parser_buf
pointer, and vfree(ctx->buf->buf) will dereference address 1.
The BPF verifier prevents this today by holding the acquired reference
across the kfunc call, but release_bpf_parser_context() has no
awareness of the sentinel value. Would it be worth adding a check
here to handle the sentinel, or using a separate flag/mutex instead
of overloading ctx->buf?
[ ... ]
> +__bpf_kfunc int bpf_buffer_parser(char *buf, int buf_sz,
> + struct bpf_parser_context *context)
> +{
[ ... ]
> + ret = copy_from_kernel_nofault(b, buf, buf_sz);
> + if (!!ret) {
The !!ret is equivalent to just testing ret here. Was the double
negation intentional?
> + context->buf = NULL;
> + vfree(b);
> + return ret;
> + }
[ ... ]
> +static const struct btf_kfunc_id_set buffer_parser_kfunc_set = {
> + .owner = THIS_MODULE,
> + .set = &buffer_parser_ids,
> +};
These two lines use spaces for indentation instead of tabs.
> +
> +
There is a double blank line here.
> +BTF_ID_LIST(buffer_parser_dtor_ids)
[ ... ]
> + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &buffer_parser_kfunc_set);
> + return ret ?: register_btf_id_dtor_kfuncs(buffer_parser_dtors,
There is a double space before ret.
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/23393379437
© 2016 - 2026 Red Hat, Inc.