On ARM64, the pointer to the current task_struct is always available in
the sp_el0 register, so the calls to bpf_get_current_task() and
bpf_get_current_task_btf() can each be inlined into a single MRS
instruction.
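Since SP_EL0 is an ordinary system register read (op0=3, op1=0, CRn=4,
CRm=1, op2=0), the JIT only has to emit one fixed opcode with the
destination register in bits [4:0]. A minimal sketch of that encoding in
plain C (illustrative only; the patch itself uses the kernel's
A64_MRS_SP_EL0 helper for this):

  #include <stdint.h>

  /* Encode "mrs x<rt>, sp_el0": the SP_EL0 operand fields yield the
   * base opcode 0xd5384100, and the destination register goes in
   * bits [4:0]. */
  static uint32_t mrs_sp_el0(uint32_t rt)
  {
          return 0xd5384100u | (rt & 0x1f);
  }

  /* mrs_sp_el0(0) == 0xd5384100, i.e. "mrs x0, sp_el0". */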
Here is the difference before and after this change:
Before:
; struct task_struct *task = bpf_get_current_task_btf();
54: mov x10, #0xffffffffffff7978 // #-34440
58: movk x10, #0x802b, lsl #16
5c: movk x10, #0x8000, lsl #32
60: blr x10 --------------> 0xffff8000802b7978 <+0>: mrs x0, sp_el0
64: add x7, x0, #0x0 <-------------- 0xffff8000802b797c <+4>: ret
After:
; struct task_struct *task = bpf_get_current_task_btf();
54: mrs x7, sp_el0
This shows around a 1% performance improvement in an artificial
microbenchmark.
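For context, a hypothetical sketch of that kind of artificial
microbenchmark (not the exact program behind the figure above) is a
tracing program whose body is dominated by the helper call:

  // SPDX-License-Identifier: GPL-2.0
  /* Hypothetical microbenchmark sketch: every invocation calls
   * bpf_get_current_task_btf(), so the helper-call overhead shows up
   * directly in the events/sec rate. Illustrative only. */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  long hits = 0;

  SEC("tp/syscalls/sys_enter_getpgid")
  int bench_get_current_task_btf(void *ctx)
  {
          /* Before this change: an out-of-line call through blr.
           * After: a single "mrs x7, sp_el0" emitted by the JIT. */
          struct task_struct *task = bpf_get_current_task_btf();

          if (task)
                  __sync_fetch_and_add(&hits, 1);
          return 0;
  }

  char LICENSE[] SEC("license") = "GPL";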
Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
---
arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 720336d28856..b838dab3bd26 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1244,6 +1244,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
break;
}
+ /* Implement helper call to bpf_get_current_task/_btf() inline */
+ if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
+ insn->imm == BPF_FUNC_get_current_task_btf)) {
+ emit(A64_MRS_SP_EL0(r0), ctx);
+ break;
+ }
+
ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
&func_addr, &func_addr_fixed);
if (ret < 0)
@@ -2581,6 +2588,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
{
switch (imm) {
case BPF_FUNC_get_smp_processor_id:
+ case BPF_FUNC_get_current_task:
+ case BPF_FUNC_get_current_task_btf:
return true;
default:
return false;
--
2.40.1
On Wed, Jun 19, 2024 at 6:25 AM Puranjay Mohan <puranjay@kernel.org> wrote:
>
> On ARM64, the pointer to the current task_struct is always available in
> the sp_el0 register, so the calls to bpf_get_current_task() and
> bpf_get_current_task_btf() can each be inlined into a single MRS
> instruction.
>
> Here is the difference before and after this change:
>
> Before:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
> 54: mov x10, #0xffffffffffff7978 // #-34440
> 58: movk x10, #0x802b, lsl #16
> 5c: movk x10, #0x8000, lsl #32
> 60: blr x10 --------------> 0xffff8000802b7978 <+0>: mrs x0, sp_el0
> 64: add x7, x0, #0x0 <-------------- 0xffff8000802b797c <+4>: ret
>
> After:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
> 54: mrs x7, sp_el0
>
> This shows around a 1% performance improvement in an artificial
> microbenchmark.
>
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
These are frequently used helpers, similar to get_smp_processor_id(),
so I think it makes sense to optimize them.
Acked-by: Andrii Nakryiko <andrii@kernel.org>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 720336d28856..b838dab3bd26 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1244,6 +1244,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
> break;
> }
>
> + /* Implement helper call to bpf_get_current_task/_btf() inline */
> + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
> + insn->imm == BPF_FUNC_get_current_task_btf)) {
> + emit(A64_MRS_SP_EL0(r0), ctx);
> + break;
> + }
> +
> ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
> &func_addr, &func_addr_fixed);
> if (ret < 0)
> @@ -2581,6 +2588,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
> {
> switch (imm) {
> case BPF_FUNC_get_smp_processor_id:
> + case BPF_FUNC_get_current_task:
> + case BPF_FUNC_get_current_task_btf:
> return true;
> default:
> return false;
> --
> 2.40.1
>
On 6/19/2024 9:13 PM, Puranjay Mohan wrote:
> On ARM64, the pointer to the current task_struct is always available in
> the sp_el0 register, so the calls to bpf_get_current_task() and
> bpf_get_current_task_btf() can each be inlined into a single MRS
> instruction.
>
> Here is the difference before and after this change:
>
> Before:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
> 54: mov x10, #0xffffffffffff7978 // #-34440
> 58: movk x10, #0x802b, lsl #16
> 5c: movk x10, #0x8000, lsl #32
> 60: blr x10 --------------> 0xffff8000802b7978 <+0>: mrs x0, sp_el0
> 64: add x7, x0, #0x0 <-------------- 0xffff8000802b797c <+4>: ret
>
> After:
>
> ; struct task_struct *task = bpf_get_current_task_btf();
> 54: mrs x7, sp_el0
>
> This shows around a 1% performance improvement in an artificial
> microbenchmark.
>
I think it would be better if more detailed data could be provided.
> Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
> ---
> arch/arm64/net/bpf_jit_comp.c | 9 +++++++++
> 1 file changed, 9 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 720336d28856..b838dab3bd26 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1244,6 +1244,13 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
> break;
> }
>
> + /* Implement helper call to bpf_get_current_task/_btf() inline */
> + if (insn->src_reg == 0 && (insn->imm == BPF_FUNC_get_current_task ||
> + insn->imm == BPF_FUNC_get_current_task_btf)) {
> + emit(A64_MRS_SP_EL0(r0), ctx);
> + break;
> + }
> +
> ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass,
> &func_addr, &func_addr_fixed);
> if (ret < 0)
> @@ -2581,6 +2588,8 @@ bool bpf_jit_inlines_helper_call(s32 imm)
> {
> switch (imm) {
> case BPF_FUNC_get_smp_processor_id:
> + case BPF_FUNC_get_current_task:
> + case BPF_FUNC_get_current_task_btf:
> return true;
> default:
> return false;
Acked-by: Xu Kuohai <xukuohai@huawei.com>