From: Pu Lehui <pulehui@huawei.com>
Implement bpf_arch_text_poke for RV64. For the call scenario,
the ftrace framework reserves 4 nops at the entry of each RV64
kernel function, and uses an auipc+jalr instruction pair to call
kernel or module functions. However, since the auipc+jalr call
sequence cannot be patched atomically, we need to use
stop_machine so the instructions are patched while no other CPU
is executing them. As for the jump scenario, since we only jump
inside the trampoline, a jal instruction is sufficient.
Signed-off-by: Pu Lehui <pulehui@huawei.com>
---
arch/riscv/net/bpf_jit.h | 5 ++
arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++-
2 files changed, 134 insertions(+), 2 deletions(-)
diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h
index d926e0f7ef57..bf9802a63061 100644
--- a/arch/riscv/net/bpf_jit.h
+++ b/arch/riscv/net/bpf_jit.h
@@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ)
return rv_i_insn(imm11_0, 0, 0, 0, 0xf);
}
+static inline u32 rv_nop(void)
+{
+ return rv_i_insn(0, 0, 0, 0, 0x13);
+}
+
/* RVC instrutions. */
static inline u16 rvc_addi4spn(u8 rd, u32 imm10)
diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c
index bf4721a99a09..fa8b03c52463 100644
--- a/arch/riscv/net/bpf_jit_comp64.c
+++ b/arch/riscv/net/bpf_jit_comp64.c
@@ -8,6 +8,8 @@
#include <linux/bitfield.h>
#include <linux/bpf.h>
#include <linux/filter.h>
+#include <linux/memory.h>
+#include <linux/stop_machine.h>
#include "bpf_jit.h"
#define RV_REG_TCC RV_REG_A6
@@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx)
if (!is_tail_call)
emit_mv(RV_REG_A0, RV_REG_A5, ctx);
emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA,
- is_tail_call ? 4 : 0, /* skip TCC init */
+ is_tail_call ? 20 : 0, /* skip reserved nops and TCC init */
ctx);
}
@@ -615,6 +617,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
return 0;
}
+struct text_poke_args {
+ void *addr;
+ const void *insns;
+ size_t len;
+ atomic_t cpu_count;
+};
+
+static int do_text_poke(void *data)
+{
+ int ret = 0;
+ struct text_poke_args *patch = data;
+
+ if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) {
+ ret = patch_text_nosync(patch->addr, patch->insns, patch->len);
+ atomic_inc(&patch->cpu_count);
+ } else {
+ while (atomic_read(&patch->cpu_count) <= num_online_cpus())
+ cpu_relax();
+ smp_mb();
+ }
+
+ return ret;
+}
+
+static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len)
+{
+ struct text_poke_args patch = {
+ .addr = addr,
+ .insns = insns,
+ .len = len,
+ .cpu_count = ATOMIC_INIT(0),
+ };
+
+ return stop_machine(do_text_poke, &patch, cpu_online_mask);
+}
+
+static int gen_call_or_nops(void *target, void *ip, u32 *insns)
+{
+ int i, ret;
+ s64 rvoff;
+ struct rv_jit_context ctx;
+
+ ctx.ninsns = 0;
+ ctx.insns = (u16 *)insns;
+
+ if (!target) {
+ for (i = 0; i < 4; i++)
+ emit(rv_nop(), &ctx);
+ return 0;
+ }
+
+ rvoff = (s64)(target - ip);
+ emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx);
+ ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx);
+ if (ret)
+ return ret;
+ emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx);
+
+ return 0;
+
+}
+
+static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr)
+{
+ int ret;
+ u32 old_insns[4], new_insns[4];
+
+ ret = gen_call_or_nops(old_addr, ip + 4, old_insns);
+ if (ret)
+ return ret;
+
+ ret = gen_call_or_nops(new_addr, ip + 4, new_insns);
+ if (ret)
+ return ret;
+
+ mutex_lock(&text_mutex);
+ if (memcmp(ip, old_insns, sizeof(old_insns))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memcmp(ip, new_insns, sizeof(new_insns)))
+ ret = bpf_text_poke_stop_machine(ip, new_insns, sizeof(new_insns));
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr)
+{
+ int ret;
+ u32 old_insn, new_insn;
+
+ old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> 1) : rv_nop();
+ new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> 1) : rv_nop();
+
+ mutex_lock(&text_mutex);
+ if (memcmp(ip, &old_insn, sizeof(old_insn))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (memcmp(ip, &new_insn, sizeof(new_insn)))
+ ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn));
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type,
+ void *old_addr, void *new_addr)
+{
+ if (!is_kernel_text((unsigned long)ip) &&
+ !is_bpf_text_address((unsigned long)ip))
+ return -ENOTSUPP;
+
+ return poke_type == BPF_MOD_CALL ?
+ bpf_text_poke_call(ip, old_addr, new_addr) :
+ bpf_text_poke_jump(ip, old_addr, new_addr);
+}
+
int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
bool extra_pass)
{
@@ -1266,7 +1389,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx,
void bpf_jit_build_prologue(struct rv_jit_context *ctx)
{
- int stack_adjust = 0, store_offset, bpf_stack_adjust;
+ int i, stack_adjust = 0, store_offset, bpf_stack_adjust;
bool is_main_prog = ctx->prog->aux->func_idx == 0;
bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16);
@@ -1294,6 +1417,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx)
store_offset = stack_adjust - 8;
+ /* reserve 4 nop insns */
+ for (i = 0; i < 4; i++)
+ emit(rv_nop(), ctx);
+
/* First instruction is always setting the tail-call-counter
* (TCC) register. This instruction is skipped for tail calls.
* Force using a 4-byte (non-compressed) instruction.
--
2.25.1
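
[To make the offsets above concrete: the 20-byte tail-call skip is the
4 reserved nops (16 bytes) plus the 4-byte TCC-init instruction, and
gen_call_or_nops() is handed ip + 4 because the auipc of the emitted
pair sits right after the 4-byte sd. Below is a minimal sketch, not
part of the patch, of the poked site and of the hi20/lo12 split an
auipc+jalr pair relies on, assuming emit_jump_and_link() keeps its
usual rounding; names and the scratch register are illustrative.]

    /*
     * Illustrative sketch only. The poked call site is four 32-bit slots:
     *
     *   ip + 0:  sd    ra, -8(sp)            (or nop)
     *   ip + 4:  auipc <scratch>, hi20       (or nop)
     *   ip + 8:  jalr  ra, lo12(<scratch>)   (or nop)
     *   ip + 12: ld    ra, -8(sp)            (or nop)
     *
     * The split below rounds hi20 so that
     * (hi20 << 12) + sign_extend12(lo12) == rvoff.
     */
    static void split_auipc_jalr(s64 rvoff, s64 *hi20, s64 *lo12)
    {
    	*hi20 = (rvoff + (1 << 11)) >> 12;
    	*lo12 = rvoff & 0xfff;	/* consumed as a sign-extended 12-bit imm */
    }
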
Pu Lehui <pulehui@huaweicloud.com> writes: > From: Pu Lehui <pulehui@huawei.com> > > Implement bpf_arch_text_poke for RV64. For call scenario, > ftrace framework reserve 4 nops for RV64 kernel function > as function entry, and use auipc+jalr instructions to call > kernel or module functions. However, since the auipc+jalr > call instructions is non-atomic operation, we need to use > stop-machine to make sure instruction patching in atomic > context. As for jump scenario, since we only jump inside > the trampoline, a jal instruction is sufficient. Hmm, is that really true? More below! > > Signed-off-by: Pu Lehui <pulehui@huawei.com> > --- > arch/riscv/net/bpf_jit.h | 5 ++ > arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++- > 2 files changed, 134 insertions(+), 2 deletions(-) > > diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h > index d926e0f7ef57..bf9802a63061 100644 > --- a/arch/riscv/net/bpf_jit.h > +++ b/arch/riscv/net/bpf_jit.h > @@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ) > return rv_i_insn(imm11_0, 0, 0, 0, 0xf); > } > > +static inline u32 rv_nop(void) > +{ > + return rv_i_insn(0, 0, 0, 0, 0x13); > +} > + > /* RVC instrutions. */ > > static inline u16 rvc_addi4spn(u8 rd, u32 imm10) > diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c > index bf4721a99a09..fa8b03c52463 100644 > --- a/arch/riscv/net/bpf_jit_comp64.c > +++ b/arch/riscv/net/bpf_jit_comp64.c > @@ -8,6 +8,8 @@ > #include <linux/bitfield.h> > #include <linux/bpf.h> > #include <linux/filter.h> > +#include <linux/memory.h> > +#include <linux/stop_machine.h> > #include "bpf_jit.h" > > #define RV_REG_TCC RV_REG_A6 > @@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) > if (!is_tail_call) > emit_mv(RV_REG_A0, RV_REG_A5, ctx); > emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, > - is_tail_call ? 4 : 0, /* skip TCC init */ > + is_tail_call ? 
20 : 0, /* skip reserved nops and TCC init */ > ctx); > } > > @@ -615,6 +617,127 @@ static int add_exception_handler(const struct bpf_insn *insn, > return 0; > } > > +struct text_poke_args { > + void *addr; > + const void *insns; > + size_t len; > + atomic_t cpu_count; > +}; > + > +static int do_text_poke(void *data) > +{ > + int ret = 0; > + struct text_poke_args *patch = data; > + > + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { > + ret = patch_text_nosync(patch->addr, patch->insns, patch->len); > + atomic_inc(&patch->cpu_count); > + } else { > + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) > + cpu_relax(); > + smp_mb(); > + } > + > + return ret; > +} > + > +static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len) > +{ > + struct text_poke_args patch = { > + .addr = addr, > + .insns = insns, > + .len = len, > + .cpu_count = ATOMIC_INIT(0), > + }; > + > + return stop_machine(do_text_poke, &patch, cpu_online_mask); > +} > + > +static int gen_call_or_nops(void *target, void *ip, u32 *insns) > +{ > + int i, ret; > + s64 rvoff; > + struct rv_jit_context ctx; > + > + ctx.ninsns = 0; > + ctx.insns = (u16 *)insns; > + > + if (!target) { > + for (i = 0; i < 4; i++) > + emit(rv_nop(), &ctx); > + return 0; > + } > + > + rvoff = (s64)(target - ip); > + emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx); > + ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx); > + if (ret) > + return ret; > + emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx); > + > + return 0; > + > +} > + > +static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr) > +{ > + int ret; > + u32 old_insns[4], new_insns[4]; > + > + ret = gen_call_or_nops(old_addr, ip + 4, old_insns); > + if (ret) > + return ret; > + > + ret = gen_call_or_nops(new_addr, ip + 4, new_insns); > + if (ret) > + return ret; > + > + mutex_lock(&text_mutex); > + if (memcmp(ip, old_insns, sizeof(old_insns))) { > + ret = -EFAULT; > + goto out; > + } > + > + if (memcmp(ip, new_insns, sizeof(new_insns))) > + ret = bpf_text_poke_stop_machine(ip, new_insns, > sizeof(new_insns)); I'd rather see that you added a patch_text variant to arch/riscv/kernel/patch.c (something like your bpf_text_poke_stop_machine()), and use that here. Might be other users of that as well -- Andy's ftrace patch maybe? :-) > +out: > + mutex_unlock(&text_mutex); > + return ret; > +} > + > +static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr) > +{ > + int ret; > + u32 old_insn, new_insn; > + > + old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> 1) : rv_nop(); > + new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> 1) : rv_nop(); > + > + mutex_lock(&text_mutex); > + if (memcmp(ip, &old_insn, sizeof(old_insn))) { > + ret = -EFAULT; > + goto out; > + } > + > + if (memcmp(ip, &new_insn, sizeof(new_insn))) > + ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn)); > +out: > + mutex_unlock(&text_mutex); > + return ret; > +} > + > +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, > + void *old_addr, void *new_addr) AFAIU there's nothing in the bpf_arch_text_poke() API that say that BPF_MOD_JUMP is jumps within the trampoline. That is one usage, but not the only one. In general, the jal might not have enough reach. I believe that this needs to be an auipc/jalr pair similar to BPF_MOD_CALL (w/o linked register). And again, thanks for working on the RV trampoline! Björn
On 2023/1/4 2:12, Björn Töpel wrote: > Pu Lehui <pulehui@huaweicloud.com> writes: > >> From: Pu Lehui <pulehui@huawei.com> >> >> Implement bpf_arch_text_poke for RV64. For call scenario, >> ftrace framework reserve 4 nops for RV64 kernel function >> as function entry, and use auipc+jalr instructions to call >> kernel or module functions. However, since the auipc+jalr >> call instructions is non-atomic operation, we need to use >> stop-machine to make sure instruction patching in atomic >> context. As for jump scenario, since we only jump inside >> the trampoline, a jal instruction is sufficient. > > Hmm, is that really true? More below! > >> >> Signed-off-by: Pu Lehui <pulehui@huawei.com> >> --- >> arch/riscv/net/bpf_jit.h | 5 ++ >> arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++- >> 2 files changed, 134 insertions(+), 2 deletions(-) >> >> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h >> index d926e0f7ef57..bf9802a63061 100644 >> --- a/arch/riscv/net/bpf_jit.h >> +++ b/arch/riscv/net/bpf_jit.h >> @@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ) >> return rv_i_insn(imm11_0, 0, 0, 0, 0xf); >> } >> >> +static inline u32 rv_nop(void) >> +{ >> + return rv_i_insn(0, 0, 0, 0, 0x13); >> +} >> + >> /* RVC instrutions. */ >> >> static inline u16 rvc_addi4spn(u8 rd, u32 imm10) >> diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c >> index bf4721a99a09..fa8b03c52463 100644 >> --- a/arch/riscv/net/bpf_jit_comp64.c >> +++ b/arch/riscv/net/bpf_jit_comp64.c >> @@ -8,6 +8,8 @@ >> #include <linux/bitfield.h> >> #include <linux/bpf.h> >> #include <linux/filter.h> >> +#include <linux/memory.h> >> +#include <linux/stop_machine.h> >> #include "bpf_jit.h" >> >> #define RV_REG_TCC RV_REG_A6 >> @@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) >> if (!is_tail_call) >> emit_mv(RV_REG_A0, RV_REG_A5, ctx); >> emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, >> - is_tail_call ? 4 : 0, /* skip TCC init */ >> + is_tail_call ? 
20 : 0, /* skip reserved nops and TCC init */ >> ctx); >> } >> >> @@ -615,6 +617,127 @@ static int add_exception_handler(const struct bpf_insn *insn, >> return 0; >> } >> >> +struct text_poke_args { >> + void *addr; >> + const void *insns; >> + size_t len; >> + atomic_t cpu_count; >> +}; >> + >> +static int do_text_poke(void *data) >> +{ >> + int ret = 0; >> + struct text_poke_args *patch = data; >> + >> + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { >> + ret = patch_text_nosync(patch->addr, patch->insns, patch->len); >> + atomic_inc(&patch->cpu_count); >> + } else { >> + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) >> + cpu_relax(); >> + smp_mb(); >> + } >> + >> + return ret; >> +} >> + >> +static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len) >> +{ >> + struct text_poke_args patch = { >> + .addr = addr, >> + .insns = insns, >> + .len = len, >> + .cpu_count = ATOMIC_INIT(0), >> + }; >> + >> + return stop_machine(do_text_poke, &patch, cpu_online_mask); >> +} >> + >> +static int gen_call_or_nops(void *target, void *ip, u32 *insns) >> +{ >> + int i, ret; >> + s64 rvoff; >> + struct rv_jit_context ctx; >> + >> + ctx.ninsns = 0; >> + ctx.insns = (u16 *)insns; >> + >> + if (!target) { >> + for (i = 0; i < 4; i++) >> + emit(rv_nop(), &ctx); >> + return 0; >> + } >> + >> + rvoff = (s64)(target - ip); >> + emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx); >> + ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx); >> + if (ret) >> + return ret; >> + emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx); >> + >> + return 0; >> + >> +} >> + >> +static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr) >> +{ >> + int ret; >> + u32 old_insns[4], new_insns[4]; >> + >> + ret = gen_call_or_nops(old_addr, ip + 4, old_insns); >> + if (ret) >> + return ret; >> + >> + ret = gen_call_or_nops(new_addr, ip + 4, new_insns); >> + if (ret) >> + return ret; >> + >> + mutex_lock(&text_mutex); >> + if (memcmp(ip, old_insns, sizeof(old_insns))) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + if (memcmp(ip, new_insns, sizeof(new_insns))) >> + ret = bpf_text_poke_stop_machine(ip, new_insns, >> sizeof(new_insns)); > > I'd rather see that you added a patch_text variant to > arch/riscv/kernel/patch.c (something like your > bpf_text_poke_stop_machine()), and use that here. Might be other users > of that as well -- Andy's ftrace patch maybe? :-) > Good idea. >> +out: >> + mutex_unlock(&text_mutex); >> + return ret; >> +} >> + >> +static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr) >> +{ >> + int ret; >> + u32 old_insn, new_insn; >> + >> + old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> 1) : rv_nop(); >> + new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> 1) : rv_nop(); >> + >> + mutex_lock(&text_mutex); >> + if (memcmp(ip, &old_insn, sizeof(old_insn))) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + if (memcmp(ip, &new_insn, sizeof(new_insn))) >> + ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn)); >> +out: >> + mutex_unlock(&text_mutex); >> + return ret; >> +} >> + >> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, >> + void *old_addr, void *new_addr) > > AFAIU there's nothing in the bpf_arch_text_poke() API that say that > BPF_MOD_JUMP is jumps within the trampoline. That is one usage, but not > the only one. In general, the jal might not have enough reach. > > I believe that this needs to be an auipc/jalr pair similar to > BPF_MOD_CALL (w/o linked register). 
>
The initial idea was that BPF_MOD_JUMP currently only serves
bpf_tramp_image_put, and jal, whose range is +/- 1MB, is sufficient
for the distance between im->ip_after_call and im->ip_epilogue, so I
tried to avoid the non-atomic auipc/jalr pair. But on further
consideration, this might be extended to other uses, such as tail
call optimization, so I agree with your suggestion.

>
> And again, thanks for working on the RV trampoline!
> Björn
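
[For concreteness, a rough sketch of that direction: BPF_MOD_JUMP
emitted as an auipc+jalr pair with the zero register as link, reusing
emit_jump_and_link() and rv_nop() from the patch. The helper name
below is made up for illustration and is not from the posted code.]

    /* Sketch only: a two-slot jump site, auipc+jalr with rd = zero (a
     * plain jump, no link register clobbered), or two nops when there
     * is no target. Reach is no longer limited to jal's +/- 1MB.
     */
    static int gen_jump_or_nops(void *target, void *ip, u32 *insns)
    {
    	int i;
    	s64 rvoff;
    	struct rv_jit_context ctx;

    	ctx.ninsns = 0;
    	ctx.insns = (u16 *)insns;

    	if (!target) {
    		for (i = 0; i < 2; i++)
    			emit(rv_nop(), &ctx);
    		return 0;
    	}

    	rvoff = (s64)(target - ip);
    	return emit_jump_and_link(RV_REG_ZERO, rvoff, false, &ctx);
    }
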
On 2023/1/6 9:57, Pu Lehui wrote: > > > On 2023/1/4 2:12, Björn Töpel wrote: >> Pu Lehui <pulehui@huaweicloud.com> writes: >> >>> From: Pu Lehui <pulehui@huawei.com> >>> >>> Implement bpf_arch_text_poke for RV64. For call scenario, >>> ftrace framework reserve 4 nops for RV64 kernel function >>> as function entry, and use auipc+jalr instructions to call >>> kernel or module functions. However, since the auipc+jalr >>> call instructions is non-atomic operation, we need to use >>> stop-machine to make sure instruction patching in atomic >>> context. As for jump scenario, since we only jump inside >>> the trampoline, a jal instruction is sufficient. >> >> Hmm, is that really true? More below! >> >>> >>> Signed-off-by: Pu Lehui <pulehui@huawei.com> >>> --- >>> arch/riscv/net/bpf_jit.h | 5 ++ >>> arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++- >>> 2 files changed, 134 insertions(+), 2 deletions(-) >>> >>> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h >>> index d926e0f7ef57..bf9802a63061 100644 >>> --- a/arch/riscv/net/bpf_jit.h >>> +++ b/arch/riscv/net/bpf_jit.h >>> @@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ) >>> return rv_i_insn(imm11_0, 0, 0, 0, 0xf); >>> } >>> +static inline u32 rv_nop(void) >>> +{ >>> + return rv_i_insn(0, 0, 0, 0, 0x13); >>> +} >>> + >>> /* RVC instrutions. */ >>> static inline u16 rvc_addi4spn(u8 rd, u32 imm10) >>> diff --git a/arch/riscv/net/bpf_jit_comp64.c >>> b/arch/riscv/net/bpf_jit_comp64.c >>> index bf4721a99a09..fa8b03c52463 100644 >>> --- a/arch/riscv/net/bpf_jit_comp64.c >>> +++ b/arch/riscv/net/bpf_jit_comp64.c >>> @@ -8,6 +8,8 @@ >>> #include <linux/bitfield.h> >>> #include <linux/bpf.h> >>> #include <linux/filter.h> >>> +#include <linux/memory.h> >>> +#include <linux/stop_machine.h> >>> #include "bpf_jit.h" >>> #define RV_REG_TCC RV_REG_A6 >>> @@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, >>> struct rv_jit_context *ctx) >>> if (!is_tail_call) >>> emit_mv(RV_REG_A0, RV_REG_A5, ctx); >>> emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, >>> - is_tail_call ? 4 : 0, /* skip TCC init */ >>> + is_tail_call ? 
20 : 0, /* skip reserved nops and TCC init */ >>> ctx); >>> } >>> @@ -615,6 +617,127 @@ static int add_exception_handler(const struct >>> bpf_insn *insn, >>> return 0; >>> } >>> +struct text_poke_args { >>> + void *addr; >>> + const void *insns; >>> + size_t len; >>> + atomic_t cpu_count; >>> +}; >>> + >>> +static int do_text_poke(void *data) >>> +{ >>> + int ret = 0; >>> + struct text_poke_args *patch = data; >>> + >>> + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { >>> + ret = patch_text_nosync(patch->addr, patch->insns, patch->len); >>> + atomic_inc(&patch->cpu_count); >>> + } else { >>> + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) >>> + cpu_relax(); >>> + smp_mb(); >>> + } >>> + >>> + return ret; >>> +} >>> + >>> +static int bpf_text_poke_stop_machine(void *addr, const void *insns, >>> size_t len) >>> +{ >>> + struct text_poke_args patch = { >>> + .addr = addr, >>> + .insns = insns, >>> + .len = len, >>> + .cpu_count = ATOMIC_INIT(0), >>> + }; >>> + >>> + return stop_machine(do_text_poke, &patch, cpu_online_mask); >>> +} >>> + >>> +static int gen_call_or_nops(void *target, void *ip, u32 *insns) >>> +{ >>> + int i, ret; >>> + s64 rvoff; >>> + struct rv_jit_context ctx; >>> + >>> + ctx.ninsns = 0; >>> + ctx.insns = (u16 *)insns; >>> + >>> + if (!target) { >>> + for (i = 0; i < 4; i++) >>> + emit(rv_nop(), &ctx); >>> + return 0; >>> + } >>> + >>> + rvoff = (s64)(target - ip); >>> + emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx); >>> + ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx); >>> + if (ret) >>> + return ret; >>> + emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx); >>> + >>> + return 0; >>> + >>> +} >>> + >>> +static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr) >>> +{ >>> + int ret; >>> + u32 old_insns[4], new_insns[4]; >>> + >>> + ret = gen_call_or_nops(old_addr, ip + 4, old_insns); >>> + if (ret) >>> + return ret; >>> + >>> + ret = gen_call_or_nops(new_addr, ip + 4, new_insns); >>> + if (ret) >>> + return ret; >>> + >>> + mutex_lock(&text_mutex); >>> + if (memcmp(ip, old_insns, sizeof(old_insns))) { >>> + ret = -EFAULT; >>> + goto out; >>> + } >>> + >>> + if (memcmp(ip, new_insns, sizeof(new_insns))) >>> + ret = bpf_text_poke_stop_machine(ip, new_insns, >>> sizeof(new_insns)); >> >> I'd rather see that you added a patch_text variant to >> arch/riscv/kernel/patch.c (something like your >> bpf_text_poke_stop_machine()), and use that here. Might be other users >> of that as well -- Andy's ftrace patch maybe? :-) >> > > Good idea. > >>> +out: >>> + mutex_unlock(&text_mutex); >>> + return ret; >>> +} >>> + >>> +static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr) >>> +{ >>> + int ret; >>> + u32 old_insn, new_insn; >>> + >>> + old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >>> >> 1) : rv_nop(); >>> + new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >>> >> 1) : rv_nop(); >>> + >>> + mutex_lock(&text_mutex); >>> + if (memcmp(ip, &old_insn, sizeof(old_insn))) { >>> + ret = -EFAULT; >>> + goto out; >>> + } >>> + >>> + if (memcmp(ip, &new_insn, sizeof(new_insn))) >>> + ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn)); >>> +out: >>> + mutex_unlock(&text_mutex); >>> + return ret; >>> +} >>> + >>> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, >>> + void *old_addr, void *new_addr) >> >> AFAIU there's nothing in the bpf_arch_text_poke() API that say that >> BPF_MOD_JUMP is jumps within the trampoline. That is one usage, but not >> the only one. 
In general, the jal might not have enough reach. >> >> I believe that this needs to be an auipc/jalr pair similar to >> BPF_MOD_CALL (w/o linked register). >> > > The initial idea was that currently BPF_MOD_JUMP only serves for small nit,the current riscv BPF_MOD_JUMP > bpf_tramp_image_put, and jal, which range is +/- 1MB, is sufficient for > the distance between im->ip_after_call and im->ip_epilogue, and try to > not use not-atomic auipc/jalr pair. But take deep consideration, this > might be extended to other uses, such as tailcall optimization. So agree > with your suggestion. > >> >> And again, thanks for working on the RV trampoline! >> Björn >
On 12/20/2022 10:13 AM, Pu Lehui wrote: > From: Pu Lehui <pulehui@huawei.com> > > Implement bpf_arch_text_poke for RV64. For call scenario, > ftrace framework reserve 4 nops for RV64 kernel function > as function entry, and use auipc+jalr instructions to call > kernel or module functions. However, since the auipc+jalr > call instructions is non-atomic operation, we need to use > stop-machine to make sure instruction patching in atomic > context. As for jump scenario, since we only jump inside > the trampoline, a jal instruction is sufficient. > > Signed-off-by: Pu Lehui <pulehui@huawei.com> > --- > arch/riscv/net/bpf_jit.h | 5 ++ > arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++- > 2 files changed, 134 insertions(+), 2 deletions(-) > > diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h > index d926e0f7ef57..bf9802a63061 100644 > --- a/arch/riscv/net/bpf_jit.h > +++ b/arch/riscv/net/bpf_jit.h > @@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ) > return rv_i_insn(imm11_0, 0, 0, 0, 0xf); > } > > +static inline u32 rv_nop(void) > +{ > + return rv_i_insn(0, 0, 0, 0, 0x13); > +} > + > /* RVC instrutions. */ > > static inline u16 rvc_addi4spn(u8 rd, u32 imm10) > diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c > index bf4721a99a09..fa8b03c52463 100644 > --- a/arch/riscv/net/bpf_jit_comp64.c > +++ b/arch/riscv/net/bpf_jit_comp64.c > @@ -8,6 +8,8 @@ > #include <linux/bitfield.h> > #include <linux/bpf.h> > #include <linux/filter.h> > +#include <linux/memory.h> > +#include <linux/stop_machine.h> > #include "bpf_jit.h" > > #define RV_REG_TCC RV_REG_A6 > @@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, struct rv_jit_context *ctx) > if (!is_tail_call) > emit_mv(RV_REG_A0, RV_REG_A5, ctx); > emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, > - is_tail_call ? 4 : 0, /* skip TCC init */ > + is_tail_call ? 
20 : 0, /* skip reserved nops and TCC init */ > ctx); > } > > @@ -615,6 +617,127 @@ static int add_exception_handler(const struct bpf_insn *insn, > return 0; > } > > +struct text_poke_args { > + void *addr; > + const void *insns; > + size_t len; > + atomic_t cpu_count; > +}; > + > +static int do_text_poke(void *data) > +{ > + int ret = 0; > + struct text_poke_args *patch = data; > + > + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { seems this sync is not needed, why not calling stop machine like this: stop_machine(do_text_poke, &patch, NULL); > + ret = patch_text_nosync(patch->addr, patch->insns, patch->len); > + atomic_inc(&patch->cpu_count); > + } else { > + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) > + cpu_relax(); > + smp_mb(); > + } > + > + return ret; > +} > + > +static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len) > +{ > + struct text_poke_args patch = { > + .addr = addr, > + .insns = insns, > + .len = len, > + .cpu_count = ATOMIC_INIT(0), > + }; > + > + return stop_machine(do_text_poke, &patch, cpu_online_mask); > +} > + > +static int gen_call_or_nops(void *target, void *ip, u32 *insns) > +{ > + int i, ret; > + s64 rvoff; > + struct rv_jit_context ctx; > + > + ctx.ninsns = 0; > + ctx.insns = (u16 *)insns; > + > + if (!target) { > + for (i = 0; i < 4; i++) > + emit(rv_nop(), &ctx); > + return 0; > + } > + > + rvoff = (s64)(target - ip); > + emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx); > + ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx); > + if (ret) > + return ret; > + emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx); > + > + return 0; > + > +} > + > +static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr) > +{ > + int ret; > + u32 old_insns[4], new_insns[4]; > + > + ret = gen_call_or_nops(old_addr, ip + 4, old_insns); > + if (ret) > + return ret; > + > + ret = gen_call_or_nops(new_addr, ip + 4, new_insns); > + if (ret) > + return ret; > + > + mutex_lock(&text_mutex); > + if (memcmp(ip, old_insns, sizeof(old_insns))) { > + ret = -EFAULT; > + goto out; > + } > + > + if (memcmp(ip, new_insns, sizeof(new_insns))) > + ret = bpf_text_poke_stop_machine(ip, new_insns, sizeof(new_insns)); since there are 4 instructions being replaced, it is possible that serveral old instructions were already executed before stop machine. > +out: > + mutex_unlock(&text_mutex); > + return ret; > +} > + > +static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr) > +{ > + int ret; > + u32 old_insn, new_insn; > + > + old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> 1) : rv_nop(); > + new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> 1) : rv_nop(); > + > + mutex_lock(&text_mutex); > + if (memcmp(ip, &old_insn, sizeof(old_insn))) { > + ret = -EFAULT; > + goto out; > + } > + > + if (memcmp(ip, &new_insn, sizeof(new_insn))) > + ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn)); > +out: > + mutex_unlock(&text_mutex); > + return ret; > +} > + > +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, > + void *old_addr, void *new_addr) > +{ > + if (!is_kernel_text((unsigned long)ip) && > + !is_bpf_text_address((unsigned long)ip)) > + return -ENOTSUPP; > + > + return poke_type == BPF_MOD_CALL ? 
> + bpf_text_poke_call(ip, old_addr, new_addr) : > + bpf_text_poke_jump(ip, old_addr, new_addr); > +} > + > int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, > bool extra_pass) > { > @@ -1266,7 +1389,7 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, > > void bpf_jit_build_prologue(struct rv_jit_context *ctx) > { > - int stack_adjust = 0, store_offset, bpf_stack_adjust; > + int i, stack_adjust = 0, store_offset, bpf_stack_adjust; > bool is_main_prog = ctx->prog->aux->func_idx == 0; > > bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); > @@ -1294,6 +1417,10 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx) > > store_offset = stack_adjust - 8; > > + /* reserve 4 nop insns */ > + for (i = 0; i < 4; i++) > + emit(rv_nop(), ctx); > + > /* First instruction is always setting the tail-call-counter > * (TCC) register. This instruction is skipped for tail calls. > * Force using a 4-byte (non-compressed) instruction.
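
[A sketch of the NULL-cpumask variant suggested above: when
stop_machine() is called with a NULL cpumask it parks the other online
CPUs itself and runs the callback on a single CPU, so no manual
cpu_count handshake is needed. This reuses struct text_poke_args from
the patch; the function names are illustrative only.]

    static int do_text_poke_one_cpu(void *data)
    {
    	struct text_poke_args *patch = data;

    	/* Only one CPU reaches this callback when stop_machine() gets a
    	 * NULL cpumask; the remaining CPUs are held by the stop_machine
    	 * core with interrupts disabled.
    	 */
    	return patch_text_nosync(patch->addr, patch->insns, patch->len);
    }

    static int bpf_text_poke_stop_machine(void *addr, const void *insns, size_t len)
    {
    	struct text_poke_args patch = {
    		.addr = addr,
    		.insns = insns,
    		.len = len,
    	};

    	return stop_machine(do_text_poke_one_cpu, &patch, NULL);
    }
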
On 2022/12/20 11:00, Xu Kuohai wrote: > On 12/20/2022 10:13 AM, Pu Lehui wrote: >> From: Pu Lehui <pulehui@huawei.com> >> >> Implement bpf_arch_text_poke for RV64. For call scenario, >> ftrace framework reserve 4 nops for RV64 kernel function >> as function entry, and use auipc+jalr instructions to call >> kernel or module functions. However, since the auipc+jalr >> call instructions is non-atomic operation, we need to use >> stop-machine to make sure instruction patching in atomic >> context. As for jump scenario, since we only jump inside >> the trampoline, a jal instruction is sufficient. >> >> Signed-off-by: Pu Lehui <pulehui@huawei.com> >> --- >> arch/riscv/net/bpf_jit.h | 5 ++ >> arch/riscv/net/bpf_jit_comp64.c | 131 +++++++++++++++++++++++++++++++- >> 2 files changed, 134 insertions(+), 2 deletions(-) >> >> diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h >> index d926e0f7ef57..bf9802a63061 100644 >> --- a/arch/riscv/net/bpf_jit.h >> +++ b/arch/riscv/net/bpf_jit.h >> @@ -573,6 +573,11 @@ static inline u32 rv_fence(u8 pred, u8 succ) >> return rv_i_insn(imm11_0, 0, 0, 0, 0xf); >> } >> +static inline u32 rv_nop(void) >> +{ >> + return rv_i_insn(0, 0, 0, 0, 0x13); >> +} >> + >> /* RVC instrutions. */ >> static inline u16 rvc_addi4spn(u8 rd, u32 imm10) >> diff --git a/arch/riscv/net/bpf_jit_comp64.c >> b/arch/riscv/net/bpf_jit_comp64.c >> index bf4721a99a09..fa8b03c52463 100644 >> --- a/arch/riscv/net/bpf_jit_comp64.c >> +++ b/arch/riscv/net/bpf_jit_comp64.c >> @@ -8,6 +8,8 @@ >> #include <linux/bitfield.h> >> #include <linux/bpf.h> >> #include <linux/filter.h> >> +#include <linux/memory.h> >> +#include <linux/stop_machine.h> >> #include "bpf_jit.h" >> #define RV_REG_TCC RV_REG_A6 >> @@ -238,7 +240,7 @@ static void __build_epilogue(bool is_tail_call, >> struct rv_jit_context *ctx) >> if (!is_tail_call) >> emit_mv(RV_REG_A0, RV_REG_A5, ctx); >> emit_jalr(RV_REG_ZERO, is_tail_call ? RV_REG_T3 : RV_REG_RA, >> - is_tail_call ? 4 : 0, /* skip TCC init */ >> + is_tail_call ? 20 : 0, /* skip reserved nops and TCC init */ >> ctx); >> } >> @@ -615,6 +617,127 @@ static int add_exception_handler(const struct >> bpf_insn *insn, >> return 0; >> } >> +struct text_poke_args { >> + void *addr; >> + const void *insns; >> + size_t len; >> + atomic_t cpu_count; >> +}; >> + >> +static int do_text_poke(void *data) >> +{ >> + int ret = 0; >> + struct text_poke_args *patch = data; >> + >> + if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { > > seems this sync is not needed, why not calling stop machine like this: > > stop_machine(do_text_poke, &patch, NULL); > I think both version should be ok. While the NULL version uses first cpu as master to patch_text and the others touch nmi watchdog, this version attempts to let other cpus enter cpu_relax to calm down, and let master cpu owns system resources as possible. 
>> + ret = patch_text_nosync(patch->addr, patch->insns, patch->len); >> + atomic_inc(&patch->cpu_count); >> + } else { >> + while (atomic_read(&patch->cpu_count) <= num_online_cpus()) >> + cpu_relax(); >> + smp_mb(); >> + } >> + >> + return ret; >> +} >> + >> +static int bpf_text_poke_stop_machine(void *addr, const void *insns, >> size_t len) >> +{ >> + struct text_poke_args patch = { >> + .addr = addr, >> + .insns = insns, >> + .len = len, >> + .cpu_count = ATOMIC_INIT(0), >> + }; >> + >> + return stop_machine(do_text_poke, &patch, cpu_online_mask); >> +} >> + >> +static int gen_call_or_nops(void *target, void *ip, u32 *insns) >> +{ >> + int i, ret; >> + s64 rvoff; >> + struct rv_jit_context ctx; >> + >> + ctx.ninsns = 0; >> + ctx.insns = (u16 *)insns; >> + >> + if (!target) { >> + for (i = 0; i < 4; i++) >> + emit(rv_nop(), &ctx); >> + return 0; >> + } >> + >> + rvoff = (s64)(target - ip); >> + emit(rv_sd(RV_REG_SP, -8, RV_REG_RA), &ctx); >> + ret = emit_jump_and_link(RV_REG_RA, rvoff, false, &ctx); >> + if (ret) >> + return ret; >> + emit(rv_ld(RV_REG_RA, -8, RV_REG_SP), &ctx); >> + >> + return 0; >> + >> +} >> + >> +static int bpf_text_poke_call(void *ip, void *old_addr, void *new_addr) >> +{ >> + int ret; >> + u32 old_insns[4], new_insns[4]; >> + >> + ret = gen_call_or_nops(old_addr, ip + 4, old_insns); >> + if (ret) >> + return ret; >> + >> + ret = gen_call_or_nops(new_addr, ip + 4, new_insns); >> + if (ret) >> + return ret; >> + >> + mutex_lock(&text_mutex); >> + if (memcmp(ip, old_insns, sizeof(old_insns))) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + if (memcmp(ip, new_insns, sizeof(new_insns))) >> + ret = bpf_text_poke_stop_machine(ip, new_insns, >> sizeof(new_insns)); > > since there are 4 instructions being replaced, it is possible that > serveral old instructions were already executed before stop machine. > Yes, there are unpredictable problems after stop machine in that situation. In addition, probing stop machine sub-functions may lead to loop nesting problems. And this is the current problem with riscv ftrace. But the surprise was that Andy's work [1] was about to solve these problems. We can synchronize changes in the future. [1] https://lore.kernel.org/linux-riscv/20220913094252.3555240-1-andy.chiu@sifive.com >> +out: >> + mutex_unlock(&text_mutex); >> + return ret; >> +} >> + >> +static int bpf_text_poke_jump(void *ip, void *old_addr, void *new_addr) >> +{ >> + int ret; >> + u32 old_insn, new_insn; >> + >> + old_insn = old_addr ? rv_jal(RV_REG_ZERO, (s64)(old_addr - ip) >> >> 1) : rv_nop(); >> + new_insn = new_addr ? rv_jal(RV_REG_ZERO, (s64)(new_addr - ip) >> >> 1) : rv_nop(); >> + >> + mutex_lock(&text_mutex); >> + if (memcmp(ip, &old_insn, sizeof(old_insn))) { >> + ret = -EFAULT; >> + goto out; >> + } >> + >> + if (memcmp(ip, &new_insn, sizeof(new_insn))) >> + ret = patch_text_nosync(ip, &new_insn, sizeof(new_insn)); >> +out: >> + mutex_unlock(&text_mutex); >> + return ret; >> +} >> + >> +int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type poke_type, >> + void *old_addr, void *new_addr) >> +{ >> + if (!is_kernel_text((unsigned long)ip) && >> + !is_bpf_text_address((unsigned long)ip)) >> + return -ENOTSUPP; >> + >> + return poke_type == BPF_MOD_CALL ? 
>> + bpf_text_poke_call(ip, old_addr, new_addr) : >> + bpf_text_poke_jump(ip, old_addr, new_addr); >> +} >> + >> int bpf_jit_emit_insn(const struct bpf_insn *insn, struct >> rv_jit_context *ctx, >> bool extra_pass) >> { >> @@ -1266,7 +1389,7 @@ int bpf_jit_emit_insn(const struct bpf_insn >> *insn, struct rv_jit_context *ctx, >> void bpf_jit_build_prologue(struct rv_jit_context *ctx) >> { >> - int stack_adjust = 0, store_offset, bpf_stack_adjust; >> + int i, stack_adjust = 0, store_offset, bpf_stack_adjust; >> bool is_main_prog = ctx->prog->aux->func_idx == 0; >> bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); >> @@ -1294,6 +1417,10 @@ void bpf_jit_build_prologue(struct >> rv_jit_context *ctx) >> store_offset = stack_adjust - 8; >> + /* reserve 4 nop insns */ >> + for (i = 0; i < 4; i++) >> + emit(rv_nop(), ctx); >> + >> /* First instruction is always setting the tail-call-counter >> * (TCC) register. This instruction is skipped for tail calls. >> * Force using a 4-byte (non-compressed) instruction.