bpf: Introduce 64-bit bitops kfuncs

[PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support

Posted by Leon Hwang 1 month, 1 week ago

Implement JIT inlining of the 64-bit bitops kfuncs on arm64.

bpf_clz64(), bpf_fls64(), and bpf_bitrev64() are always inlined using
mandatory ARMv8 CLZ/RBIT instructions. bpf_ctz64() and bpf_ffs64() are
inlined via RBIT + CLZ, or via the native CTZ instruction when
FEAT_CSSC is available. bpf_rol64() and bpf_ror64() are always inlined
via RORV.

bpf_popcnt64() is not inlined as the native population count instruction
requires NEON/SIMD registers, which should not be touched from BPF
programs. It therefore falls back to a regular function call.

Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
 arch/arm64/net/bpf_jit_comp.c | 123 ++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)

diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
index 7a530ea4f5ae..f03f732063d9 100644
--- a/arch/arm64/net/bpf_jit_comp.c
+++ b/arch/arm64/net/bpf_jit_comp.c
@@ -1192,6 +1192,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
 	return 0;
 }
 
+static inline u32 a64_clz64(u8 rd, u8 rn)
+{
+	/*
+	 * Arm Architecture Reference Manual for A-profile architecture
+	 * (Document number: ARM DDI 0487)
+	 *
+	 *   A64 Base Instruction Descriptions
+	 *   C6.2 Alphabetical list of A64 base instructions
+	 *
+	 *   C6.2.91 CLZ
+	 *
+	 *     Count leading zeros
+	 *
+	 *     This instruction counts the number of consecutive binary zero bits,
+	 *     starting from the most significant bit in the source register,
+	 *     and places the count in the destination register.
+	 */
+	/* CLZ Xd, Xn */
+	return 0xdac01000 | (rn << 5) | rd;
+}
+
+static inline u32 a64_ctz64(u8 rd, u8 rn)
+{
+	/*
+	 * Arm Architecture Reference Manual for A-profile architecture
+	 * (Document number: ARM DDI 0487)
+	 *
+	 *   A64 Base Instruction Descriptions
+	 *   C6.2 Alphabetical list of A64 base instructions
+	 *
+	 *   C6.2.144 CTZ
+	 *
+	 *     Count trailing zeros
+	 *
+	 *     This instruction counts the number of consecutive binary zero bits,
+	 *     starting from the least significant bit in the source register,
+	 *     and places the count in the destination register.
+	 *
+	 *     This instruction requires FEAT_CSSC.
+	 */
+	/* CTZ Xd, Xn */
+	return 0xdac01800 | (rn << 5) | rd;
+}
+
+static inline u32 a64_rbit64(u8 rd, u8 rn)
+{
+	/*
+	 * Arm Architecture Reference Manual for A-profile architecture
+	 * (Document number: ARM DDI 0487)
+	 *
+	 *   A64 Base Instruction Descriptions
+	 *   C6.2 Alphabetical list of A64 base instructions
+	 *
+	 *   C6.2.320 RBIT
+	 *
+	 *     Reverse bits
+	 *
+	 *     This instruction reverses the bit order in a register.
+	 */
+	/* RBIT Xd, Xn */
+	return 0xdac00000 | (rn << 5) | rd;
+}
+
+static inline bool boot_cpu_supports_cssc(void)
+{
+	/*
+	 * Documentation/arch/arm64/cpu-feature-registers.rst
+	 *
+	 *   ID_AA64ISAR2_EL1 - Instruction set attribute register 2
+	 *
+	 *     CSSC
+	 */
+	return cpuid_feature_extract_unsigned_field(read_sanitised_ftr_reg(SYS_ID_AA64ISAR2_EL1),
+						    ID_AA64ISAR2_EL1_CSSC_SHIFT);
+}
+
+static bool bpf_inlines_func_call(struct jit_ctx *ctx, void *func_addr)
+{
+	const u8 tmp = bpf2a64[TMP_REG_1];
+	const u8 r0 = bpf2a64[BPF_REG_0];
+	const u8 r1 = bpf2a64[BPF_REG_1];
+	const u8 r2 = bpf2a64[BPF_REG_2];
+	bool inlined = true;
+
+	if (func_addr == bpf_clz64) {
+		emit(a64_clz64(r0, r1), ctx);
+	} else if (func_addr == bpf_ctz64 || func_addr == bpf_ffs64) {
+		if (boot_cpu_supports_cssc()) {
+			emit(a64_ctz64(r0, r1), ctx);
+		} else {
+			emit(a64_rbit64(tmp, r1), ctx);
+			emit(a64_clz64(r0, tmp), ctx);
+		}
+	} else if (func_addr == bpf_fls64) {
+		emit(a64_clz64(tmp, r1), ctx);
+		emit(A64_NEG(1, tmp, tmp), ctx);
+		emit(A64_ADD_I(1, r0, tmp, 64), ctx);
+	} else if (func_addr == bpf_bitrev64) {
+		emit(a64_rbit64(r0, r1), ctx);
+	} else if (func_addr == bpf_rol64) {
+		emit(A64_NEG(1, tmp, r2), ctx);
+		emit(A64_DATA2(1, r0, r1, tmp, RORV), ctx);
+	} else if (func_addr == bpf_ror64) {
+		emit(A64_DATA2(1, r0, r1, r2, RORV), ctx);
+	} else {
+		inlined = false;
+	}
+
+	return inlined;
+}
+
+bool bpf_jit_inlines_kfunc_call(void *func_addr)
+{
+	if (func_addr == bpf_clz64 || func_addr == bpf_ctz64 ||
+	    func_addr == bpf_ffs64 || func_addr == bpf_fls64 ||
+	    func_addr == bpf_rol64 || func_addr == bpf_ror64 ||
+	    func_addr == bpf_bitrev64)
+		return true;
+	return false;
+}
+
 /* JITs an eBPF instruction.
  * Returns:
  * 0  - successfully JITed an 8-byte eBPF instruction.
@@ -1598,6 +1719,8 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx,
 					    &func_addr, &func_addr_fixed);
 		if (ret < 0)
 			return ret;
+		if (bpf_inlines_func_call(ctx, (void *) func_addr))
+			break;
 		emit_call(func_addr, ctx);
 		/*
 		 * Call to arch_bpf_timed_may_goto() is emitted by the
-- 
2.52.0

Re: [PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support

Posted by Puranjay Mohan 1 month, 1 week ago

Leon Hwang <leon.hwang@linux.dev> writes:

> Implement JIT inlining of the 64-bit bitops kfuncs on arm64.
>
> bpf_clz64(), bpf_ffs64(), bpf_fls64(), and bpf_bitrev64() are always
> inlined using mandatory ARMv8 CLZ/RBIT instructions. bpf_ctz64() is
> inlined via RBIT + CLZ, or via the native CTZ instruction when
> FEAT_CSSC is available. bpf_rol64() and bpf_ror64() are always inlined
> via RORV.
>
> bpf_popcnt64() is not inlined as the native population count instruction
> requires NEON/SIMD registers, which should not be touched from BPF
> programs. It therefore falls back to a regular function call.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 123 ++++++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 7a530ea4f5ae..f03f732063d9 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1192,6 +1192,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
>  	return 0;
>  }
>  
> +static inline u32 a64_clz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.91 CLZ
> +	 *
> +	 *     Count leading zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the most significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 */
> +	/* CLZ Xd, Xn */
> +	return 0xdac01000 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_ctz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.144 CTZ
> +	 *
> +	 *     Count trailing zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the least significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 *
> +	 *     This instruction requires FEAT_CSSC.
> +	 */
> +	/* CTZ Xd, Xn */
> +	return 0xdac01800 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_rbit64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.320 RBIT
> +	 *
> +	 *     Reverse bits
> +	 *
> +	 *     This instruction reverses the bit order in a register.
> +	 */
> +	/* RBIT Xd, Xn */
> +	return 0xdac00000 | (rn << 5) | rd;
> +}

I don't think adding the above three functions is the best way to JIT these
instructions; do it like the other data1 and data2 instructions and add
them to the generic framework, as the following patch (untested) does:

-- >8 --

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 18c7811774d3..b2696af0b817 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -221,6 +221,9 @@ enum aarch64_insn_data1_type {
 	AARCH64_INSN_DATA1_REVERSE_16,
 	AARCH64_INSN_DATA1_REVERSE_32,
 	AARCH64_INSN_DATA1_REVERSE_64,
+	AARCH64_INSN_DATA1_RBIT,
+	AARCH64_INSN_DATA1_CLZ,
+	AARCH64_INSN_DATA1_CTZ,
 };

 enum aarch64_insn_data2_type {
@@ -389,6 +392,9 @@ __AARCH64_INSN_FUNCS(rorv,	0x7FE0FC00, 0x1AC02C00)
 __AARCH64_INSN_FUNCS(rev16,	0x7FFFFC00, 0x5AC00400)
 __AARCH64_INSN_FUNCS(rev32,	0x7FFFFC00, 0x5AC00800)
 __AARCH64_INSN_FUNCS(rev64,	0x7FFFFC00, 0x5AC00C00)
+__AARCH64_INSN_FUNCS(rbit,	0x7FFFFC00, 0x5AC00000)
+__AARCH64_INSN_FUNCS(clz,	0x7FFFFC00, 0x5AC01000)
+__AARCH64_INSN_FUNCS(ctz,	0x7FFFFC00, 0x5AC01800)
 __AARCH64_INSN_FUNCS(and,	0x7F200000, 0x0A000000)
 __AARCH64_INSN_FUNCS(bic,	0x7F200000, 0x0A200000)
 __AARCH64_INSN_FUNCS(orr,	0x7F200000, 0x2A000000)
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 4e298baddc2e..2229ab596cda 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -1008,6 +1008,15 @@ u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
 		}
 		insn = aarch64_insn_get_rev64_value();
 		break;
+	case AARCH64_INSN_DATA1_CLZ:
+		insn = aarch64_insn_get_clz_value();
+		break;
+	case AARCH64_INSN_DATA1_RBIT:
+		insn = aarch64_insn_get_rbit_value();
+		break;
+	case AARCH64_INSN_DATA1_CTZ:
+		insn = aarch64_insn_get_ctz_value();
+		break;
 	default:
 		pr_err("%s: unknown data1 encoding %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index bbea4f36f9f2..af806c39dadb 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -248,6 +248,12 @@
 #define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
 #define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
 #define A64_REV64(Rd, Rn)     A64_DATA1(1, Rd, Rn, REVERSE_64)
+/* Rd = RBIT(Rn) */
+#define A64_RBIT(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, RBIT)
+/* Rd = CLZ(Rn) */
+#define A64_CLZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CLZ)
+/* Rd = CTZ(Rn) */
+#define A64_CTZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CTZ)

 /* Data-processing (2 source) */
 /* Rd = Rn OP Rm */

-- 8< --

Thanks,
Puranjay

Re: [PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support

Posted by Leon Hwang 1 month, 1 week ago


On 2026/2/19 23:25, Puranjay Mohan wrote:
> Leon Hwang <leon.hwang@linux.dev> writes:
> 
>> Implement JIT inlining of the 64-bit bitops kfuncs on arm64.
>>
>> bpf_clz64(), bpf_ffs64(), bpf_fls64(), and bpf_bitrev64() are always
>> inlined using mandatory ARMv8 CLZ/RBIT instructions. bpf_ctz64() is
>> inlined via RBIT + CLZ, or via the native CTZ instruction when
>> FEAT_CSSC is available. bpf_rol64() and bpf_ror64() are always inlined
>> via RORV.
>>
>> bpf_popcnt64() is not inlined as the native population count instruction
>> requires NEON/SIMD registers, which should not be touched from BPF
>> programs. It therefore falls back to a regular function call.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
>> ---
>>  arch/arm64/net/bpf_jit_comp.c | 123 ++++++++++++++++++++++++++++++++++
>>  1 file changed, 123 insertions(+)
>>
>> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
>> index 7a530ea4f5ae..f03f732063d9 100644
>> --- a/arch/arm64/net/bpf_jit_comp.c
>> +++ b/arch/arm64/net/bpf_jit_comp.c
>> @@ -1192,6 +1192,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
>>  	return 0;
>>  }
>>  
>> +static inline u32 a64_clz64(u8 rd, u8 rn)
>> +{
>> +	/*
>> +	 * Arm Architecture Reference Manual for A-profile architecture
>> +	 * (Document number: ARM DDI 0487)
>> +	 *
>> +	 *   A64 Base Instruction Descriptions
>> +	 *   C6.2 Alphabetical list of A64 base instructions
>> +	 *
>> +	 *   C6.2.91 CLZ
>> +	 *
>> +	 *     Count leading zeros
>> +	 *
>> +	 *     This instruction counts the number of consecutive binary zero bits,
>> +	 *     starting from the most significant bit in the source register,
>> +	 *     and places the count in the destination register.
>> +	 */
>> +	/* CLZ Xd, Xn */
>> +	return 0xdac01000 | (rn << 5) | rd;
>> +}
>> +
>> +static inline u32 a64_ctz64(u8 rd, u8 rn)
>> +{
>> +	/*
>> +	 * Arm Architecture Reference Manual for A-profile architecture
>> +	 * (Document number: ARM DDI 0487)
>> +	 *
>> +	 *   A64 Base Instruction Descriptions
>> +	 *   C6.2 Alphabetical list of A64 base instructions
>> +	 *
>> +	 *   C6.2.144 CTZ
>> +	 *
>> +	 *     Count trailing zeros
>> +	 *
>> +	 *     This instruction counts the number of consecutive binary zero bits,
>> +	 *     starting from the least significant bit in the source register,
>> +	 *     and places the count in the destination register.
>> +	 *
>> +	 *     This instruction requires FEAT_CSSC.
>> +	 */
>> +	/* CTZ Xd, Xn */
>> +	return 0xdac01800 | (rn << 5) | rd;
>> +}
>> +
>> +static inline u32 a64_rbit64(u8 rd, u8 rn)
>> +{
>> +	/*
>> +	 * Arm Architecture Reference Manual for A-profile architecture
>> +	 * (Document number: ARM DDI 0487)
>> +	 *
>> +	 *   A64 Base Instruction Descriptions
>> +	 *   C6.2 Alphabetical list of A64 base instructions
>> +	 *
>> +	 *   C6.2.320 RBIT
>> +	 *
>> +	 *     Reverse bits
>> +	 *
>> +	 *     This instruction reverses the bit order in a register.
>> +	 */
>> +	/* RBIT Xd, Xn */
>> +	return 0xdac00000 | (rn << 5) | rd;
>> +}
> 
> I don't think adding the above three functions is the best way to JIT these
> instructions; do it like the other data1 and data2 instructions and add
> them to the generic framework, as the following patch (untested) does:
> 
> -- >8 --
> 
> diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
> index 18c7811774d3..b2696af0b817 100644
> --- a/arch/arm64/include/asm/insn.h
> +++ b/arch/arm64/include/asm/insn.h
> @@ -221,6 +221,9 @@ enum aarch64_insn_data1_type {
>  	AARCH64_INSN_DATA1_REVERSE_16,
>  	AARCH64_INSN_DATA1_REVERSE_32,
>  	AARCH64_INSN_DATA1_REVERSE_64,
> +	AARCH64_INSN_DATA1_RBIT,
> +	AARCH64_INSN_DATA1_CLZ,
> +	AARCH64_INSN_DATA1_CTZ,
>  };
> 
>  enum aarch64_insn_data2_type {
> @@ -389,6 +392,9 @@ __AARCH64_INSN_FUNCS(rorv,	0x7FE0FC00, 0x1AC02C00)
>  __AARCH64_INSN_FUNCS(rev16,	0x7FFFFC00, 0x5AC00400)
>  __AARCH64_INSN_FUNCS(rev32,	0x7FFFFC00, 0x5AC00800)
>  __AARCH64_INSN_FUNCS(rev64,	0x7FFFFC00, 0x5AC00C00)
> +__AARCH64_INSN_FUNCS(rbit,	0x7FFFFC00, 0x5AC00000)
> +__AARCH64_INSN_FUNCS(clz,	0x7FFFFC00, 0x5AC01000)
> +__AARCH64_INSN_FUNCS(ctz,	0x7FFFFC00, 0x5AC01800)
>  __AARCH64_INSN_FUNCS(and,	0x7F200000, 0x0A000000)
>  __AARCH64_INSN_FUNCS(bic,	0x7F200000, 0x0A200000)
>  __AARCH64_INSN_FUNCS(orr,	0x7F200000, 0x2A000000)
> diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
> index 4e298baddc2e..2229ab596cda 100644
> --- a/arch/arm64/lib/insn.c
> +++ b/arch/arm64/lib/insn.c
> @@ -1008,6 +1008,15 @@ u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
>  		}
>  		insn = aarch64_insn_get_rev64_value();
>  		break;
> +	case AARCH64_INSN_DATA1_CLZ:
> +		insn = aarch64_insn_get_clz_value();
> +		break;
> +	case AARCH64_INSN_DATA1_RBIT:
> +		insn = aarch64_insn_get_rbit_value();
> +		break;
> +	case AARCH64_INSN_DATA1_CTZ:
> +		insn = aarch64_insn_get_ctz_value();
> +		break;
>  	default:
>  		pr_err("%s: unknown data1 encoding %d\n", __func__, type);
>  		return AARCH64_BREAK_FAULT;
> diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
> index bbea4f36f9f2..af806c39dadb 100644
> --- a/arch/arm64/net/bpf_jit.h
> +++ b/arch/arm64/net/bpf_jit.h
> @@ -248,6 +248,12 @@
>  #define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
>  #define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
>  #define A64_REV64(Rd, Rn)     A64_DATA1(1, Rd, Rn, REVERSE_64)
> +/* Rd = RBIT(Rn) */
> +#define A64_RBIT(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, RBIT)
> +/* Rd = CLZ(Rn) */
> +#define A64_CLZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CLZ)
> +/* Rd = CTZ(Rn) */
> +#define A64_CTZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CTZ)
> 
>  /* Data-processing (2 source) */
>  /* Rd = Rn OP Rm */
> 
> -- 8< --
> 
> Thanks,
> Puranjay

Ack.

I'll do it in the next revision.

Thanks,
Leon

Re: [PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support

Posted by Puranjay Mohan 1 month, 1 week ago

Leon Hwang <leon.hwang@linux.dev> writes:

> Implement JIT inlining of the 64-bit bitops kfuncs on arm64.
>
> bpf_clz64(), bpf_ffs64(), bpf_fls64(), and bpf_bitrev64() are always
> inlined using mandatory ARMv8 CLZ/RBIT instructions. bpf_ctz64() is
> inlined via RBIT + CLZ, or via the native CTZ instruction when
> FEAT_CSSC is available. bpf_rol64() and bpf_ror64() are always inlined
> via RORV.
>
> bpf_popcnt64() is not inlined as the native population count instruction
> requires NEON/SIMD registers, which should not be touched from BPF
> programs. It therefore falls back to a regular function call.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 123 ++++++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 7a530ea4f5ae..f03f732063d9 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1192,6 +1192,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
>  	return 0;
>  }
>  
> +static inline u32 a64_clz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.91 CLZ
> +	 *
> +	 *     Count leading zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the most significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 */
> +	/* CLZ Xd, Xn */
> +	return 0xdac01000 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_ctz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.144 CTZ
> +	 *
> +	 *     Count trailing zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the least significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 *
> +	 *     This instruction requires FEAT_CSSC.
> +	 */
> +	/* CTZ Xd, Xn */
> +	return 0xdac01800 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_rbit64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.320 RBIT
> +	 *
> +	 *     Reverse bits
> +	 *
> +	 *     This instruction reverses the bit order in a register.
> +	 */
> +	/* RBIT Xd, Xn */
> +	return 0xdac00000 | (rn << 5) | rd;
> +}

I don't think adding the above three functions is the best way to JIT these
instructions; do it like the other data1 and data2 instructions and add
them to the generic framework, as the following patch (untested) does:

-- >8 --

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 18c7811774d3..b2696af0b817 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -221,6 +221,9 @@ enum aarch64_insn_data1_type {
 	AARCH64_INSN_DATA1_REVERSE_16,
 	AARCH64_INSN_DATA1_REVERSE_32,
 	AARCH64_INSN_DATA1_REVERSE_64,
+	AARCH64_INSN_DATA1_RBIT,
+	AARCH64_INSN_DATA1_CLZ,
+	AARCH64_INSN_DATA1_CTZ,
 };

 enum aarch64_insn_data2_type {
@@ -389,6 +392,9 @@ __AARCH64_INSN_FUNCS(rorv,	0x7FE0FC00, 0x1AC02C00)
 __AARCH64_INSN_FUNCS(rev16,	0x7FFFFC00, 0x5AC00400)
 __AARCH64_INSN_FUNCS(rev32,	0x7FFFFC00, 0x5AC00800)
 __AARCH64_INSN_FUNCS(rev64,	0x7FFFFC00, 0x5AC00C00)
+__AARCH64_INSN_FUNCS(rbit,	0x7FFFFC00, 0x5AC00000)
+__AARCH64_INSN_FUNCS(clz,	0x7FFFFC00, 0x5AC01000)
+__AARCH64_INSN_FUNCS(ctz,	0x7FFFFC00, 0x5AC01800)
 __AARCH64_INSN_FUNCS(and,	0x7F200000, 0x0A000000)
 __AARCH64_INSN_FUNCS(bic,	0x7F200000, 0x0A200000)
 __AARCH64_INSN_FUNCS(orr,	0x7F200000, 0x2A000000)
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 4e298baddc2e..2229ab596cda 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -1008,6 +1008,15 @@ u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
 		}
 		insn = aarch64_insn_get_rev64_value();
 		break;
+	case AARCH64_INSN_DATA1_CLZ:
+		insn = aarch64_insn_get_clz_value();
+		break;
+	case AARCH64_INSN_DATA1_RBIT:
+		insn = aarch64_insn_get_rbit_value();
+		break;
+	case AARCH64_INSN_DATA1_CTZ:
+		insn = aarch64_insn_get_ctz_value();
+		break;
 	default:
 		pr_err("%s: unknown data1 encoding %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index bbea4f36f9f2..af806c39dadb 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -248,6 +248,12 @@
 #define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
 #define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
 #define A64_REV64(Rd, Rn)     A64_DATA1(1, Rd, Rn, REVERSE_64)
+/* Rd = RBIT(Rn) */
+#define A64_RBIT(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, RBIT)
+/* Rd = CLZ(Rn) */
+#define A64_CLZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CLZ)
+/* Rd = CTZ(Rn) */
+#define A64_CTZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CTZ)

 /* Data-processing (2 source) */
 /* Rd = Rn OP Rm */

-- 8< --

Thanks,
Puranjay

Re: [PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support

Posted by Puranjay Mohan 1 month, 1 week ago

> Implement JIT inlining of the 64-bit bitops kfuncs on arm64.
>
> bpf_clz64(), bpf_ffs64(), bpf_fls64(), and bpf_bitrev64() are always
> inlined using mandatory ARMv8 CLZ/RBIT instructions. bpf_ctz64() is
> inlined via RBIT + CLZ, or via the native CTZ instruction when
> FEAT_CSSC is available. bpf_rol64() and bpf_ror64() are always inlined
> via RORV.
>
> bpf_popcnt64() is not inlined as the native population count instruction
> requires NEON/SIMD registers, which should not be touched from BPF
> programs. It therefore falls back to a regular function call.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
>  arch/arm64/net/bpf_jit_comp.c | 123 ++++++++++++++++++++++++++++++++++
>  1 file changed, 123 insertions(+)
>
> diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c
> index 7a530ea4f5ae..f03f732063d9 100644
> --- a/arch/arm64/net/bpf_jit_comp.c
> +++ b/arch/arm64/net/bpf_jit_comp.c
> @@ -1192,6 +1192,127 @@ static int add_exception_handler(const struct bpf_insn *insn,
>  	return 0;
>  }
>  
> +static inline u32 a64_clz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.91 CLZ
> +	 *
> +	 *     Count leading zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the most significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 */
> +	/* CLZ Xd, Xn */
> +	return 0xdac01000 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_ctz64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.144 CTZ
> +	 *
> +	 *     Count trailing zeros
> +	 *
> +	 *     This instruction counts the number of consecutive binary zero bits,
> +	 *     starting from the least significant bit in the source register,
> +	 *     and places the count in the destination register.
> +	 *
> +	 *     This instruction requires FEAT_CSSC.
> +	 */
> +	/* CTZ Xd, Xn */
> +	return 0xdac01800 | (rn << 5) | rd;
> +}
> +
> +static inline u32 a64_rbit64(u8 rd, u8 rn)
> +{
> +	/*
> +	 * Arm Architecture Reference Manual for A-profile architecture
> +	 * (Document number: ARM DDI 0487)
> +	 *
> +	 *   A64 Base Instruction Descriptions
> +	 *   C6.2 Alphabetical list of A64 base instructions
> +	 *
> +	 *   C6.2.320 RBIT
> +	 *
> +	 *     Reverse bits
> +	 *
> +	 *     This instruction reverses the bit order in a register.
> +	 */
> +	/* RBIT Xd, Xn */
> +	return 0xdac00000 | (rn << 5) | rd;
> +}

Instead of hardcoding the instructions with the above functions, do it the
proper way, with something like the following patch (not compile-tested):

-- >8 --

diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h
index 18c7811774d3..b2696af0b817 100644
--- a/arch/arm64/include/asm/insn.h
+++ b/arch/arm64/include/asm/insn.h
@@ -221,6 +221,9 @@ enum aarch64_insn_data1_type {
 	AARCH64_INSN_DATA1_REVERSE_16,
 	AARCH64_INSN_DATA1_REVERSE_32,
 	AARCH64_INSN_DATA1_REVERSE_64,
+	AARCH64_INSN_DATA1_RBIT,
+	AARCH64_INSN_DATA1_CLZ,
+	AARCH64_INSN_DATA1_CTZ,
 };

 enum aarch64_insn_data2_type {
@@ -389,6 +392,9 @@ __AARCH64_INSN_FUNCS(rorv,	0x7FE0FC00, 0x1AC02C00)
 __AARCH64_INSN_FUNCS(rev16,	0x7FFFFC00, 0x5AC00400)
 __AARCH64_INSN_FUNCS(rev32,	0x7FFFFC00, 0x5AC00800)
 __AARCH64_INSN_FUNCS(rev64,	0x7FFFFC00, 0x5AC00C00)
+__AARCH64_INSN_FUNCS(rbit,	0x7FFFFC00, 0x5AC00000)
+__AARCH64_INSN_FUNCS(clz,      0x7FFFFC00, 0x5AC01000)
+__AARCH64_INSN_FUNCS(ctz,      0x7FFFFC00, 0x5AC01800)
 __AARCH64_INSN_FUNCS(and,      0x7F200000, 0x0A000000)
 __AARCH64_INSN_FUNCS(bic,      0x7F200000, 0x0A200000)
 __AARCH64_INSN_FUNCS(orr,      0x7F200000, 0x2A000000)
diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c
index 4e298baddc2e..2229ab596cda 100644
--- a/arch/arm64/lib/insn.c
+++ b/arch/arm64/lib/insn.c
@@ -1008,6 +1008,15 @@ u32 aarch64_insn_gen_data1(enum aarch64_insn_register dst,
 		}
 		insn = aarch64_insn_get_rev64_value();
 		break;
+	case AARCH64_INSN_DATA1_CLZ:
+		insn = aarch64_insn_get_clz_value();
+		break;
+	case AARCH64_INSN_DATA1_RBIT:
+		insn = aarch64_insn_get_rbit_value();
+		break;
+	case AARCH64_INSN_DATA1_CTZ:
+		insn = aarch64_insn_get_ctz_value();
+		break;
 	default:
 		pr_err("%s: unknown data1 encoding %d\n", __func__, type);
 		return AARCH64_BREAK_FAULT;
diff --git a/arch/arm64/net/bpf_jit.h b/arch/arm64/net/bpf_jit.h
index bbea4f36f9f2..af806c39dadb 100644
--- a/arch/arm64/net/bpf_jit.h
+++ b/arch/arm64/net/bpf_jit.h
@@ -248,6 +248,12 @@
 #define A64_REV16(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_16)
 #define A64_REV32(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, REVERSE_32)
 #define A64_REV64(Rd, Rn)     A64_DATA1(1, Rd, Rn, REVERSE_64)
+/* Rd = RBIT(Rn) */
+#define A64_RBIT(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, RBIT)
+/* Rd = CLZ(Rn) */
+#define A64_CLZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CLZ)
+/* Rd = CTZ(Rn) */
+#define A64_CTZ(sf, Rd, Rn) A64_DATA1(sf, Rd, Rn, CTZ)

 /* Data-processing (2 source) */
 /* Rd = Rn OP Rm */

-- 8< --

Thanks,
Puranjay

[PATCH bpf-next v2 1/6] bpf: Introduce 64-bit bitops kfuncs
[PATCH bpf-next v2 2/6] bpf, x86: Add 64-bit bitops kfuncs support for x86_64
[PATCH bpf-next v2 3/6] bpf, arm64: Add 64-bit bitops kfuncs support
[PATCH bpf-next v2 4/6] selftests/bpf: Add tests for 64-bit bitops kfuncs
[PATCH bpf-next v2 5/6] selftests/bpf: Add __cpu_feature annotation for CPU-feature-gated tests
[PATCH bpf-next v2 6/6] selftests/bpf: Add JIT disassembly tests for 64-bit bitops kfuncs