Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
bpf_ctz64() and bpf_ffs64() are supported when the CPU has
X86_FEATURE_BMI1 (TZCNT).
bpf_clz64() and bpf_fls64() are supported when the CPU has
X86_FEATURE_ABM (LZCNT).
bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
instruction, so it falls back to a regular function call.
Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
---
arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
1 file changed, 141 insertions(+)
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 070ba80e39d7..193e1e2d7aa8 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -19,6 +19,7 @@
#include <asm/text-patching.h>
#include <asm/unwind.h>
#include <asm/cfi.h>
+#include <asm/cpufeatures.h>
static bool all_callee_regs_used[4] = {true, true, true, true};
@@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
*pprog = prog;
}
+/*
+ * bpf_inlines_func_call - try to JIT-inline a call to a 64-bit bitops kfunc.
+ * @pprog: in/out cursor into the JIT output buffer; advanced past any
+ *         instructions emitted here.
+ * @func:  kernel address of the callee being considered for inlining.
+ *
+ * Returns true when @func is a recognized bitops kfunc AND the CPU supports
+ * the required instruction; the equivalent native sequence is emitted in
+ * place of the call. Returns false otherwise, emitting nothing, so the
+ * caller falls back to a regular function call.
+ *
+ * All emitted sequences read the first argument from rdi and leave the
+ * result in rax (see the per-branch encodings below).
+ *
+ * NOTE(review): this function references bpf_clz64() et al. directly, so
+ * those symbols must be defined whenever this file is built — the kernel
+ * test robot hit undefined references with CONFIG_BPF_SYSCALL=n.
+ */
+static bool bpf_inlines_func_call(u8 **pprog, void *func)
+{
+ bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
+ bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
+ bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
+ bool inlined = true;
+ u8 *prog = *pprog;
+
+ /*
+ * x86 Bit manipulation instruction set
+ * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
+ */
+
+ if (func == bpf_clz64 && has_abm) {
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
+ *
+ * LZCNT - Count the Number of Leading Zero Bits
+ *
+ * Opcode/Instruction
+ * F3 REX.W 0F BD /r
+ * LZCNT r64, r/m64
+ *
+ * Op/En
+ * RVM
+ *
+ * 64/32-bit Mode
+ * V/N.E.
+ *
+ * CPUID Feature Flag
+ * LZCNT
+ *
+ * Description
+ * Count the number of leading zero bits in r/m64, return
+ * result in r64.
+ */
+ /* emit: x ? 64 - fls64(x) : 64 */
+ /* lzcnt rax, rdi */
+ EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
+ } else if (func == bpf_ctz64 && has_bmi1) {
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
+ *
+ * TZCNT - Count the Number of Trailing Zero Bits
+ *
+ * Opcode/Instruction
+ * F3 REX.W 0F BC /r
+ * TZCNT r64, r/m64
+ *
+ * Op/En
+ * RVM
+ *
+ * 64/32-bit Mode
+ * V/N.E.
+ *
+ * CPUID Feature Flag
+ * BMI1
+ *
+ * Description
+ * Count the number of trailing zero bits in r/m64, return
+ * result in r64.
+ */
+ /* emit: x ? __ffs64(x) : 64 */
+ /* tzcnt rax, rdi */
+ EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
+ } else if (func == bpf_ffs64 && has_bmi1) {
+ /* emit: __ffs64(x); x == 0 has been handled in verifier */
+ /* tzcnt rax, rdi */
+ EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
+ } else if (func == bpf_fls64 && has_abm) {
+ /* emit: fls64(x) */
+ /* lzcnt rax, rdi */
+ EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
+ /* fls64(x) = 64 - lzcnt(x); lzcnt(0) = 64, so fls64(0) = 0 as required */
+ EMIT3(0x48, 0xF7, 0xD8); /* neg rax */
+ EMIT4(0x48, 0x83, 0xC0, 0x40); /* add rax, 64 */
+ } else if (func == bpf_popcnt64 && has_popcnt) {
+ /*
+ * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
+ *
+ * POPCNT - Return the Count of Number of Bits Set to 1
+ *
+ * Opcode/Instruction
+ * F3 REX.W 0F B8 /r
+ * POPCNT r64, r/m64
+ *
+ * Op/En
+ * RM
+ *
+ * 64 Mode
+ * Valid
+ *
+ * Compat/Leg Mode
+ * N.E.
+ *
+ * Description
+ * POPCNT on r/m64
+ */
+ /* popcnt rax, rdi */
+ EMIT5(0xF3, 0x48, 0x0F, 0xB8, 0xC7);
+ } else if (func == bpf_rol64) {
+ /*
+ * The variable rotate uses cl as the shift count, so rcx is
+ * saved/restored around the sequence — NOTE(review): presumably
+ * rcx holds a live BPF register at this point; confirm against
+ * the JIT's BPF-to-x86 register mapping.
+ */
+ EMIT1(0x51); /* push rcx */
+ /* emit: rol64(x, s) */
+ EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
+ EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
+ EMIT3(0x48, 0xD3, 0xC0); /* rol rax, cl */
+ EMIT1(0x59); /* pop rcx */
+ } else if (func == bpf_ror64) {
+ /* Same save/restore of rcx as the bpf_rol64 sequence above. */
+ EMIT1(0x51); /* push rcx */
+ /* emit: ror64(x, s) */
+ EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
+ EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
+ EMIT3(0x48, 0xD3, 0xC8); /* ror rax, cl */
+ EMIT1(0x59); /* pop rcx */
+ } else {
+ /* Not a recognized kfunc, or the CPU lacks the needed feature. */
+ inlined = false;
+ }
+
+ *pprog = prog;
+ return inlined;
+}
+
#define INSN_SZ_DIFF (((addrs[i] - addrs[i - 1]) - (prog - temp)))
#define __LOAD_TCC_PTR(off) \
@@ -2452,6 +2574,8 @@ st: if (is_imm8(insn->off))
u8 *ip = image + addrs[i - 1];
func = (u8 *) __bpf_call_base + imm32;
+ if (bpf_inlines_func_call(&prog, func))
+ break;
if (src_reg == BPF_PSEUDO_CALL && tail_call_reachable) {
LOAD_TAIL_CALL_CNT_PTR(stack_depth);
ip += 7;
@@ -4117,3 +4241,20 @@ bool bpf_jit_supports_fsession(void)
{
return true;
}
+
+/*
+ * bpf_jit_inlines_kfunc_call - report whether this JIT will inline a call
+ * to the kfunc at @func_addr.
+ *
+ * The per-kfunc CPU-feature conditions here must stay in sync with
+ * bpf_inlines_func_call(), which performs the actual emission; a mismatch
+ * would make the two disagree on whether a call gets inlined.
+ */
+bool bpf_jit_inlines_kfunc_call(void *func_addr)
+{
+ if (func_addr == bpf_ctz64 || func_addr == bpf_ffs64)
+ return boot_cpu_has(X86_FEATURE_BMI1);
+
+ if (func_addr == bpf_clz64 || func_addr == bpf_fls64)
+ return boot_cpu_has(X86_FEATURE_ABM);
+
+ if (func_addr == bpf_popcnt64)
+ return boot_cpu_has(X86_FEATURE_POPCNT);
+
+ /* ROL/ROR are base-ISA instructions, so rotates are always inlined. */
+ if (func_addr == bpf_rol64 || func_addr == bpf_ror64)
+ return true;
+
+ return false;
+}
--
2.52.0
Hi Leon, kernel test robot noticed the following build errors: [auto build test ERROR on bpf-next/master] url: https://github.com/intel-lab-lkp/linux/commits/Leon-Hwang/bpf-Introduce-64-bit-bitops-kfuncs/20260219-223550 base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master patch link: https://lore.kernel.org/r/20260219142933.13904-3-leon.hwang%40linux.dev patch subject: [PATCH bpf-next v2 2/6] bpf, x86: Add 64-bit bitops kfuncs support for x86_64 config: x86_64-randconfig-012-20260220 (https://download.01.org/0day-ci/archive/20260220/202602201931.LBZGbpvs-lkp@intel.com/config) compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261) rustc: rustc 1.88.0 (6b00bc388 2025-06-23) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260220/202602201931.LBZGbpvs-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202602201931.LBZGbpvs-lkp@intel.com/ All errors (new ones prefixed by >>): >> ld.lld: error: undefined symbol: bpf_clz64 >>> referenced by bpf_jit_comp.c:1621 (arch/x86/net/bpf_jit_comp.c:1621) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4250 (arch/x86/net/bpf_jit_comp.c:4250) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_ctz64 >>> referenced by bpf_jit_comp.c:1647 (arch/x86/net/bpf_jit_comp.c:1647) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4247 (arch/x86/net/bpf_jit_comp.c:4247) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_ffs64 >>> referenced by bpf_jit_comp.c:1673 (arch/x86/net/bpf_jit_comp.c:1673) >>> 
arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4247 (arch/x86/net/bpf_jit_comp.c:4247) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_fls64 >>> referenced by bpf_jit_comp.c:1677 (arch/x86/net/bpf_jit_comp.c:1677) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4250 (arch/x86/net/bpf_jit_comp.c:4250) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_popcnt64 >>> referenced by bpf_jit_comp.c:1683 (arch/x86/net/bpf_jit_comp.c:1683) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4253 (arch/x86/net/bpf_jit_comp.c:4253) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_rol64 >>> referenced by bpf_jit_comp.c:1707 (arch/x86/net/bpf_jit_comp.c:1707) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4256 (arch/x86/net/bpf_jit_comp.c:4256) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- >> ld.lld: error: undefined symbol: bpf_ror64 >>> referenced by bpf_jit_comp.c:1714 (arch/x86/net/bpf_jit_comp.c:1714) >>> arch/x86/net/bpf_jit_comp.o:(do_jit) in archive vmlinux.a >>> referenced by bpf_jit_comp.c:4256 (arch/x86/net/bpf_jit_comp.c:4256) >>> arch/x86/net/bpf_jit_comp.o:(bpf_jit_inlines_kfunc_call) in archive vmlinux.a -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Hi Leon,
kernel test robot noticed the following build errors:
[auto build test ERROR on bpf-next/master]
url: https://github.com/intel-lab-lkp/linux/commits/Leon-Hwang/bpf-Introduce-64-bit-bitops-kfuncs/20260219-223550
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
patch link: https://lore.kernel.org/r/20260219142933.13904-3-leon.hwang%40linux.dev
patch subject: [PATCH bpf-next v2 2/6] bpf, x86: Add 64-bit bitops kfuncs support for x86_64
config: x86_64-randconfig-073-20260220 (https://download.01.org/0day-ci/archive/20260220/202602200536.JWzGHAc6-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260220/202602200536.JWzGHAc6-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602200536.JWzGHAc6-lkp@intel.com/
All errors (new ones prefixed by >>):
ld: arch/x86/net/bpf_jit_comp.o: in function `bpf_inlines_func_call':
>> arch/x86/net/bpf_jit_comp.c:1621:(.text+0xe70b): undefined reference to `bpf_clz64'
>> ld: arch/x86/net/bpf_jit_comp.c:1647:(.text+0xe718): undefined reference to `bpf_ctz64'
>> ld: arch/x86/net/bpf_jit_comp.c:1673:(.text+0xe725): undefined reference to `bpf_ffs64'
>> ld: arch/x86/net/bpf_jit_comp.c:1677:(.text+0xe732): undefined reference to `bpf_fls64'
>> ld: arch/x86/net/bpf_jit_comp.c:1683:(.text+0xe743): undefined reference to `bpf_popcnt64'
>> ld: arch/x86/net/bpf_jit_comp.c:1707:(.text+0xe758): undefined reference to `bpf_rol64'
>> ld: arch/x86/net/bpf_jit_comp.c:1714:(.text+0xe765): undefined reference to `bpf_ror64'
ld: arch/x86/net/bpf_jit_comp.c:1647:(.text+0x10e85): undefined reference to `bpf_ctz64'
ld: arch/x86/net/bpf_jit_comp.c:1673:(.text+0x10e92): undefined reference to `bpf_ffs64'
ld: arch/x86/net/bpf_jit_comp.o: in function `bpf_jit_inlines_kfunc_call':
>> arch/x86/net/bpf_jit_comp.c:4247:(.text+0x177c8): undefined reference to `bpf_ffs64'
ld: arch/x86/net/bpf_jit_comp.c:4247:(.text+0x177d1): undefined reference to `bpf_ctz64'
ld: arch/x86/net/bpf_jit_comp.c:4250:(.text+0x177da): undefined reference to `bpf_fls64'
>> ld: arch/x86/net/bpf_jit_comp.c:4250:(.text+0x177e3): undefined reference to `bpf_clz64'
ld: arch/x86/net/bpf_jit_comp.c:4253:(.text+0x177ec): undefined reference to `bpf_popcnt64'
ld: arch/x86/net/bpf_jit_comp.c:4256:(.text+0x177f5): undefined reference to `bpf_ror64'
ld: arch/x86/net/bpf_jit_comp.c:4256:(.text+0x177ff): undefined reference to `bpf_rol64'
vim +1621 arch/x86/net/bpf_jit_comp.c
1607
1608 static bool bpf_inlines_func_call(u8 **pprog, void *func)
1609 {
1610 bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
1611 bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
1612 bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
1613 bool inlined = true;
1614 u8 *prog = *pprog;
1615
1616 /*
1617 * x86 Bit manipulation instruction set
1618 * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
1619 */
1620
> 1621 if (func == bpf_clz64 && has_abm) {
1622 /*
1623 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
1624 *
1625 * LZCNT - Count the Number of Leading Zero Bits
1626 *
1627 * Opcode/Instruction
1628 * F3 REX.W 0F BD /r
1629 * LZCNT r64, r/m64
1630 *
1631 * Op/En
1632 * RVM
1633 *
1634 * 64/32-bit Mode
1635 * V/N.E.
1636 *
1637 * CPUID Feature Flag
1638 * LZCNT
1639 *
1640 * Description
1641 * Count the number of leading zero bits in r/m64, return
1642 * result in r64.
1643 */
1644 /* emit: x ? 64 - fls64(x) : 64 */
1645 /* lzcnt rax, rdi */
1646 EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
> 1647 } else if (func == bpf_ctz64 && has_bmi1) {
1648 /*
1649 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
1650 *
1651 * TZCNT - Count the Number of Trailing Zero Bits
1652 *
1653 * Opcode/Instruction
1654 * F3 REX.W 0F BC /r
1655 * TZCNT r64, r/m64
1656 *
1657 * Op/En
1658 * RVM
1659 *
1660 * 64/32-bit Mode
1661 * V/N.E.
1662 *
1663 * CPUID Feature Flag
1664 * BMI1
1665 *
1666 * Description
1667 * Count the number of trailing zero bits in r/m64, return
1668 * result in r64.
1669 */
1670 /* emit: x ? __ffs64(x) : 64 */
1671 /* tzcnt rax, rdi */
1672 EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
> 1673 } else if (func == bpf_ffs64 && has_bmi1) {
1674 /* emit: __ffs64(x); x == 0 has been handled in verifier */
1675 /* tzcnt rax, rdi */
1676 EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
> 1677 } else if (func == bpf_fls64 && has_abm) {
1678 /* emit: fls64(x) */
1679 /* lzcnt rax, rdi */
1680 EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
1681 EMIT3(0x48, 0xF7, 0xD8); /* neg rax */
1682 EMIT4(0x48, 0x83, 0xC0, 0x40); /* add rax, 64 */
> 1683 } else if (func == bpf_popcnt64 && has_popcnt) {
1684 /*
1685 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
1686 *
1687 * POPCNT - Return the Count of Number of Bits Set to 1
1688 *
1689 * Opcode/Instruction
1690 * F3 REX.W 0F B8 /r
1691 * POPCNT r64, r/m64
1692 *
1693 * Op/En
1694 * RM
1695 *
1696 * 64 Mode
1697 * Valid
1698 *
1699 * Compat/Leg Mode
1700 * N.E.
1701 *
1702 * Description
1703 * POPCNT on r/m64
1704 */
1705 /* popcnt rax, rdi */
1706 EMIT5(0xF3, 0x48, 0x0F, 0xB8, 0xC7);
> 1707 } else if (func == bpf_rol64) {
1708 EMIT1(0x51); /* push rcx */
1709 /* emit: rol64(x, s) */
1710 EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
1711 EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
1712 EMIT3(0x48, 0xD3, 0xC0); /* rol rax, cl */
1713 EMIT1(0x59); /* pop rcx */
> 1714 } else if (func == bpf_ror64) {
1715 EMIT1(0x51); /* push rcx */
1716 /* emit: ror64(x, s) */
1717 EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
1718 EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
1719 EMIT3(0x48, 0xD3, 0xC8); /* ror rax, cl */
1720 EMIT1(0x59); /* pop rcx */
1721 } else {
1722 inlined = false;
1723 }
1724
1725 *pprog = prog;
1726 return inlined;
1727 }
1728
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 2026/2/20 06:05, kernel test robot wrote:
> Hi Leon,
>
> kernel test robot noticed the following build errors:
>
> [auto build test ERROR on bpf-next/master]
>
> url: https://github.com/intel-lab-lkp/linux/commits/Leon-Hwang/bpf-Introduce-64-bit-bitops-kfuncs/20260219-223550
> base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
> patch link: https://lore.kernel.org/r/20260219142933.13904-3-leon.hwang%40linux.dev
> patch subject: [PATCH bpf-next v2 2/6] bpf, x86: Add 64-bit bitops kfuncs support for x86_64
> config: x86_64-randconfig-073-20260220 (https://download.01.org/0day-ci/archive/20260220/202602200536.JWzGHAc6-lkp@intel.com/config)
Ack.
It was caused by the missing CONFIG_BPF_SYSCALL.
$ rg _BPF .config
118:CONFIG_BPF=y
120:CONFIG_ARCH_WANT_DEFAULT_BPF_JIT=y
125:# CONFIG_BPF_SYSCALL is not set
126:CONFIG_BPF_JIT=y
127:CONFIG_BPF_JIT_DEFAULT_ON=y
1339:CONFIG_LWTUNNEL_BPF=y
7449:CONFIG_IO_URING_BPF=y
I'll make those symbols depend on CONFIG_BPF_SYSCALL in the next revision.
Thanks,
Leon
> compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260220/202602200536.JWzGHAc6-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202602200536.JWzGHAc6-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> ld: arch/x86/net/bpf_jit_comp.o: in function `bpf_inlines_func_call':
>>> arch/x86/net/bpf_jit_comp.c:1621:(.text+0xe70b): undefined reference to `bpf_clz64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1647:(.text+0xe718): undefined reference to `bpf_ctz64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1673:(.text+0xe725): undefined reference to `bpf_ffs64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1677:(.text+0xe732): undefined reference to `bpf_fls64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1683:(.text+0xe743): undefined reference to `bpf_popcnt64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1707:(.text+0xe758): undefined reference to `bpf_rol64'
>>> ld: arch/x86/net/bpf_jit_comp.c:1714:(.text+0xe765): undefined reference to `bpf_ror64'
> ld: arch/x86/net/bpf_jit_comp.c:1647:(.text+0x10e85): undefined reference to `bpf_ctz64'
> ld: arch/x86/net/bpf_jit_comp.c:1673:(.text+0x10e92): undefined reference to `bpf_ffs64'
> ld: arch/x86/net/bpf_jit_comp.o: in function `bpf_jit_inlines_kfunc_call':
>>> arch/x86/net/bpf_jit_comp.c:4247:(.text+0x177c8): undefined reference to `bpf_ffs64'
> ld: arch/x86/net/bpf_jit_comp.c:4247:(.text+0x177d1): undefined reference to `bpf_ctz64'
> ld: arch/x86/net/bpf_jit_comp.c:4250:(.text+0x177da): undefined reference to `bpf_fls64'
>>> ld: arch/x86/net/bpf_jit_comp.c:4250:(.text+0x177e3): undefined reference to `bpf_clz64'
> ld: arch/x86/net/bpf_jit_comp.c:4253:(.text+0x177ec): undefined reference to `bpf_popcnt64'
> ld: arch/x86/net/bpf_jit_comp.c:4256:(.text+0x177f5): undefined reference to `bpf_ror64'
> ld: arch/x86/net/bpf_jit_comp.c:4256:(.text+0x177ff): undefined reference to `bpf_rol64'
>
>
> vim +1621 arch/x86/net/bpf_jit_comp.c
>
> 1607
> 1608 static bool bpf_inlines_func_call(u8 **pprog, void *func)
> 1609 {
> 1610 bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
> 1611 bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
> 1612 bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
> 1613 bool inlined = true;
> 1614 u8 *prog = *pprog;
> 1615
> 1616 /*
> 1617 * x86 Bit manipulation instruction set
> 1618 * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> 1619 */
> 1620
>> 1621 if (func == bpf_clz64 && has_abm) {
> 1622 /*
> 1623 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> 1624 *
> 1625 * LZCNT - Count the Number of Leading Zero Bits
> 1626 *
> 1627 * Opcode/Instruction
> 1628 * F3 REX.W 0F BD /r
> 1629 * LZCNT r64, r/m64
> 1630 *
> 1631 * Op/En
> 1632 * RVM
> 1633 *
> 1634 * 64/32-bit Mode
> 1635 * V/N.E.
> 1636 *
> 1637 * CPUID Feature Flag
> 1638 * LZCNT
> 1639 *
> 1640 * Description
> 1641 * Count the number of leading zero bits in r/m64, return
> 1642 * result in r64.
> 1643 */
> 1644 /* emit: x ? 64 - fls64(x) : 64 */
> 1645 /* lzcnt rax, rdi */
> 1646 EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
>> 1647 } else if (func == bpf_ctz64 && has_bmi1) {
> 1648 /*
> 1649 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> 1650 *
> 1651 * TZCNT - Count the Number of Trailing Zero Bits
> 1652 *
> 1653 * Opcode/Instruction
> 1654 * F3 REX.W 0F BC /r
> 1655 * TZCNT r64, r/m64
> 1656 *
> 1657 * Op/En
> 1658 * RVM
> 1659 *
> 1660 * 64/32-bit Mode
> 1661 * V/N.E.
> 1662 *
> 1663 * CPUID Feature Flag
> 1664 * BMI1
> 1665 *
> 1666 * Description
> 1667 * Count the number of trailing zero bits in r/m64, return
> 1668 * result in r64.
> 1669 */
> 1670 /* emit: x ? __ffs64(x) : 64 */
> 1671 /* tzcnt rax, rdi */
> 1672 EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
>> 1673 } else if (func == bpf_ffs64 && has_bmi1) {
> 1674 /* emit: __ffs64(x); x == 0 has been handled in verifier */
> 1675 /* tzcnt rax, rdi */
> 1676 EMIT5(0xF3, 0x48, 0x0F, 0xBC, 0xC7);
>> 1677 } else if (func == bpf_fls64 && has_abm) {
> 1678 /* emit: fls64(x) */
> 1679 /* lzcnt rax, rdi */
> 1680 EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
> 1681 EMIT3(0x48, 0xF7, 0xD8); /* neg rax */
> 1682 EMIT4(0x48, 0x83, 0xC0, 0x40); /* add rax, 64 */
>> 1683 } else if (func == bpf_popcnt64 && has_popcnt) {
> 1684 /*
> 1685 * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> 1686 *
> 1687 * POPCNT - Return the Count of Number of Bits Set to 1
> 1688 *
> 1689 * Opcode/Instruction
> 1690 * F3 REX.W 0F B8 /r
> 1691 * POPCNT r64, r/m64
> 1692 *
> 1693 * Op/En
> 1694 * RM
> 1695 *
> 1696 * 64 Mode
> 1697 * Valid
> 1698 *
> 1699 * Compat/Leg Mode
> 1700 * N.E.
> 1701 *
> 1702 * Description
> 1703 * POPCNT on r/m64
> 1704 */
> 1705 /* popcnt rax, rdi */
> 1706 EMIT5(0xF3, 0x48, 0x0F, 0xB8, 0xC7);
>> 1707 } else if (func == bpf_rol64) {
> 1708 EMIT1(0x51); /* push rcx */
> 1709 /* emit: rol64(x, s) */
> 1710 EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
> 1711 EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
> 1712 EMIT3(0x48, 0xD3, 0xC0); /* rol rax, cl */
> 1713 EMIT1(0x59); /* pop rcx */
>> 1714 } else if (func == bpf_ror64) {
> 1715 EMIT1(0x51); /* push rcx */
> 1716 /* emit: ror64(x, s) */
> 1717 EMIT3(0x48, 0x89, 0xF1); /* mov rcx, rsi */
> 1718 EMIT3(0x48, 0x89, 0xF8); /* mov rax, rdi */
> 1719 EMIT3(0x48, 0xD3, 0xC8); /* ror rax, cl */
> 1720 EMIT1(0x59); /* pop rcx */
> 1721 } else {
> 1722 inlined = false;
> 1723 }
> 1724
> 1725 *pprog = prog;
> 1726 return inlined;
> 1727 }
> 1728
>
On Thu, Feb 19, 2026 at 6:30 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
> Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
>
> bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
>
> bpf_ctz64() and bpf_ffs64() are supported when the CPU has
> X86_FEATURE_BMI1 (TZCNT).
>
> bpf_clz64() and bpf_fls64() are supported when the CPU has
> X86_FEATURE_ABM (LZCNT).
>
> bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
>
> bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
> instruction, so it falls back to a regular function call.
>
> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> ---
> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
> 1 file changed, 141 insertions(+)
>
> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> index 070ba80e39d7..193e1e2d7aa8 100644
> --- a/arch/x86/net/bpf_jit_comp.c
> +++ b/arch/x86/net/bpf_jit_comp.c
> @@ -19,6 +19,7 @@
> #include <asm/text-patching.h>
> #include <asm/unwind.h>
> #include <asm/cfi.h>
> +#include <asm/cpufeatures.h>
>
> static bool all_callee_regs_used[4] = {true, true, true, true};
>
> @@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
> *pprog = prog;
> }
>
> +static bool bpf_inlines_func_call(u8 **pprog, void *func)
> +{
> + bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
> + bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
> + bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
> + bool inlined = true;
> + u8 *prog = *pprog;
> +
> + /*
> + * x86 Bit manipulation instruction set
> + * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> + */
> +
> + if (func == bpf_clz64 && has_abm) {
> + /*
> + * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> + *
> + * LZCNT - Count the Number of Leading Zero Bits
> + *
> + * Opcode/Instruction
> + * F3 REX.W 0F BD /r
> + * LZCNT r64, r/m64
> + *
> + * Op/En
> + * RVM
> + *
> + * 64/32-bit Mode
> + * V/N.E.
> + *
> + * CPUID Feature Flag
> + * LZCNT
> + *
> + * Description
> + * Count the number of leading zero bits in r/m64, return
> + * result in r64.
> + */
> + /* emit: x ? 64 - fls64(x) : 64 */
> + /* lzcnt rax, rdi */
> + EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
Instead of emitting binary in x86 and arm JITs,
let's use in kernel disasm to check that all these kfuncs
conform to kf_fastcall (don't use unnecessary registers,
don't have calls to other functions) and then copy the binary
from code and skip the last 'ret' insn.
This way we can inline all kinds of kfuncs.
pw-bot: cr
On 2026/2/20 01:47, Alexei Starovoitov wrote:
> On Thu, Feb 19, 2026 at 6:30 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
>>
>> bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
>>
>> bpf_ctz64() and bpf_ffs64() are supported when the CPU has
>> X86_FEATURE_BMI1 (TZCNT).
>>
>> bpf_clz64() and bpf_fls64() are supported when the CPU has
>> X86_FEATURE_ABM (LZCNT).
>>
>> bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
>>
>> bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
>> instruction, so it falls back to a regular function call.
>>
>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
>> ---
>> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
>> 1 file changed, 141 insertions(+)
>>
>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>> index 070ba80e39d7..193e1e2d7aa8 100644
>> --- a/arch/x86/net/bpf_jit_comp.c
>> +++ b/arch/x86/net/bpf_jit_comp.c
>> @@ -19,6 +19,7 @@
>> #include <asm/text-patching.h>
>> #include <asm/unwind.h>
>> #include <asm/cfi.h>
>> +#include <asm/cpufeatures.h>
>>
>> static bool all_callee_regs_used[4] = {true, true, true, true};
>>
>> @@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
>> *pprog = prog;
>> }
>>
>> +static bool bpf_inlines_func_call(u8 **pprog, void *func)
>> +{
>> + bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
>> + bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
>> + bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
>> + bool inlined = true;
>> + u8 *prog = *pprog;
>> +
>> + /*
>> + * x86 Bit manipulation instruction set
>> + * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
>> + */
>> +
>> + if (func == bpf_clz64 && has_abm) {
>> + /*
>> + * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
>> + *
>> + * LZCNT - Count the Number of Leading Zero Bits
>> + *
>> + * Opcode/Instruction
>> + * F3 REX.W 0F BD /r
>> + * LZCNT r64, r/m64
>> + *
>> + * Op/En
>> + * RVM
>> + *
>> + * 64/32-bit Mode
>> + * V/N.E.
>> + *
>> + * CPUID Feature Flag
>> + * LZCNT
>> + *
>> + * Description
>> + * Count the number of leading zero bits in r/m64, return
>> + * result in r64.
>> + */
>> + /* emit: x ? 64 - fls64(x) : 64 */
>> + /* lzcnt rax, rdi */
>> + EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
>
> Instead of emitting binary in x86 and arm JITs,
> let's use in kernel disasm to check that all these kfuncs
> conform to kf_fastcall (don't use unnecessary registers,
> don't have calls to other functions) and then copy the binary
> from code and skip the last 'ret' insn.
> This way we can inline all kinds of kfuncs.
>
Good idea.
Quick question on “in-kernel disasm”: do you mean adding a kernel
instruction decoder/disassembler to validate a whitelist of kfuncs at
load time?
I’m trying to understand the intended scope:
* Is the expectation that we add an in-kernel disassembler/validator for
a small set of supported instructions and patterns (no calls/jumps,
only arg/ret regs touched, etc.)?
* Or is there already infrastructure you had in mind that we can reuse?
Once I understand that piece, I can rework the series to inline by
copying validated machine code (minus the final ret), rather than
emitting raw opcodes in the JITs.
I also noticed you mentioned a similar direction in "bpf/s390: Implement
get_preempt_count()" [1], so I’ve added Ilya to the thread to discuss
this approach further.
[1]
https://lore.kernel.org/bpf/CAADnVQKSMCohZy_HZwzNpFfTSnVu7rfxgmHEDgT9s28XxcDS5g@mail.gmail.com/
Thanks,
Leon
On Fri, Feb 20, 2026 at 7:54 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
>
>
> On 2026/2/20 01:47, Alexei Starovoitov wrote:
> > On Thu, Feb 19, 2026 at 6:30 AM Leon Hwang <leon.hwang@linux.dev> wrote:
> >>
> >> Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
> >>
> >> bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
> >>
> >> bpf_ctz64() and bpf_ffs64() are supported when the CPU has
> >> X86_FEATURE_BMI1 (TZCNT).
> >>
> >> bpf_clz64() and bpf_fls64() are supported when the CPU has
> >> X86_FEATURE_ABM (LZCNT).
> >>
> >> bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
> >>
> >> bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
> >> instruction, so it falls back to a regular function call.
> >>
> >> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> >> ---
> >> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
> >> 1 file changed, 141 insertions(+)
> >>
> >> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> >> index 070ba80e39d7..193e1e2d7aa8 100644
> >> --- a/arch/x86/net/bpf_jit_comp.c
> >> +++ b/arch/x86/net/bpf_jit_comp.c
> >> @@ -19,6 +19,7 @@
> >> #include <asm/text-patching.h>
> >> #include <asm/unwind.h>
> >> #include <asm/cfi.h>
> >> +#include <asm/cpufeatures.h>
> >>
> >> static bool all_callee_regs_used[4] = {true, true, true, true};
> >>
> >> @@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
> >> *pprog = prog;
> >> }
> >>
> >> +static bool bpf_inlines_func_call(u8 **pprog, void *func)
> >> +{
> >> + bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
> >> + bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
> >> + bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
> >> + bool inlined = true;
> >> + u8 *prog = *pprog;
> >> +
> >> + /*
> >> + * x86 Bit manipulation instruction set
> >> + * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> >> + */
> >> +
> >> + if (func == bpf_clz64 && has_abm) {
> >> + /*
> >> + * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> >> + *
> >> + * LZCNT - Count the Number of Leading Zero Bits
> >> + *
> >> + * Opcode/Instruction
> >> + * F3 REX.W 0F BD /r
> >> + * LZCNT r64, r/m64
> >> + *
> >> + * Op/En
> >> + * RVM
> >> + *
> >> + * 64/32-bit Mode
> >> + * V/N.E.
> >> + *
> >> + * CPUID Feature Flag
> >> + * LZCNT
> >> + *
> >> + * Description
> >> + * Count the number of leading zero bits in r/m64, return
> >> + * result in r64.
> >> + */
> >> + /* emit: x ? 64 - fls64(x) : 64 */
> >> + /* lzcnt rax, rdi */
> >> + EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
> >
> > Instead of emitting binary in x86 and arm JITs,
> > let's use in kernel disasm to check that all these kfuncs
> > conform to kf_fastcall (don't use unnecessary registers,
> > don't have calls to other functions) and then copy the binary
> > from code and skip the last 'ret' insn.
> > This way we can inline all kinds of kfuncs.
> >
>
> Good idea.
>
> Quick question on “in-kernel disasm”: do you mean adding a kernel
> instruction decoder/disassembler to validate a whitelist of kfuncs at
> load time?
>
> I’m trying to understand the intended scope:
>
> * Is the expectation that we add an in-kernel disassembler/validator for
> a small set of supported instructions and patterns (no calls/jumps,
> only arg/ret regs touched, etc.)?
> * Or is there already infrastructure you had in mind that we can reuse?
>
> Once I understand that piece, I can rework the series to inline by
> copying validated machine code (minus the final ret), rather than
> emitting raw opcodes in the JITs.
>
> I also noticed you mentioned a similar direction in "bpf/s390: Implement
> get_preempt_count()" [1], so I’ve added Ilya to the thread to discuss
> this approach further.
You really sound like LLM. Do your homework as a human.
On 2026/2/21 01:50, Alexei Starovoitov wrote:
> On Fri, Feb 20, 2026 at 7:54 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>>
>>
>> On 2026/2/20 01:47, Alexei Starovoitov wrote:
>>> On Thu, Feb 19, 2026 at 6:30 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>>>
>>>> Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
>>>>
>>>> bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
>>>>
>>>> bpf_ctz64() and bpf_ffs64() are supported when the CPU has
>>>> X86_FEATURE_BMI1 (TZCNT).
>>>>
>>>> bpf_clz64() and bpf_fls64() are supported when the CPU has
>>>> X86_FEATURE_ABM (LZCNT).
>>>>
>>>> bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
>>>>
>>>> bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
>>>> instruction, so it falls back to a regular function call.
>>>>
>>>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
>>>> ---
>>>> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
>>>> 1 file changed, 141 insertions(+)
>>>>
>>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
>>>> index 070ba80e39d7..193e1e2d7aa8 100644
>>>> --- a/arch/x86/net/bpf_jit_comp.c
>>>> +++ b/arch/x86/net/bpf_jit_comp.c
>>>> @@ -19,6 +19,7 @@
>>>> #include <asm/text-patching.h>
>>>> #include <asm/unwind.h>
>>>> #include <asm/cfi.h>
>>>> +#include <asm/cpufeatures.h>
>>>>
>>>> static bool all_callee_regs_used[4] = {true, true, true, true};
>>>>
>>>> @@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
>>>> *pprog = prog;
>>>> }
>>>>
>>>> +static bool bpf_inlines_func_call(u8 **pprog, void *func)
>>>> +{
>>>> + bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
>>>> + bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
>>>> + bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
>>>> + bool inlined = true;
>>>> + u8 *prog = *pprog;
>>>> +
>>>> + /*
>>>> + * x86 Bit manipulation instruction set
>>>> + * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
>>>> + */
>>>> +
>>>> + if (func == bpf_clz64 && has_abm) {
>>>> + /*
>>>> + * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
>>>> + *
>>>> + * LZCNT - Count the Number of Leading Zero Bits
>>>> + *
>>>> + * Opcode/Instruction
>>>> + * F3 REX.W 0F BD /r
>>>> + * LZCNT r64, r/m64
>>>> + *
>>>> + * Op/En
>>>> + * RVM
>>>> + *
>>>> + * 64/32-bit Mode
>>>> + * V/N.E.
>>>> + *
>>>> + * CPUID Feature Flag
>>>> + * LZCNT
>>>> + *
>>>> + * Description
>>>> + * Count the number of leading zero bits in r/m64, return
>>>> + * result in r64.
>>>> + */
>>>> + /* emit: x ? 64 - fls64(x) : 64 */
>>>> + /* lzcnt rax, rdi */
>>>> + EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
>>>
>>> Instead of emitting binary in x86 and arm JITs,
>>> let's use in kernel disasm to check that all these kfuncs
>>> conform to kf_fastcall (don't use unnecessary registers,
>>> don't have calls to other functions) and then copy the binary
>>> from code and skip the last 'ret' insn.
>>> This way we can inline all kinds of kfuncs.
>>>
>>
>> Good idea.
>>
>> Quick question on “in-kernel disasm”: do you mean adding a kernel
>> instruction decoder/disassembler to validate a whitelist of kfuncs at
>> load time?
>>
>> I’m trying to understand the intended scope:
>>
>> * Is the expectation that we add an in-kernel disassembler/validator for
>> a small set of supported instructions and patterns (no calls/jumps,
>> only arg/ret regs touched, etc.)?
>> * Or is there already infrastructure you had in mind that we can reuse?
>>
>> Once I understand that piece, I can rework the series to inline by
>> copying validated machine code (minus the final ret), rather than
>> emitting raw opcodes in the JITs.
>>
>> I also noticed you mentioned a similar direction in "bpf/s390: Implement
>> get_preempt_count()" [1], so I’ve added Ilya to the thread to discuss
>> this approach further.
>
> You really sound like LLM. Do your homework as a human.
Got it.
I polished my draft using ChatGPT, which left an LLM smell in my reply.
Here's my original draft:
Good idea. But I concern about the "in kernel disasm". Do you mean we
will build a disassembler for whitelist kfuncs at starting?
I noticed you've mentioned the same direction in "bpf/s390: Implement
get_preempt_count()" [1]. So, I added Ilya here to discuss this direction.
[1]
https://lore.kernel.org/bpf/CAADnVQKSMCohZy_HZwzNpFfTSnVu7rfxgmHEDgT9s28XxcDS5g@mail.gmail.com/
Thanks,
Leon
On Sat, Feb 21, 2026 at 4:45 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>
>
>
> On 2026/2/21 01:50, Alexei Starovoitov wrote:
> > On Fri, Feb 20, 2026 at 7:54 AM Leon Hwang <leon.hwang@linux.dev> wrote:
> >>
> >>
> >>
> >> On 2026/2/20 01:47, Alexei Starovoitov wrote:
> >>> On Thu, Feb 19, 2026 at 6:30 AM Leon Hwang <leon.hwang@linux.dev> wrote:
> >>>>
> >>>> Implement JIT inlining of the 64-bit bitops kfuncs on x86_64.
> >>>>
> >>>> bpf_rol64() and bpf_ror64() are always supported via ROL/ROR.
> >>>>
> >>>> bpf_ctz64() and bpf_ffs64() are supported when the CPU has
> >>>> X86_FEATURE_BMI1 (TZCNT).
> >>>>
> >>>> bpf_clz64() and bpf_fls64() are supported when the CPU has
> >>>> X86_FEATURE_ABM (LZCNT).
> >>>>
> >>>> bpf_popcnt64() is supported when the CPU has X86_FEATURE_POPCNT.
> >>>>
> >>>> bpf_bitrev64() is not inlined as x86_64 has no native bit-reverse
> >>>> instruction, so it falls back to a regular function call.
> >>>>
> >>>> Signed-off-by: Leon Hwang <leon.hwang@linux.dev>
> >>>> ---
> >>>> arch/x86/net/bpf_jit_comp.c | 141 ++++++++++++++++++++++++++++++++++++
> >>>> 1 file changed, 141 insertions(+)
> >>>>
> >>>> diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
> >>>> index 070ba80e39d7..193e1e2d7aa8 100644
> >>>> --- a/arch/x86/net/bpf_jit_comp.c
> >>>> +++ b/arch/x86/net/bpf_jit_comp.c
> >>>> @@ -19,6 +19,7 @@
> >>>> #include <asm/text-patching.h>
> >>>> #include <asm/unwind.h>
> >>>> #include <asm/cfi.h>
> >>>> +#include <asm/cpufeatures.h>
> >>>>
> >>>> static bool all_callee_regs_used[4] = {true, true, true, true};
> >>>>
> >>>> @@ -1604,6 +1605,127 @@ static void emit_priv_frame_ptr(u8 **pprog, void __percpu *priv_frame_ptr)
> >>>> *pprog = prog;
> >>>> }
> >>>>
> >>>> +static bool bpf_inlines_func_call(u8 **pprog, void *func)
> >>>> +{
> >>>> + bool has_popcnt = boot_cpu_has(X86_FEATURE_POPCNT);
> >>>> + bool has_bmi1 = boot_cpu_has(X86_FEATURE_BMI1);
> >>>> + bool has_abm = boot_cpu_has(X86_FEATURE_ABM);
> >>>> + bool inlined = true;
> >>>> + u8 *prog = *pprog;
> >>>> +
> >>>> + /*
> >>>> + * x86 Bit manipulation instruction set
> >>>> + * https://en.wikipedia.org/wiki/X86_Bit_manipulation_instruction_set
> >>>> + */
> >>>> +
> >>>> + if (func == bpf_clz64 && has_abm) {
> >>>> + /*
> >>>> + * Intel® 64 and IA-32 Architectures Software Developer's Manual (June 2023)
> >>>> + *
> >>>> + * LZCNT - Count the Number of Leading Zero Bits
> >>>> + *
> >>>> + * Opcode/Instruction
> >>>> + * F3 REX.W 0F BD /r
> >>>> + * LZCNT r64, r/m64
> >>>> + *
> >>>> + * Op/En
> >>>> + * RVM
> >>>> + *
> >>>> + * 64/32-bit Mode
> >>>> + * V/N.E.
> >>>> + *
> >>>> + * CPUID Feature Flag
> >>>> + * LZCNT
> >>>> + *
> >>>> + * Description
> >>>> + * Count the number of leading zero bits in r/m64, return
> >>>> + * result in r64.
> >>>> + */
> >>>> + /* emit: x ? 64 - fls64(x) : 64 */
> >>>> + /* lzcnt rax, rdi */
> >>>> + EMIT5(0xF3, 0x48, 0x0F, 0xBD, 0xC7);
> >>>
> >>> Instead of emitting binary in x86 and arm JITs,
> >>> let's use in kernel disasm to check that all these kfuncs
> >>> conform to kf_fastcall (don't use unnecessary registers,
> >>> don't have calls to other functions) and then copy the binary
> >>> from code and skip the last 'ret' insn.
> >>> This way we can inline all kinds of kfuncs.
> >>>
> >>
> >> Good idea.
> >>
> >> Quick question on “in-kernel disasm”: do you mean adding a kernel
> >> instruction decoder/disassembler to validate a whitelist of kfuncs at
> >> load time?
> >>
> >> I’m trying to understand the intended scope:
> >>
> >> * Is the expectation that we add an in-kernel disassembler/validator for
> >> a small set of supported instructions and patterns (no calls/jumps,
> >> only arg/ret regs touched, etc.)?
> >> * Or is there already infrastructure you had in mind that we can reuse?
> >>
> >> Once I understand that piece, I can rework the series to inline by
> >> copying validated machine code (minus the final ret), rather than
> >> emitting raw opcodes in the JITs.
> >>
> >> I also noticed you mentioned a similar direction in "bpf/s390: Implement
> >> get_preempt_count()" [1], so I’ve added Ilya to the thread to discuss
> >> this approach further.
> >
> > You really sound like LLM. Do your homework as a human.
>
> Got it.
>
> I polished my draft using ChatGPT, which would leave LLM smell in my reply.
... and for anyone reading it the smell is ohh too strong.
> Here's my original draft:
>
> Good idea. But I concern about the "in kernel disasm". Do you mean we
> will build a disassembler for whitelist kfuncs at starting?
>
> I noticed you've mentioned the same direction in "bpf/s390: Implement
> get_preempt_count()" [1]. So, I added Ilya here to discuss this direction.
Much better. Keep it human.
"in kernel disasm" already exists for some architectures
(at least x86 and arm64) since it's being used by kprobes.
The ask here is to figure out whether they're usable for such
insn analysis. x86 disasm is likely capable.
re:"whitelist kfunc"
I suspect an additional list is not necessary.
kf_fastcall is a good enough signal that such kfunc should
be inlinable.
On 2026/2/22 00:51, Alexei Starovoitov wrote:
> On Sat, Feb 21, 2026 at 4:45 AM Leon Hwang <leon.hwang@linux.dev> wrote:
>>
>> [...]
>>
>> Good idea. But I concern about the "in kernel disasm". Do you mean we
>> will build a disassembler for whitelist kfuncs at starting?
>>
>> I noticed you've mentioned the same direction in "bpf/s390: Implement
>> get_preempt_count()" [1]. So, I added Ilya here to discuss this direction.
>
> Much better. Keep it human.
>
> "in kernel disasm" already exists for some architectures
> (at least x86 and arm64) since it's being used by kprobes.
> The ask here is to figure out whether they're usable for such
> insn analysis. x86 disasm is likely capable.

After looking into the x86 and arm64 insn decoders, they are able to do
such insn analysis.

> re:"whitelist kfunc"
> I suspect an additional list is not necessary.
> kf_fastcall is a good enough signal that such kfunc should
> be inlinable.

I thought it was to build a light-weight custom disassembler, which would
only support a limited set of machine code (whitelist kfuncs). Obviously,
I was wrong.

We can reuse the in-kernel insn decoding ability to validate a fastcall
function by checking its register usage.

I'll post an RFC after finishing the PoC, on both x86_64 and arm64 of
course.

Thanks,
Leon
© 2016 - 2026 Red Hat, Inc.