User space needs access to kernel BTF for many modern features of BPF.
Right now each process needs to read the BTF blob either in pieces or
as a whole. Allow mmaping the sysfs file so that processes can directly
access the memory allocated for it in the kernel.
remap_pfn_range is used instead of vm_insert_page due to aarch64
compatibility issues.
Tested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
---
include/asm-generic/vmlinux.lds.h | 3 ++-
kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++
2 files changed, 34 insertions(+), 1 deletion(-)
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
*/
#ifdef CONFIG_DEBUG_INFO_BTF
#define BTF \
+ . = ALIGN(PAGE_SIZE); \
.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \
BOUNDED_SECTION_BY(.BTF, _BTF) \
} \
- . = ALIGN(4); \
+ . = ALIGN(PAGE_SIZE); \
.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \
*(.BTF_ids) \
}
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,14 +7,46 @@
#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
/* See scripts/link-vmlinux.sh, gen_btf() func for details */
extern char __start_BTF[];
extern char __stop_BTF[];
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+				  const struct bin_attribute *attr,
+				  struct vm_area_struct *vma)
+{
+	unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+	size_t vm_size = vma->vm_end - vma->vm_start;
+	/* Use __pa_symbol() rather than virt_to_phys(): __start_BTF is a
+	 * kernel-image symbol, and on arm64 the kernel image lives in the
+	 * vmalloc region, so virt_to_phys() on it triggers the
+	 * "virt_to_phys used for non-linear address" warning.
+	 */
+	phys_addr_t addr = __pa_symbol(__start_BTF);
+	unsigned long pfn = addr >> PAGE_SHIFT;
+
+	/* Sanity check: the attribute must describe the vmlinux BTF blob and
+	 * the linker script must have page-aligned the .BTF section.
+	 */
+	if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	/* Only offset-0 mappings are supported. */
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	/* BTF is read-only: refuse writable, executable or shared mappings. */
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+		return -EACCES;
+
+	/* Reject pfn-range overflow. */
+	if (pfn + pages < pfn)
+		return -EINVAL;
+
+	/* The requested mapping may not extend past the BTF region. */
+	if ((vm_size >> PAGE_SHIFT) > pages)
+		return -EINVAL;
+
+	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
.attr = { .name = "vmlinux", .mode = 0444, },
.read_new = sysfs_bin_attr_simple_read,
+ .mmap = btf_sysfs_vmlinux_mmap,
};
struct kobject *btf_kobj;
--
2.49.0
Hello Lorenz,
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> --- a/kernel/bpf/sysfs_btf.c
> +++ b/kernel/bpf/sysfs_btf.c
> extern char __start_BTF[];
> extern char __stop_BTF[];
> +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> + const struct bin_attribute *attr,
> + struct vm_area_struct *vma)
> +{
> + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> + size_t vm_size = vma->vm_end - vma->vm_start;
> + phys_addr_t addr = virt_to_phys(__start_BTF);
I am getting the following warning on arm64 which seems related to this
code here. Lines are based on cd031354087d8ae ("Merge branch
'net-mlx5e-add-support-for-pcie-congestion-events'") of the net-next branch.
[ 58.896157] virt_to_phys used for non-linear address: 000000009fea9737 (__start_BTF+0x0/0x685530)
[ 23.988669] WARNING: CPU: 25 PID: 1442 at arch/arm64/mm/physaddr.c:15 __virt_to_phys (arch/arm64/mm/physaddr.c:?)
[ 24.018136] Modules linked in: nvidia_cspmu(E) mlx5_ib(E) ipmi_ssif(E) arm_smmuv3_pmu(E) arm_cspmu_module(E) coresight_trbe(E) ib_uverbs(E) ipmi_devintf(E) ipmi_msghandler(E) coresight_stm(E) coresight_etm4x(E) coresight_tmc(E) coresight_funnel(E) stm_core(E) coresight(E) cppc_cpufreq(E) sch_fq_codel(E) drm(E) backlight(E) drm_panel_orientation_quirks(E) xhci_pci(E) xhci_hcd(E) sm3_ce(E) sha3_ce(E) sha512_ce(E) spi_tegra210_quad(E) acpi_power_meter(E) loop(E) efivarfs(E) autofs4(E)
[ 24.075371] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST
[ 24.080276] Hardware name: Quanta S7GM 20S7GCU0010/S7G MB (CG1), BIOS 3D22 07/03/2024
[ 24.088295] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
[ 24.098440] pc : __virt_to_phys (arch/arm64/mm/physaddr.c:?)
[ 24.105398] lr : __virt_to_phys (arch/arm64/mm/physaddr.c:?)
[ 24.112227] sp : ffff8000ba00f8e0
[ 24.115620] x29: ffff8000ba00f8e0 x28: ffff8000ba00faf0 x27: ffff8000ba00fa88
[ 24.122919] x26: ffff8000ba00fa40 x25: ffff800082772000 x24: 0000fffd6db70000
[ 24.130226] x23: 0000000000685530 x22: 0000fffd6e200000 x21: ffff800081cc0000
[ 24.140540] x20: ffff800081be02d8 x19: ffff800081cc0000 x18: 5f5f282037333739
[ 24.150708] x17: 6165663930303030 x16: 0000000000000fc4 x15: 0000000000000003
[ 24.160737] x14: ffff800082923398 x13: 0000000000000003 x12: 0000000000000003
[ 24.168042] x11: 00000000fffeffff x10: ffff800082663784 x9 : cc38fcac5cdabe00
[ 24.175348] x8 : 0001000000000000 x7 : ffff8000813dd878 x6 : 0000000000000000
[ 24.182653] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000
[ 24.189959] x2 : 0000000000000000 x1 : ffff800081a3a6d0 x0 : 0000000000000055
[ 24.197257] Call trace:
[ 24.199761] __virt_to_phys (arch/arm64/mm/physaddr.c:?) (P)
[ 24.206883] btf_sysfs_vmlinux_mmap (kernel/bpf/sysfs_btf.c:27)
[ 24.214264] sysfs_kf_bin_mmap (fs/sysfs/file.c:179)
[ 24.218536] kernfs_fop_mmap (fs/kernfs/file.c:462)
[ 24.222461] mmap_region (./include/linux/fs.h:? mm/internal.h:167 mm/vma.c:2405 mm/vma.c:2467 mm/vma.c:2622 mm/vma.c:2692)
Should __pa_symbol() be used instead of virt_to_phys()?
Thanks
--breno
Hi Breno, Thanks for reaching out. On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao <leitao@debian.org> wrote: > Should __pa_symbol() be used instead of virt_to_phys()? I'm not really well versed with mm in general. Looking around a bit I found some explanation in [1]. Your suggested fix does make sense to me based on that. Let me run the patch against bpf-ci and see what happens. 1: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464395.git.lukas@wunner.de/ Lorenz
On Thu, Jul 17, 2025 at 6:18 AM Lorenz Bauer <lmb@isovalent.com> wrote: > > Hi Breno, > > Thanks for reaching out. > > On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao <leitao@debian.org> wrote: > > > Should __pa_symbol() be used instead of virt_to_phys()? > > I'm not really well versed with mm in general. Looking around a bit I > found some explanation in [1]. Your suggested fix does make sense to > me based on that. > > Let me run the patch against bpf-ci and see what happens. > > 1: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464395.git.lukas@wunner.de/ Thanks for the link. Key quote: "arm64 maps the kernel in the vmalloc space." I think the map shouldn't be destroying linearity of kernel rodata. __pa_symbol() should work for start_BTF, but would be good to double check with Ard that the rest stays linear.
On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote: > __pa_symbol() should work for start_BTF, but would be good > to double check with Ard that the rest stays linear. Alexei, This code in the arm64 setup does make me think we'll be OK. kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); Using these as start and end only makes sense to me if the addresses are linear? See https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L217 Let me know if you want me to double check with Ard regardless. Best Lorenz
On Thu, Jul 17, 2025 at 8:15 AM Lorenz Bauer <lmb@isovalent.com> wrote: > > On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov > <alexei.starovoitov@gmail.com> wrote: > > > __pa_symbol() should work for start_BTF, but would be good > > to double check with Ard that the rest stays linear. > > Alexei, > > This code in the arm64 setup does make me think we'll be OK. > > kernel_code.start = __pa_symbol(_stext); > kernel_code.end = __pa_symbol(__init_begin - 1); > kernel_data.start = __pa_symbol(_sdata); > kernel_data.end = __pa_symbol(_end - 1); > > Using these as start and end only makes sense to me if the addresses > are linear? See > https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L217 Thanks for checking. lgtm.
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> User space needs access to kernel BTF for many modern features of BPF.
> Right now each process needs to read the BTF blob either in pieces or
> as a whole. Allow mmaping the sysfs file so that processes can directly
> access the memory allocated for it in the kernel.
>
> remap_pfn_range is used instead of vm_insert_page due to aarch64
> compatibility issues.
>
> Tested-by: Alan Maguire <alan.maguire@oracle.com>
> Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
> ---
> include/asm-generic/vmlinux.lds.h | 3 ++-
> kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++
> 2 files changed, 34 insertions(+), 1 deletion(-)
>
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
> */
> #ifdef CONFIG_DEBUG_INFO_BTF
> #define BTF \
> + . = ALIGN(PAGE_SIZE); \
> .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \
> BOUNDED_SECTION_BY(.BTF, _BTF) \
> } \
> - . = ALIGN(4); \
> + . = ALIGN(PAGE_SIZE); \
> .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \
> *(.BTF_ids) \
> }
> diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> --- a/kernel/bpf/sysfs_btf.c
> +++ b/kernel/bpf/sysfs_btf.c
> @@ -7,14 +7,46 @@
> #include <linux/kobject.h>
> #include <linux/init.h>
> #include <linux/sysfs.h>
> +#include <linux/mm.h>
> +#include <linux/io.h>
> +#include <linux/btf.h>
>
> /* See scripts/link-vmlinux.sh, gen_btf() func for details */
> extern char __start_BTF[];
> extern char __stop_BTF[];
>
> +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> + const struct bin_attribute *attr,
> + struct vm_area_struct *vma)
> +{
> + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> + size_t vm_size = vma->vm_end - vma->vm_start;
> + phys_addr_t addr = virt_to_phys(__start_BTF);
> + unsigned long pfn = addr >> PAGE_SHIFT;
> +
> + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
With vmlinux.lds.h change above, is the page aligned check still needed?
Oh also can the size of btf region be non-page aligned?
> + return -EINVAL;
> +
> + if (vma->vm_pgoff)
> + return -EINVAL;
> +
> + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
> + return -EACCES;
> +
> + if (pfn + pages < pfn)
> + return -EINVAL;
> +
> + if ((vm_size >> PAGE_SHIFT) > pages)
> + return -EINVAL;
> +
> + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
Is it ok for fork() to keep the mapping in the child? (i.e. do you need
VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you
want you can remove it here.
> + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
> +}
> +
> static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
> .attr = { .name = "vmlinux", .mode = 0444, },
> .read_new = sysfs_bin_attr_simple_read,
> + .mmap = btf_sysfs_vmlinux_mmap,
> };
>
> struct kobject *btf_kobj;
>
Overall this looks good to me, so you can add:
Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
On Thu, May 22, 2025 at 4:01 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> > User space needs access to kernel BTF for many modern features of BPF.
> > Right now each process needs to read the BTF blob either in pieces or
> > as a whole. Allow mmaping the sysfs file so that processes can directly
> > access the memory allocated for it in the kernel.
> >
> > remap_pfn_range is used instead of vm_insert_page due to aarch64
> > compatibility issues.
> >
> > Tested-by: Alan Maguire <alan.maguire@oracle.com>
> > Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
> > ---
> > include/asm-generic/vmlinux.lds.h | 3 ++-
> > kernel/bpf/sysfs_btf.c | 32 ++++++++++++++++++++++++++++++++
> > 2 files changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
> > --- a/include/asm-generic/vmlinux.lds.h
> > +++ b/include/asm-generic/vmlinux.lds.h
> > @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
> > */
> > #ifdef CONFIG_DEBUG_INFO_BTF
> > #define BTF \
> > + . = ALIGN(PAGE_SIZE); \
> > .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) { \
> > BOUNDED_SECTION_BY(.BTF, _BTF) \
> > } \
> > - . = ALIGN(4); \
> > + . = ALIGN(PAGE_SIZE); \
> > .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) { \
> > *(.BTF_ids) \
> > }
> > diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> > index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> > --- a/kernel/bpf/sysfs_btf.c
> > +++ b/kernel/bpf/sysfs_btf.c
> > @@ -7,14 +7,46 @@
> > #include <linux/kobject.h>
> > #include <linux/init.h>
> > #include <linux/sysfs.h>
> > +#include <linux/mm.h>
> > +#include <linux/io.h>
> > +#include <linux/btf.h>
> >
> > /* See scripts/link-vmlinux.sh, gen_btf() func for details */
> > extern char __start_BTF[];
> > extern char __stop_BTF[];
> >
> > +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> > + const struct bin_attribute *attr,
> > + struct vm_area_struct *vma)
> > +{
> > + unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> > + size_t vm_size = vma->vm_end - vma->vm_start;
> > + phys_addr_t addr = virt_to_phys(__start_BTF);
> > + unsigned long pfn = addr >> PAGE_SHIFT;
> > +
> > + if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
>
> With vmlinux.lds.h change above, is the page aligned check still needed?
>
> Oh also can the size of btf region be non-page aligned?
I'd probably leave this as a sanity/safety check, just in case someone
modifies linker script and we miss this.
BTF region size isn't page-aligned but in the linker script we
page-align .BTF_ids that follows it, so the padding should be zeroed
out. And Lorenz added a check in the selftest to validate this, so we
should be covered.
>
> > + return -EINVAL;
> > +
> > + if (vma->vm_pgoff)
> > + return -EINVAL;
> > +
> > + if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
> > + return -EACCES;
> > +
> > + if (pfn + pages < pfn)
> > + return -EINVAL;
> > +
> > + if ((vm_size >> PAGE_SHIFT) > pages)
> > + return -EINVAL;
> > +
> > + vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
>
> Is it ok for fork() to keep the mapping in the child? (i.e. do you need
> VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you
> want you can remove it here.
I think it's good to keep the mapping across fork, otherwise libbpf might crash
after fork() due to BTF data suddenly disappearing.
>
> > + return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
> > +}
> > +
> > static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
> > .attr = { .name = "vmlinux", .mode = 0444, },
> > .read_new = sysfs_bin_attr_simple_read,
> > + .mmap = btf_sysfs_vmlinux_mmap,
> > };
> >
> > struct kobject *btf_kobj;
> >
>
> Overall this looks good to me, so you can add:
>
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Thanks Shakeel, I've applied the patches to bpf-next!
© 2016 - 2025 Red Hat, Inc.