[PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf

Lorenz Bauer posted 3 patches 6 months, 4 weeks ago
[PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Lorenz Bauer 6 months, 4 weeks ago
User space needs access to kernel BTF for many modern features of BPF.
Right now each process needs to read the BTF blob either in pieces or
as a whole. Allow mmapping the sysfs file so that processes can directly
access the memory allocated for it in the kernel.

remap_pfn_range is used instead of vm_insert_page due to aarch64
compatibility issues.

Tested-by: Alan Maguire <alan.maguire@oracle.com>
Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
---
 include/asm-generic/vmlinux.lds.h |  3 ++-
 kernel/bpf/sysfs_btf.c            | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
  */
 #ifdef CONFIG_DEBUG_INFO_BTF
 #define BTF								\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {				\
 		BOUNDED_SECTION_BY(.BTF, _BTF)				\
 	}								\
-	. = ALIGN(4);							\
+	. = ALIGN(PAGE_SIZE);						\
 	.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {			\
 		*(.BTF_ids)						\
 	}
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -7,14 +7,46 @@
 #include <linux/kobject.h>
 #include <linux/init.h>
 #include <linux/sysfs.h>
+#include <linux/mm.h>
+#include <linux/io.h>
+#include <linux/btf.h>
 
 /* See scripts/link-vmlinux.sh, gen_btf() func for details */
 extern char __start_BTF[];
 extern char __stop_BTF[];
 
+static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
+				  const struct bin_attribute *attr,
+				  struct vm_area_struct *vma)
+{
+	unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
+	size_t vm_size = vma->vm_end - vma->vm_start;
+	phys_addr_t addr = virt_to_phys(__start_BTF);
+	unsigned long pfn = addr >> PAGE_SHIFT;
+
+	if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
+		return -EINVAL;
+
+	if (vma->vm_pgoff)
+		return -EINVAL;
+
+	if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
+		return -EACCES;
+
+	if (pfn + pages < pfn)
+		return -EINVAL;
+
+	if ((vm_size >> PAGE_SHIFT) > pages)
+		return -EINVAL;
+
+	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
+	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
+}
+
 static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
 	.attr = { .name = "vmlinux", .mode = 0444, },
 	.read_new = sysfs_bin_attr_simple_read,
+	.mmap = btf_sysfs_vmlinux_mmap,
 };
 
 struct kobject *btf_kobj;

-- 
2.49.0
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Breno Leitao 5 months ago
Hello Lorenz,

On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> --- a/kernel/bpf/sysfs_btf.c
> +++ b/kernel/bpf/sysfs_btf.c

>  extern char __start_BTF[];
>  extern char __stop_BTF[];

> +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> +				  const struct bin_attribute *attr,
> +				  struct vm_area_struct *vma)
> +{
> +	unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> +	size_t vm_size = vma->vm_end - vma->vm_start;
> +	phys_addr_t addr = virt_to_phys(__start_BTF);

I am getting the following warning on arm64, which seems related to this
code here. Line numbers are based on cd031354087d8ae ("Merge branch
'net-mlx5e-add-support-for-pcie-congestion-events'") on the net-next branch:

	[   58.896157] virt_to_phys used for non-linear address: 000000009fea9737 (__start_BTF+0x0/0x685530)
	[   23.988669] WARNING: CPU: 25 PID: 1442 at arch/arm64/mm/physaddr.c:15 __virt_to_phys (arch/arm64/mm/physaddr.c:?)
	[   24.018136] Modules linked in: nvidia_cspmu(E) mlx5_ib(E) ipmi_ssif(E) arm_smmuv3_pmu(E) arm_cspmu_module(E) coresight_trbe(E) ib_uverbs(E) ipmi_devintf(E) ipmi_msghandler(E) coresight_stm(E) coresight_etm4x(E) coresight_tmc(E) coresight_funnel(E) stm_core(E) coresight(E) cppc_cpufreq(E) sch_fq_codel(E) drm(E) backlight(E) drm_panel_orientation_quirks(E) xhci_pci(E) xhci_hcd(E) sm3_ce(E) sha3_ce(E) sha512_ce(E) spi_tegra210_quad(E) acpi_power_meter(E) loop(E) efivarfs(E) autofs4(E)
	[   24.075371] Tainted: [E]=UNSIGNED_MODULE, [N]=TEST
	[   24.080276] Hardware name: Quanta S7GM 20S7GCU0010/S7G MB (CG1), BIOS 3D22 07/03/2024
	[   24.088295] pstate: 63400009 (nZCv daif +PAN -UAO +TCO +DIT -SSBS BTYPE=--)
	[   24.098440] pc : __virt_to_phys (arch/arm64/mm/physaddr.c:?)
	[   24.105398] lr : __virt_to_phys (arch/arm64/mm/physaddr.c:?)
	[   24.112227] sp : ffff8000ba00f8e0
	[   24.115620] x29: ffff8000ba00f8e0 x28: ffff8000ba00faf0 x27: ffff8000ba00fa88
	[   24.122919] x26: ffff8000ba00fa40 x25: ffff800082772000 x24: 0000fffd6db70000
	[   24.130226] x23: 0000000000685530 x22: 0000fffd6e200000 x21: ffff800081cc0000
	[   24.140540] x20: ffff800081be02d8 x19: ffff800081cc0000 x18: 5f5f282037333739
	[   24.150708] x17: 6165663930303030 x16: 0000000000000fc4 x15: 0000000000000003
	[   24.160737] x14: ffff800082923398 x13: 0000000000000003 x12: 0000000000000003
	[   24.168042] x11: 00000000fffeffff x10: ffff800082663784 x9 : cc38fcac5cdabe00
	[   24.175348] x8 : 0001000000000000 x7 : ffff8000813dd878 x6 : 0000000000000000
	[   24.182653] x5 : 0000000000000001 x4 : 0000000000000001 x3 : 0000000000000000
	[   24.189959] x2 : 0000000000000000 x1 : ffff800081a3a6d0 x0 : 0000000000000055
	[   24.197257] Call trace:
	[   24.199761] __virt_to_phys (arch/arm64/mm/physaddr.c:?) (P)
	[   24.206883] btf_sysfs_vmlinux_mmap (kernel/bpf/sysfs_btf.c:27)
	[   24.214264] sysfs_kf_bin_mmap (fs/sysfs/file.c:179)
	[   24.218536] kernfs_fop_mmap (fs/kernfs/file.c:462)
	[   24.222461] mmap_region (./include/linux/fs.h:? mm/internal.h:167 mm/vma.c:2405 mm/vma.c:2467 mm/vma.c:2622 mm/vma.c:2692)


Should __pa_symbol() be used instead of virt_to_phys()?

Thanks
--breno
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Lorenz Bauer 5 months ago
Hi Breno,

Thanks for reaching out.

On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao <leitao@debian.org> wrote:

> Should __pa_symbol() be used instead of virt_to_phys()?

I'm not really well versed with mm in general. Looking around a bit I
found some explanation in [1]. Your suggested fix does make sense to
me based on that.

Let me run the patch against bpf-ci and see what happens.

1: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464395.git.lukas@wunner.de/

Lorenz
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Alexei Starovoitov 5 months ago
On Thu, Jul 17, 2025 at 6:18 AM Lorenz Bauer <lmb@isovalent.com> wrote:
>
> Hi Breno,
>
> Thanks for reaching out.
>
> On Thu, Jul 17, 2025 at 1:39 PM Breno Leitao <leitao@debian.org> wrote:
>
> > Should __pa_symbol() be used instead of virt_to_phys()?
>
> I'm not really well versed with mm in general. Looking around a bit I
> found some explanation in [1]. Your suggested fix does make sense to
> me based on that.
>
> Let me run the patch against bpf-ci and see what happens.
>
> 1: https://lore.kernel.org/all/90667b2b7f773308318261f96ebefd1a67133c4c.1732464395.git.lukas@wunner.de/

Thanks for the link.
Key quote: "arm64 maps the kernel in the vmalloc space."
I think the map shouldn't be destroying linearity of kernel rodata.
__pa_symbol() should work for start_BTF, but would be good
to double check with Ard that the rest stays linear.
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Lorenz Bauer 5 months ago
On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:

> __pa_symbol() should work for start_BTF, but would be good
> to double check with Ard that the rest stays linear.

Alexei,

This code in the arm64 setup does make me think we'll be OK.

kernel_code.start   = __pa_symbol(_stext);
kernel_code.end     = __pa_symbol(__init_begin - 1);
kernel_data.start   = __pa_symbol(_sdata);
kernel_data.end     = __pa_symbol(_end - 1);

Using these as start and end only makes sense to me if the addresses
are linear? See
https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L217

Let me know if you want me to double check with Ard regardless.

Best
Lorenz
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Alexei Starovoitov 5 months ago
On Thu, Jul 17, 2025 at 8:15 AM Lorenz Bauer <lmb@isovalent.com> wrote:
>
> On Thu, Jul 17, 2025 at 3:49 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>
> > __pa_symbol() should work for start_BTF, but would be good
> > to double check with Ard that the rest stays linear.
>
> Alexei,
>
> This code in the arm64 setup does make me think we'll be OK.
>
> kernel_code.start   = __pa_symbol(_stext);
> kernel_code.end     = __pa_symbol(__init_begin - 1);
> kernel_data.start   = __pa_symbol(_sdata);
> kernel_data.end     = __pa_symbol(_end - 1);
>
> Using these as start and end only makes sense to me if the addresses
> are linear? See
> https://elixir.bootlin.com/linux/v6.15.6/source/arch/arm64/kernel/setup.c#L217

Thanks for checking. lgtm.
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Shakeel Butt 6 months, 3 weeks ago
On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> User space needs access to kernel BTF for many modern features of BPF.
> Right now each process needs to read the BTF blob either in pieces or
> as a whole. Allow mmaping the sysfs file so that processes can directly
> access the memory allocated for it in the kernel.
> 
> remap_pfn_range is used instead of vm_insert_page due to aarch64
> compatibility issues.
> 
> Tested-by: Alan Maguire <alan.maguire@oracle.com>
> Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
> ---
>  include/asm-generic/vmlinux.lds.h |  3 ++-
>  kernel/bpf/sysfs_btf.c            | 32 ++++++++++++++++++++++++++++++++
>  2 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
> --- a/include/asm-generic/vmlinux.lds.h
> +++ b/include/asm-generic/vmlinux.lds.h
> @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
>   */
>  #ifdef CONFIG_DEBUG_INFO_BTF
>  #define BTF								\
> +	. = ALIGN(PAGE_SIZE);						\
>  	.BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {				\
>  		BOUNDED_SECTION_BY(.BTF, _BTF)				\
>  	}								\
> -	. = ALIGN(4);							\
> +	. = ALIGN(PAGE_SIZE);						\
>  	.BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {			\
>  		*(.BTF_ids)						\
>  	}
> diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> --- a/kernel/bpf/sysfs_btf.c
> +++ b/kernel/bpf/sysfs_btf.c
> @@ -7,14 +7,46 @@
>  #include <linux/kobject.h>
>  #include <linux/init.h>
>  #include <linux/sysfs.h>
> +#include <linux/mm.h>
> +#include <linux/io.h>
> +#include <linux/btf.h>
>  
>  /* See scripts/link-vmlinux.sh, gen_btf() func for details */
>  extern char __start_BTF[];
>  extern char __stop_BTF[];
>  
> +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> +				  const struct bin_attribute *attr,
> +				  struct vm_area_struct *vma)
> +{
> +	unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> +	size_t vm_size = vma->vm_end - vma->vm_start;
> +	phys_addr_t addr = virt_to_phys(__start_BTF);
> +	unsigned long pfn = addr >> PAGE_SHIFT;
> +
> +	if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))

With vmlinux.lds.h change above, is the page aligned check still needed?

Oh also can the size of btf region be non-page aligned?

> +		return -EINVAL;
> +
> +	if (vma->vm_pgoff)
> +		return -EINVAL;
> +
> +	if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
> +		return -EACCES;
> +
> +	if (pfn + pages < pfn)
> +		return -EINVAL;
> +
> +	if ((vm_size >> PAGE_SHIFT) > pages)
> +		return -EINVAL;
> +
> +	vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);

Is it ok for fork() to keep the mapping in the child? (i.e. do you need
VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you
want you can remove it here.

> +	return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
> +}
> +
>  static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
>  	.attr = { .name = "vmlinux", .mode = 0444, },
>  	.read_new = sysfs_bin_attr_simple_read,
> +	.mmap = btf_sysfs_vmlinux_mmap,
>  };
>  
>  struct kobject *btf_kobj;
> 

Overall this looks good to me, so you can add:

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>
Re: [PATCH bpf-next v5 1/3] btf: allow mmap of vmlinux btf
Posted by Andrii Nakryiko 6 months, 3 weeks ago
On Thu, May 22, 2025 at 4:01 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, May 20, 2025 at 02:01:17PM +0100, Lorenz Bauer wrote:
> > User space needs access to kernel BTF for many modern features of BPF.
> > Right now each process needs to read the BTF blob either in pieces or
> > as a whole. Allow mmaping the sysfs file so that processes can directly
> > access the memory allocated for it in the kernel.
> >
> > remap_pfn_range is used instead of vm_insert_page due to aarch64
> > compatibility issues.
> >
> > Tested-by: Alan Maguire <alan.maguire@oracle.com>
> > Signed-off-by: Lorenz Bauer <lmb@isovalent.com>
> > ---
> >  include/asm-generic/vmlinux.lds.h |  3 ++-
> >  kernel/bpf/sysfs_btf.c            | 32 ++++++++++++++++++++++++++++++++
> >  2 files changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
> > index 58a635a6d5bdf0c53c267c2a3d21a5ed8678ce73..1750390735fac7637cc4d2fa05f96cb2a36aa448 100644
> > --- a/include/asm-generic/vmlinux.lds.h
> > +++ b/include/asm-generic/vmlinux.lds.h
> > @@ -667,10 +667,11 @@ defined(CONFIG_AUTOFDO_CLANG) || defined(CONFIG_PROPELLER_CLANG)
> >   */
> >  #ifdef CONFIG_DEBUG_INFO_BTF
> >  #define BTF                                                          \
> > +     . = ALIGN(PAGE_SIZE);                                           \
> >       .BTF : AT(ADDR(.BTF) - LOAD_OFFSET) {                           \
> >               BOUNDED_SECTION_BY(.BTF, _BTF)                          \
> >       }                                                               \
> > -     . = ALIGN(4);                                                   \
> > +     . = ALIGN(PAGE_SIZE);                                           \
> >       .BTF_ids : AT(ADDR(.BTF_ids) - LOAD_OFFSET) {                   \
> >               *(.BTF_ids)                                             \
> >       }
> > diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
> > index 81d6cf90584a7157929c50f62a5c6862e7a3d081..941d0d2427e3a2d27e8f1cff7b6424d0d41817c1 100644
> > --- a/kernel/bpf/sysfs_btf.c
> > +++ b/kernel/bpf/sysfs_btf.c
> > @@ -7,14 +7,46 @@
> >  #include <linux/kobject.h>
> >  #include <linux/init.h>
> >  #include <linux/sysfs.h>
> > +#include <linux/mm.h>
> > +#include <linux/io.h>
> > +#include <linux/btf.h>
> >
> >  /* See scripts/link-vmlinux.sh, gen_btf() func for details */
> >  extern char __start_BTF[];
> >  extern char __stop_BTF[];
> >
> > +static int btf_sysfs_vmlinux_mmap(struct file *filp, struct kobject *kobj,
> > +                               const struct bin_attribute *attr,
> > +                               struct vm_area_struct *vma)
> > +{
> > +     unsigned long pages = PAGE_ALIGN(attr->size) >> PAGE_SHIFT;
> > +     size_t vm_size = vma->vm_end - vma->vm_start;
> > +     phys_addr_t addr = virt_to_phys(__start_BTF);
> > +     unsigned long pfn = addr >> PAGE_SHIFT;
> > +
> > +     if (attr->private != __start_BTF || !PAGE_ALIGNED(addr))
>
> With vmlinux.lds.h change above, is the page aligned check still needed?
>
> Oh also can the size of btf region be non-page aligned?

I'd probably leave this as a sanity/safety check, just in case someone
modifies linker script and we miss this.

BTF region size isn't page-aligned but in the linker script we
page-align .BTF_ids that follows it, so the padding should be zeroed
out. And Lorenz added a check in the selftest to validate this, so we
should be covered.

>
> > +             return -EINVAL;
> > +
> > +     if (vma->vm_pgoff)
> > +             return -EINVAL;
> > +
> > +     if (vma->vm_flags & (VM_WRITE | VM_EXEC | VM_MAYSHARE))
> > +             return -EACCES;
> > +
> > +     if (pfn + pages < pfn)
> > +             return -EINVAL;
> > +
> > +     if ((vm_size >> PAGE_SHIFT) > pages)
> > +             return -EINVAL;
> > +
> > +     vm_flags_mod(vma, VM_DONTDUMP, VM_MAYEXEC | VM_MAYWRITE);
>
> Is it ok for fork() to keep the mapping in the child? (i.e. do you need
> VM_DONTCOPY). BTW VM_DONTDUMP is added by remap_pfn_range(), so if you
> want you can remove it here.

I think it's good to keep the mapping across fork(), otherwise libbpf might
crash after fork due to BTF data suddenly disappearing.

>
> > +     return remap_pfn_range(vma, vma->vm_start, pfn, vm_size, vma->vm_page_prot);
> > +}
> > +
> >  static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = {
> >       .attr = { .name = "vmlinux", .mode = 0444, },
> >       .read_new = sysfs_bin_attr_simple_read,
> > +     .mmap = btf_sysfs_vmlinux_mmap,
> >  };
> >
> >  struct kobject *btf_kobj;
> >
>
> Overall this looks good to me, so you can add:
>
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

Thanks Shakeel, I've applied the patches to bpf-next!