Some distros have started compiling frame pointers into all their
packages to enable the kernel to do system-wide profiling of user space.
Unfortunately that creates a runtime performance penalty across the
entire system. Using DWARF instead isn't feasible due to the complexity
it would add to the kernel.
For in-kernel unwinding we solved this problem with the creation of the
ORC unwinder for x86_64. Similarly, for user space the GNU assembler
has created the sframe format starting with binutils 2.41 for sframe v2.
Sframe is a simpler version of .eh_frame. It gets placed in the .sframe
section.
Add support for unwinding user space using sframe.
More information about sframe can be found here:
- https://lwn.net/Articles/932209/
- https://lwn.net/Articles/940686/
- https://sourceware.org/binutils/docs/sframe-spec.html
Glibc support is needed to implement the prctl() calls to tell the
kernel where the .sframe segments are.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
arch/Kconfig | 4 +
arch/x86/include/asm/mmu.h | 2 +-
fs/binfmt_elf.c | 35 +++-
include/linux/mm_types.h | 3 +
include/linux/sframe.h | 41 ++++
include/linux/unwind_user.h | 2 +
include/uapi/linux/elf.h | 1 +
include/uapi/linux/prctl.h | 3 +
kernel/fork.c | 10 +
kernel/sys.c | 11 ++
kernel/unwind/Makefile | 3 +-
kernel/unwind/sframe.c | 380 ++++++++++++++++++++++++++++++++++++
kernel/unwind/sframe.h | 215 ++++++++++++++++++++
kernel/unwind/user.c | 24 ++-
mm/init-mm.c | 6 +
15 files changed, 732 insertions(+), 8 deletions(-)
create mode 100644 include/linux/sframe.h
create mode 100644 kernel/unwind/sframe.c
create mode 100644 kernel/unwind/sframe.h
diff --git a/arch/Kconfig b/arch/Kconfig
index ee8ec97ea0ef..e769c39dd221 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -442,6 +442,10 @@ config HAVE_UNWIND_USER_FP
bool
select UNWIND_USER
+config HAVE_UNWIND_USER_SFRAME
+ bool
+ select UNWIND_USER
+
config HAVE_PERF_REGS
bool
help
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index ce4677b8b735..12ea831978cc 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -73,7 +73,7 @@ typedef struct {
.context = { \
.ctx_id = 1, \
.lock = __MUTEX_INITIALIZER(mm.context.lock), \
- }
+ },
void leave_mm(void);
#define leave_mm leave_mm
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06dc4a57ba78..434c548f0837 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <linux/rseq.h>
+#include <linux/sframe.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
unsigned long no_base, struct elf_phdr *interp_elf_phdata,
struct arch_elf_state *arch_state)
{
- struct elf_phdr *eppnt;
+ struct elf_phdr *eppnt, *sframe_phdr = NULL;
unsigned long load_addr = 0;
int load_addr_set = 0;
unsigned long error = ~0UL;
unsigned long total_size;
+ unsigned long start_code = ~0UL;
+ unsigned long end_code = 0;
int i;
/* First of all, some simple consistency checks */
@@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
eppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
- if (eppnt->p_type == PT_LOAD) {
+ switch (eppnt->p_type) {
+ case PT_LOAD: {
int elf_type = MAP_PRIVATE;
int elf_prot = make_prot(eppnt->p_flags, arch_state,
true, true);
@@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
/*
* Check to see if the section's size will overflow the
* allowed task size. Note that p_filesz must always be
- * <= p_memsize so it's only necessary to check p_memsz.
+ * <= p_memsz so it's only necessary to check p_memsz.
*/
k = load_addr + eppnt->p_vaddr;
if (BAD_ADDR(k) ||
@@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
error = -ENOMEM;
goto out;
}
+
+ if ((eppnt->p_flags & PF_X) && k < start_code)
+ start_code = k;
+
+ if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
+ end_code = k + eppnt->p_filesz;
+ break;
+ }
+ case PT_GNU_SFRAME:
+ sframe_phdr = eppnt;
+ break;
}
}
+ if (sframe_phdr)
+ sframe_add_section(load_addr + sframe_phdr->p_vaddr,
+ start_code, end_code);
+
error = load_addr;
out:
return error;
@@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
- struct elf_phdr *elf_property_phdata = NULL;
+ struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
@@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
executable_stack = EXSTACK_DISABLE_X;
break;
+ case PT_GNU_SFRAME:
+ sframe_phdr = elf_ppnt;
+ break;
+
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
bprm->file, false,
@@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
task_pid_nr(current), retval);
}
+ if (sframe_phdr)
+ sframe_add_section(load_bias + sframe_phdr->p_vaddr,
+ start_code, end_code);
+
regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 381d22eba088..6e7561c1a5fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1052,6 +1052,9 @@ struct mm_struct {
#endif
} lru_gen;
#endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+ struct maple_tree sframe_mt;
+#endif
} __randomize_layout;
/*
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
new file mode 100644
index 000000000000..d167e01817c4
--- /dev/null
+++ b/include/linux/sframe.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SFRAME_H
+#define _LINUX_SFRAME_H
+
+#include <linux/mm_types.h>
+
+struct unwind_user_frame;
+
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
+
+extern void sframe_free_mm(struct mm_struct *mm);
+
+/* text_start, text_end, file_name are optional */
+extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
+ unsigned long text_end);
+
+extern int sframe_remove_section(unsigned long sframe_addr);
+extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
+
+static inline bool current_has_sframe(void)
+{
+ struct mm_struct *mm = current->mm;
+
+ return mm && !mtree_empty(&mm->sframe_mt);
+}
+
+#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+static inline void sframe_free_mm(struct mm_struct *mm) {}
+
+static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
+static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
+static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -EINVAL; }
+
+static inline bool current_has_sframe(void) { return false; }
+
+#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#endif /* _LINUX_SFRAME_H */
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
index 9d28db06f33f..cde0fde4923e 100644
--- a/include/linux/unwind_user.h
+++ b/include/linux/unwind_user.h
@@ -5,7 +5,9 @@
#include <linux/types.h>
enum unwind_user_type {
+ UNWIND_USER_TYPE_NONE,
UNWIND_USER_TYPE_FP,
+ UNWIND_USER_TYPE_SFRAME,
};
struct unwind_stacktrace {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b9935988da5c..4dc3f0ca5af5 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -39,6 +39,7 @@ typedef __s64 Elf64_Sxword;
#define PT_GNU_STACK (PT_LOOS + 0x474e551)
#define PT_GNU_RELRO (PT_LOOS + 0x474e552)
#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553)
+#define PT_GNU_SFRAME (PT_LOOS + 0x474e554)
/* ARM MTE memory tag segment type */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..69511077c910 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,7 @@ struct prctl_mm_map {
# define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC 0x10 /* Clear the aspect on exec */
# define PR_PPC_DEXCR_CTRL_MASK 0x1f
+#define PR_ADD_SFRAME 74
+#define PR_REMOVE_SFRAME 75
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index c056ea95fe8c..60f14fbab956 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
#include <linux/rseq.h>
#include <uapi/linux/pidfd.h>
#include <linux/pidfs.h>
+#include <linux/sframe.h>
#include <asm/pgalloc.h>
#include <linux/uaccess.h>
@@ -924,6 +925,7 @@ void __mmdrop(struct mm_struct *mm)
mm_pasid_drop(mm);
mm_destroy_cid(mm);
percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+ sframe_free_mm(mm);
free_mm(mm);
}
@@ -1251,6 +1253,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
#endif
}
+static void mm_init_sframe(struct mm_struct *mm)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+ mt_init(&mm->sframe_mt);
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -1282,6 +1291,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm->pmd_huge_pte = NULL;
#endif
mm_init_uprobes_state(mm);
+ mm_init_sframe(mm);
hugetlb_count_init(mm);
if (current->mm) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..7d05a67727db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -64,6 +64,7 @@
#include <linux/rcupdate.h>
#include <linux/uidgid.h>
#include <linux/cred.h>
+#include <linux/sframe.h>
#include <linux/nospec.h>
@@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_RISCV_SET_ICACHE_FLUSH_CTX:
error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
break;
+ case PR_ADD_SFRAME:
+ if (arg5)
+ return -EINVAL;
+ error = sframe_add_section(arg2, arg3, arg4);
+ break;
+ case PR_REMOVE_SFRAME:
+ if (arg3 || arg4 || arg5)
+ return -EINVAL;
+ error = sframe_remove_section(arg2);
+ break;
default:
error = -EINVAL;
break;
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..f70380d7a6a6 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1,2 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME) += sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..933e47696e29
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt) "sframe: " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/sframe.h>
+#include <linux/unwind_user.h>
+
+#include "sframe.h"
+
+#define SFRAME_FILENAME_LEN 32
+
+struct sframe_section {
+ struct rcu_head rcu;
+
+ unsigned long sframe_addr;
+ unsigned long text_addr;
+
+ unsigned long fdes_addr;
+ unsigned long fres_addr;
+ unsigned int fdes_nr;
+ signed char ra_off;
+ signed char fp_off;
+};
+
+DEFINE_STATIC_SRCU(sframe_srcu);
+
+#define __SFRAME_GET_USER(out, user_ptr, type) \
+({ \
+ type __tmp; \
+ if (get_user(__tmp, (type __user *)user_ptr)) \
+ return -EFAULT; \
+ user_ptr += sizeof(__tmp); \
+ out = __tmp; \
+})
+
+#define SFRAME_GET_USER(out, user_ptr, size) \
+({ \
+ switch (size) { \
+ case 1: \
+ __SFRAME_GET_USER(out, user_ptr, u8); \
+ break; \
+ case 2: \
+ __SFRAME_GET_USER(out, user_ptr, u16); \
+ break; \
+ case 4: \
+ __SFRAME_GET_USER(out, user_ptr, u32); \
+ break; \
+ default: \
+ return -EINVAL; \
+ } \
+})
+
+static unsigned char fre_type_to_size(unsigned char fre_type)
+{
+ if (fre_type > 2)
+ return 0;
+ return 1 << fre_type;
+}
+
+static unsigned char offset_size_enum_to_size(unsigned char off_size)
+{
+ if (off_size > 2)
+ return 0;
+ return 1 << off_size;
+}
+
+static int find_fde(struct sframe_section *sec, unsigned long ip,
+ struct sframe_fde *fde)
+{
+ struct sframe_fde __user *first, *last, *found = NULL;
+ u32 ip_off, func_off_low = 0, func_off_high = -1;
+
+ ip_off = ip - sec->sframe_addr;
+
+ first = (void __user *)sec->fdes_addr;
+ last = first + sec->fdes_nr;
+ while (first <= last) {
+ struct sframe_fde __user *mid;
+ u32 func_off;
+
+ mid = first + ((last - first) / 2);
+
+ if (get_user(func_off, (s32 __user *)mid))
+ return -EFAULT;
+
+ if (ip_off >= func_off) {
+ /* validate sort order */
+ if (func_off < func_off_low)
+ return -EINVAL;
+
+ func_off_low = func_off;
+
+ found = mid;
+ first = mid + 1;
+ } else {
+ /* validate sort order */
+ if (func_off > func_off_high)
+ return -EINVAL;
+
+ func_off_high = func_off;
+
+ last = mid - 1;
+ }
+ }
+
+ if (!found)
+ return -EINVAL;
+
+ if (copy_from_user(fde, found, sizeof(*fde)))
+ return -EFAULT;
+
+ /* check for gaps */
+ if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
+ unsigned long ip, struct unwind_user_frame *frame)
+{
+ unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
+ unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
+ unsigned char offset_count, offset_size;
+ s32 cfa_off, ra_off, fp_off, ip_off;
+ void __user *f, *last_f = NULL;
+ unsigned char addr_size;
+ u32 last_fre_ip_off = 0;
+ u8 fre_info = 0;
+ int i;
+
+ addr_size = fre_type_to_size(fre_type);
+ if (!addr_size)
+ return -EINVAL;
+
+ ip_off = ip - (sec->sframe_addr + fde->start_addr);
+
+ f = (void __user *)sec->fres_addr + fde->fres_off;
+
+ for (i = 0; i < fde->fres_num; i++) {
+ u32 fre_ip_off;
+
+ SFRAME_GET_USER(fre_ip_off, f, addr_size);
+
+ if (fre_ip_off < last_fre_ip_off)
+ return -EINVAL;
+
+ last_fre_ip_off = fre_ip_off;
+
+ if (fde_type == SFRAME_FDE_TYPE_PCINC) {
+ if (ip_off < fre_ip_off)
+ break;
+ } else {
+ /* SFRAME_FDE_TYPE_PCMASK */
+ if (ip_off % fde->rep_size < fre_ip_off)
+ break;
+ }
+
+ SFRAME_GET_USER(fre_info, f, 1);
+
+ offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
+ offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
+
+ if (!offset_count || !offset_size)
+ return -EINVAL;
+
+ last_f = f;
+ f += offset_count * offset_size;
+ }
+
+ if (!last_f)
+ return -EINVAL;
+
+ f = last_f;
+
+ SFRAME_GET_USER(cfa_off, f, offset_size);
+ offset_count--;
+
+ ra_off = sec->ra_off;
+ if (!ra_off) {
+ if (!offset_count--)
+ return -EINVAL;
+
+ SFRAME_GET_USER(ra_off, f, offset_size);
+ }
+
+ fp_off = sec->fp_off;
+ if (!fp_off && offset_count) {
+ offset_count--;
+ SFRAME_GET_USER(fp_off, f, offset_size);
+ }
+
+ if (offset_count)
+ return -EINVAL;
+
+ frame->cfa_off = cfa_off;
+ frame->ra_off = ra_off;
+ frame->fp_off = fp_off;
+ frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
+
+ return 0;
+}
+
+int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
+{
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ struct sframe_fde fde;
+ int ret = -EINVAL;
+
+ if (!mm)
+ return -EINVAL;
+
+ guard(srcu)(&sframe_srcu);
+
+ sec = mtree_load(&mm->sframe_mt, ip);
+ if (!sec)
+ return ret;
+
+ ret = find_fde(sec, ip, &fde);
+ if (ret)
+ return ret;
+
+ ret = find_fre(sec, &fde, ip, frame);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int __sframe_add_section(unsigned long sframe_addr,
+ unsigned long text_start,
+ unsigned long text_end)
+{
+ struct maple_tree *sframe_mt = ¤t->mm->sframe_mt;
+ struct sframe_section *sec;
+ struct sframe_header shdr;
+ unsigned long header_end;
+ int ret;
+
+ if (copy_from_user(&shdr, (void __user *)sframe_addr, sizeof(shdr)))
+ return -EFAULT;
+
+ if (shdr.preamble.magic != SFRAME_MAGIC ||
+ shdr.preamble.version != SFRAME_VERSION_2 ||
+ !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+ shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
+ shdr.fdes_off > shdr.fres_off) {
+ return -EINVAL;
+ }
+
+ sec = kmalloc(sizeof(*sec), GFP_KERNEL);
+ if (!sec)
+ return -ENOMEM;
+
+ header_end = sframe_addr + SFRAME_HDR_SIZE(shdr);
+
+ sec->sframe_addr = sframe_addr;
+ sec->text_addr = text_start;
+ sec->fdes_addr = header_end + shdr.fdes_off;
+ sec->fres_addr = header_end + shdr.fres_off;
+ sec->fdes_nr = shdr.num_fdes;
+ sec->ra_off = shdr.cfa_fixed_ra_offset;
+ sec->fp_off = shdr.cfa_fixed_fp_offset;
+
+ ret = mtree_insert_range(sframe_mt, text_start, text_end, sec, GFP_KERNEL);
+ if (ret) {
+ kfree(sec);
+ return ret;
+ }
+
+ return 0;
+}
+
+int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
+ unsigned long text_end)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *sframe_vma;
+
+ mmap_read_lock(mm);
+
+ sframe_vma = vma_lookup(mm, sframe_addr);
+ if (!sframe_vma)
+ goto err_unlock;
+
+ if (text_start && text_end) {
+ struct vm_area_struct *text_vma;
+
+ text_vma = vma_lookup(mm, text_start);
+ if (!(text_vma->vm_flags & VM_EXEC))
+ goto err_unlock;
+
+ if (PAGE_ALIGN(text_end) != text_vma->vm_end)
+ goto err_unlock;
+ } else {
+ struct vm_area_struct *vma, *text_vma = NULL;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ for_each_vma(vmi, vma) {
+ if (vma->vm_file != sframe_vma->vm_file ||
+ !(vma->vm_flags & VM_EXEC))
+ continue;
+
+ if (text_vma) {
+ pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
+ current->comm, current->pid);
+ goto err_unlock;
+ }
+
+ text_vma = vma;
+ }
+
+ if (!text_vma)
+ goto err_unlock;
+
+ text_start = text_vma->vm_start;
+ text_end = text_vma->vm_end;
+ }
+
+ mmap_read_unlock(mm);
+
+ return __sframe_add_section(sframe_addr, text_start, text_end);
+
+err_unlock:
+ mmap_read_unlock(mm);
+ return -EINVAL;
+}
+
+static void sframe_free_srcu(struct rcu_head *rcu)
+{
+ struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
+
+ kfree(sec);
+}
+
+static int __sframe_remove_section(struct mm_struct *mm,
+ struct sframe_section *sec)
+{
+ sec = mtree_erase(&mm->sframe_mt, sec->text_addr);
+ if (!sec)
+ return -EINVAL;
+
+ call_srcu(&sframe_srcu, &sec->rcu, sframe_free_srcu);
+
+ return 0;
+}
+
+int sframe_remove_section(unsigned long sframe_addr)
+{
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ unsigned long index = 0;
+
+ mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
+ if (sec->sframe_addr == sframe_addr)
+ return __sframe_remove_section(mm, sec);
+ }
+
+ return -EINVAL;
+}
+
+void sframe_free_mm(struct mm_struct *mm)
+{
+ struct sframe_section *sec;
+ unsigned long index = 0;
+
+ if (!mm)
+ return;
+
+ mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
+ kfree(sec);
+
+ mtree_destroy(&mm->sframe_mt);
+}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
new file mode 100644
index 000000000000..11b9be7ad82e
--- /dev/null
+++ b/kernel/unwind/sframe.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023, Oracle and/or its affiliates.
+ *
+ * This file contains definitions for the SFrame stack tracing format, which is
+ * documented at https://sourceware.org/binutils/docs
+ */
+#ifndef _SFRAME_H
+#define _SFRAME_H
+
+#include <linux/types.h>
+
+#define SFRAME_VERSION_1 1
+#define SFRAME_VERSION_2 2
+#define SFRAME_MAGIC 0xdee2
+
+/* Function Descriptor Entries are sorted on PC. */
+#define SFRAME_F_FDE_SORTED 0x1
+/* Frame-pointer based stack tracing. Defined, but not set. */
+#define SFRAME_F_FRAME_POINTER 0x2
+
+#define SFRAME_CFA_FIXED_FP_INVALID 0
+#define SFRAME_CFA_FIXED_RA_INVALID 0
+
+/* Supported ABIs/Arch. */
+#define SFRAME_ABI_AARCH64_ENDIAN_BIG 1 /* AARCH64 big endian. */
+#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE 2 /* AARCH64 little endian. */
+#define SFRAME_ABI_AMD64_ENDIAN_LITTLE 3 /* AMD64 little endian. */
+
+/* SFrame FRE types. */
+#define SFRAME_FRE_TYPE_ADDR1 0
+#define SFRAME_FRE_TYPE_ADDR2 1
+#define SFRAME_FRE_TYPE_ADDR4 2
+
+/*
+ * SFrame Function Descriptor Entry types.
+ *
+ * The SFrame format has two possible representations for functions. The
+ * choice of which type to use is made according to the instruction patterns
+ * in the relevant program stub.
+ */
+
+/* Unwinders perform a (PC >= FRE_START_ADDR) to look up a matching FRE. */
+#define SFRAME_FDE_TYPE_PCINC 0
+/*
+ * Unwinders perform a (PC & FRE_START_ADDR_AS_MASK >= FRE_START_ADDR_AS_MASK)
+ * to look up a matching FRE. Typical usecases are pltN entries, trampolines
+ * etc.
+ */
+#define SFRAME_FDE_TYPE_PCMASK 1
+
+/**
+ * struct sframe_preamble - SFrame Preamble.
+ * @magic: Magic number (SFRAME_MAGIC).
+ * @version: Format version number (SFRAME_VERSION).
+ * @flags: Various flags.
+ */
+struct sframe_preamble {
+ u16 magic;
+ u8 version;
+ u8 flags;
+} __packed;
+
+/**
+ * struct sframe_header - SFrame Header.
+ * @preamble: SFrame preamble.
+ * @abi_arch: Identify the arch (including endianness) and ABI.
+ * @cfa_fixed_fp_offset: Offset for the Frame Pointer (FP) from CFA may be
+ * fixed for some ABIs ((e.g, in AMD64 when -fno-omit-frame-pointer is
+ * used). When fixed, this field specifies the fixed stack frame offset
+ * and the individual FREs do not need to track it. When not fixed, it
+ * is set to SFRAME_CFA_FIXED_FP_INVALID, and the individual FREs may
+ * provide the applicable stack frame offset, if any.
+ * @cfa_fixed_ra_offset: Offset for the Return Address from CFA is fixed for
+ * some ABIs. When fixed, this field specifies the fixed stack frame
+ * offset and the individual FREs do not need to track it. When not
+ * fixed, it is set to SFRAME_CFA_FIXED_FP_INVALID.
+ * @auxhdr_len: Number of bytes making up the auxiliary header, if any.
+ * Some ABI/arch, in the future, may use this space for extending the
+ * information in SFrame header. Auxiliary header is contained in bytes
+ * sequentially following the sframe_header.
+ * @num_fdes: Number of SFrame FDEs in this SFrame section.
+ * @num_fres: Number of SFrame Frame Row Entries.
+ * @fre_len: Number of bytes in the SFrame Frame Row Entry section.
+ * @fdes_off: Offset of SFrame Function Descriptor Entry section.
+ * @fres_off: Offset of SFrame Frame Row Entry section.
+ */
+struct sframe_header {
+ struct sframe_preamble preamble;
+ u8 abi_arch;
+ s8 cfa_fixed_fp_offset;
+ s8 cfa_fixed_ra_offset;
+ u8 auxhdr_len;
+ u32 num_fdes;
+ u32 num_fres;
+ u32 fre_len;
+ u32 fdes_off;
+ u32 fres_off;
+} __packed;
+
+#define SFRAME_HDR_SIZE(sframe_hdr) \
+ ((sizeof(struct sframe_header) + (sframe_hdr).auxhdr_len))
+
+/* Two possible keys for executable (instruction) pointers signing. */
+#define SFRAME_AARCH64_PAUTH_KEY_A 0 /* Key A. */
+#define SFRAME_AARCH64_PAUTH_KEY_B 1 /* Key B. */
+
+/**
+ * struct sframe_fde - SFrame Function Descriptor Entry.
+ * @start_addr: Function start address. Encoded as a signed offset,
+ * relative to the current FDE.
+ * @size: Size of the function in bytes.
+ * @fres_off: Offset of the first SFrame Frame Row Entry of the function,
+ * relative to the beginning of the SFrame Frame Row Entry sub-section.
+ * @fres_num: Number of frame row entries for the function.
+ * @info: Additional information for deciphering the stack trace
+ * information for the function. Contains information about SFrame FRE
+ * type, SFrame FDE type, PAC authorization A/B key, etc.
+ * @rep_size: Block size for SFRAME_FDE_TYPE_PCMASK
+ * @padding: Unused
+ */
+struct sframe_fde {
+ s32 start_addr;
+ u32 size;
+ u32 fres_off;
+ u32 fres_num;
+ u8 info;
+ u8 rep_size;
+ u16 padding;
+} __packed;
+
+/*
+ * 'func_info' in SFrame FDE contains additional information for deciphering
+ * the stack trace information for the function. In V1, the information is
+ * organized as follows:
+ * - 4-bits: Identify the FRE type used for the function.
+ * - 1-bit: Identify the FDE type of the function - mask or inc.
+ * - 1-bit: PAC authorization A/B key (aarch64).
+ * - 2-bits: Unused.
+ * ---------------------------------------------------------------------
+ * | Unused | PAC auth A/B key (aarch64) | FDE type | FRE type |
+ * | | Unused (amd64) | | |
+ * ---------------------------------------------------------------------
+ * 8 6 5 4 0
+ */
+
+/* Note: Set PAC auth key to SFRAME_AARCH64_PAUTH_KEY_A by default. */
+#define SFRAME_FUNC_INFO(fde_type, fre_enc_type) \
+ (((SFRAME_AARCH64_PAUTH_KEY_A & 0x1) << 5) | \
+ (((fde_type) & 0x1) << 4) | ((fre_enc_type) & 0xf))
+
+#define SFRAME_FUNC_FRE_TYPE(data) ((data) & 0xf)
+#define SFRAME_FUNC_FDE_TYPE(data) (((data) >> 4) & 0x1)
+#define SFRAME_FUNC_PAUTH_KEY(data) (((data) >> 5) & 0x1)
+
+/*
+ * Size of stack frame offsets in an SFrame Frame Row Entry. A single
+ * SFrame FRE has all offsets of the same size. Offset size may vary
+ * across frame row entries.
+ */
+#define SFRAME_FRE_OFFSET_1B 0
+#define SFRAME_FRE_OFFSET_2B 1
+#define SFRAME_FRE_OFFSET_4B 2
+
+/* An SFrame Frame Row Entry can be SP or FP based. */
+#define SFRAME_BASE_REG_FP 0
+#define SFRAME_BASE_REG_SP 1
+
+/*
+ * The index at which a specific offset is presented in the variable length
+ * bytes of an FRE.
+ */
+#define SFRAME_FRE_CFA_OFFSET_IDX 0
+/*
+ * The RA stack offset, if present, will always be at index 1 in the variable
+ * length bytes of the FRE.
+ */
+#define SFRAME_FRE_RA_OFFSET_IDX 1
+/*
+ * The FP stack offset may appear at offset 1 or 2, depending on the ABI as RA
+ * may or may not be tracked.
+ */
+#define SFRAME_FRE_FP_OFFSET_IDX 2
+
+/*
+ * 'fre_info' in SFrame FRE contains information about:
+ * - 1 bit: base reg for CFA
+ * - 4 bits: Number of offsets (N). A value of up to 3 is allowed to track
+ * all three of CFA, FP and RA (fixed implicit order).
+ * - 2 bits: information about size of the offsets (S) in bytes.
+ * Valid values are SFRAME_FRE_OFFSET_1B, SFRAME_FRE_OFFSET_2B,
+ * SFRAME_FRE_OFFSET_4B
+ * - 1 bit: Mangled RA state bit (aarch64 only).
+ * ---------------------------------------------------------------
+ * | Mangled-RA (aarch64) | Size of | Number of | base_reg |
+ * | Unused (amd64) | offsets | offsets | |
+ * ---------------------------------------------------------------
+ * 8 7 5 1 0
+ */
+
+/* Note: Set mangled_ra_p to zero by default. */
+#define SFRAME_FRE_INFO(base_reg_id, offset_num, offset_size) \
+ (((0 & 0x1) << 7) | (((offset_size) & 0x3) << 5) | \
+ (((offset_num) & 0xf) << 1) | ((base_reg_id) & 0x1))
+
+/* Set the mangled_ra_p bit as indicated. */
+#define SFRAME_FRE_INFO_UPDATE_MANGLED_RA_P(mangled_ra_p, fre_info) \
+ ((((mangled_ra_p) & 0x1) << 7) | ((fre_info) & 0x7f))
+
+#define SFRAME_FRE_CFA_BASE_REG_ID(data) ((data) & 0x1)
+#define SFRAME_FRE_OFFSET_COUNT(data) (((data) >> 1) & 0xf)
+#define SFRAME_FRE_OFFSET_SIZE(data) (((data) >> 5) & 0x3)
+#define SFRAME_FRE_MANGLED_RA_P(data) (((data) >> 7) & 0x1)
+
+#endif /* _SFRAME_H */
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 54b989810a0e..8e47c80e3e54 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,12 +8,17 @@
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/unwind_user.h>
+#include <linux/sframe.h>
#include <linux/uaccess.h>
-#include <asm/unwind_user.h>
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+#include <asm/unwind_user.h>
static struct unwind_user_frame fp_frame = {
ARCH_INIT_USER_FP_FRAME
};
+#else
+static struct unwind_user_frame fp_frame;
+#endif
int unwind_user_next(struct unwind_user_state *state)
{
@@ -30,6 +35,16 @@ int unwind_user_next(struct unwind_user_state *state)
case UNWIND_USER_TYPE_FP:
frame = &fp_frame;
break;
+ case UNWIND_USER_TYPE_SFRAME:
+ if (sframe_find(state->ip, frame)) {
+ if (!IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+ goto the_end;
+
+ frame = &fp_frame;
+ }
+ break;
+ case UNWIND_USER_TYPE_NONE:
+ goto the_end;
default:
BUG();
}
@@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state)
return -EINVAL;
}
- state->type = UNWIND_USER_TYPE_FP;
+ if (current_has_sframe())
+ state->type = UNWIND_USER_TYPE_SFRAME;
+ else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))
+ state->type = UNWIND_USER_TYPE_FP;
+ else
+ state->type = UNWIND_USER_TYPE_NONE;
state->sp = user_stack_pointer(regs);
state->ip = instruction_pointer(regs);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..8eb1b122b7bf 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -11,12 +11,17 @@
#include <linux/atomic.h>
#include <linux/user_namespace.h>
#include <linux/iommu.h>
+#include <linux/sframe.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
#define INIT_MM_CONTEXT(name)
#endif
+#ifndef INIT_MM_SFRAME
+#define INIT_MM_SFRAME
+#endif
+
const struct vm_operations_struct vma_dummy_vm_ops;
/*
@@ -45,6 +50,7 @@ struct mm_struct init_mm = {
.user_ns = &init_user_ns,
.cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
+ INIT_MM_SFRAME
};
void setup_initial_init_mm(void *start_code, void *end_code,
--
2.47.0
On 28.10.2024 22:47, Josh Poimboeuf wrote: > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c > @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state) > return -EINVAL; > } > > - state->type = UNWIND_USER_TYPE_FP; > + if (current_has_sframe()) > + state->type = UNWIND_USER_TYPE_SFRAME; > + else if (IS_ENABLED(CONFIG_UNWIND_USER_FP)) The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-) > + state->type = UNWIND_USER_TYPE_FP; > + else > + state->type = UNWIND_USER_TYPE_NONE; > > state->sp = user_stack_pointer(regs); > state->ip = instruction_pointer(regs); Regards, Jens -- Jens Remus Linux on Z Development (D3303) and z/VSE Support +49-7031-16-1128 Office jremus@de.ibm.com IBM IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294 IBM Data Privacy Statement: https://www.ibm.com/privacy/
On Wed, 13 Nov 2024 16:56:25 +0100 Jens Remus <jremus@linux.ibm.com> wrote: > On 28.10.2024 22:47, Josh Poimboeuf wrote: > > > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c > > > @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state) > > return -EINVAL; > > } > > > > - state->type = UNWIND_USER_TYPE_FP; > > + if (current_has_sframe()) > > + state->type = UNWIND_USER_TYPE_SFRAME; > > + else if (IS_ENABLED(CONFIG_UNWIND_USER_FP)) > > The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-) Yep, that too. Thanks, -- Steve > > > + state->type = UNWIND_USER_TYPE_FP; > > + else > > + state->type = UNWIND_USER_TYPE_NONE; > > > > state->sp = user_stack_pointer(regs); > > state->ip = instruction_pointer(regs); > > Regards, > Jens
On Wed, Nov 13, 2024 at 03:50:58PM -0500, Steven Rostedt wrote: > On Wed, 13 Nov 2024 16:56:25 +0100 > Jens Remus <jremus@linux.ibm.com> wrote: > > > On 28.10.2024 22:47, Josh Poimboeuf wrote: > > > > > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c > > > > > @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state) > > > return -EINVAL; > > > } > > > > > > - state->type = UNWIND_USER_TYPE_FP; > > > + if (current_has_sframe()) > > > + state->type = UNWIND_USER_TYPE_SFRAME; > > > + else if (IS_ENABLED(CONFIG_UNWIND_USER_FP)) > > > > The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-) > > Yep, that too. I also found this one, so that makes three of us! It's too bad IS_ENABLED() doesn't catch typos. -- Josh
On 28.10.2024 22:47, Josh Poimboeuf wrote: ... > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c ... > +static int find_fde(struct sframe_section *sec, unsigned long ip, > + struct sframe_fde *fde) > +{ > + struct sframe_fde __user *first, *last, *found = NULL; > + u32 ip_off, func_off_low = 0, func_off_high = -1; > + > + ip_off = ip - sec->sframe_addr; > + > + first = (void __user *)sec->fdes_addr; > + last = first + sec->fdes_nr; Could it be that this needs to be: last = first + sec->fdes_nr - 1; > + while (first <= last) { > + struct sframe_fde __user *mid; > + u32 func_off; > + > + mid = first + ((last - first) / 2); > + > + if (get_user(func_off, (s32 __user *)mid)) > + return -EFAULT; > + > + if (ip_off >= func_off) { > + /* validate sort order */ > + if (func_off < func_off_low) > + return -EINVAL; Otherwise I run into this when the IP is within the function whose FDE is the last one in the .sframe section: find_fde: IP=0x000000000110fbcc: ERROR: func_off < func_off_low (func_off=196608, func_off_low=4294224904) 110fbcc dump_sframe+0x2ec (/opt/binutils-sframe2/bin/objdump) func idx [2275]: pc = 0x110f8e0, size = 3310 bytes <dump_sframe> STARTPC CFA FP RA INFO 000000000110f8e0 sp+160 u u (1*1B) 000000000110f8e6 sp+160 c-72 c-48 (3*1B) 000000000110f8f6 sp+632 c-72 c-48 (3*1B) 000000000110fa82 sp+160 u u (1*1B) 000000000110fa88 sp+632 c-72 c-48 (3*1B) 0000000001110486 sp+160 u u (1*1B) 000000000111048c sp+632 c-72 c-48 (3*1B) 0000000001110574 sp+160 u u (1*1B) 000000000111057a sp+632 c-72 c-48 (3*1B) > + > + func_off_low = func_off; > + > + found = mid; > + first = mid + 1; > + } else { > + /* validate sort order */ > + if (func_off > func_off_high) > + return -EINVAL; > + > + func_off_high = func_off; > + > + last = mid - 1; > + } > + } > + > + if (!found) > + return -EINVAL; > + > + if (copy_from_user(fde, found, sizeof(*fde))) > + return -EFAULT; > + > + /* check for gaps */ > + if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size) > + return -EINVAL; > + > + return 0; > +} Thanks and regards, Jens -- Jens Remus Linux on Z Development (D3303) and z/VSE Support +49-7031-16-1128 Office jremus@de.ibm.com IBM IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294 IBM Data Privacy Statement: https://www.ibm.com/privacy/
On Thu, 7 Nov 2024 17:59:08 +0100 Jens Remus <jremus@linux.ibm.com> wrote: > On 28.10.2024 22:47, Josh Poimboeuf wrote: > ... > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > ... > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > + struct sframe_fde *fde) > > +{ > > + struct sframe_fde __user *first, *last, *found = NULL; > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > + > > + ip_off = ip - sec->sframe_addr; > > + > > + first = (void __user *)sec->fdes_addr; > > + last = first + sec->fdes_nr; > > Could it be that this needs to be: > > last = first + sec->fdes_nr - 1; Yep, I discovered the same issue. -- Steve > > > + while (first <= last) { > > + struct sframe_fde __user *mid; > > + u32 func_off; > > +
On Wed, Nov 13, 2024 at 03:50:40PM -0500, Steven Rostedt wrote: > On Thu, 7 Nov 2024 17:59:08 +0100 > Jens Remus <jremus@linux.ibm.com> wrote: > > > On 28.10.2024 22:47, Josh Poimboeuf wrote: > > ... > > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > > ... > > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > > + struct sframe_fde *fde) > > > +{ > > > + struct sframe_fde __user *first, *last, *found = NULL; > > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > > + > > > + ip_off = ip - sec->sframe_addr; > > > + > > > + first = (void __user *)sec->fdes_addr; > > > + last = first + sec->fdes_nr; > > > > Could it be that this needs to be: > > > > last = first + sec->fdes_nr - 1; > > Yep, I discovered the same issue. Indeed, thanks. -- Josh
On Wed, 13 Nov 2024 13:15:35 -0800 Josh Poimboeuf <jpoimboe@kernel.org> wrote: > On Wed, Nov 13, 2024 at 03:50:40PM -0500, Steven Rostedt wrote: > > On Thu, 7 Nov 2024 17:59:08 +0100 > > Jens Remus <jremus@linux.ibm.com> wrote: > > > > > On 28.10.2024 22:47, Josh Poimboeuf wrote: > > > ... > > > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > > > ... > > > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > > > + struct sframe_fde *fde) > > > > +{ > > > > + struct sframe_fde __user *first, *last, *found = NULL; > > > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > > > + > > > > + ip_off = ip - sec->sframe_addr; > > > > + > > > > + first = (void __user *)sec->fdes_addr; > > > > + last = first + sec->fdes_nr; > > > > > > Could it be that this needs to be: > > > > > > last = first + sec->fdes_nr - 1; > > > > Yep, I discovered the same issue. > > Indeed, thanks. > BTW, the following changes were needed to make it work for me: -- Steve diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 434c548f0837..64cc3c1188ca 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm) int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; - struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL; + struct elf_phdr *elf_property_phdata = NULL; + unsigned long sframe_vaddr = 0; unsigned long elf_brk; int retval, i; unsigned long elf_entry; @@ -951,7 +952,7 @@ static int load_elf_binary(struct linux_binprm *bprm) break; case PT_GNU_SFRAME: - sframe_phdr = elf_ppnt; + sframe_vaddr = elf_ppnt->p_vaddr; break; case PT_LOPROC ... PT_HIPROC: @@ -1344,8 +1345,8 @@ static int load_elf_binary(struct linux_binprm *bprm) task_pid_nr(current), retval); } - if (sframe_phdr) - sframe_add_section(load_bias + sframe_phdr->p_vaddr, + if (sframe_vaddr) + sframe_add_section(load_bias + sframe_vaddr, start_code, end_code); regs = current_pt_regs(); diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c index 933e47696e29..ca4ef0b72772 100644 --- a/kernel/unwind/sframe.c +++ b/kernel/unwind/sframe.c @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip, struct sframe_fde *fde) { struct sframe_fde __user *first, *last, *found = NULL; - u32 ip_off, func_off_low = 0, func_off_high = -1; + s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX; ip_off = ip - sec->sframe_addr; first = (void __user *)sec->fdes_addr; - last = first + sec->fdes_nr; + last = first + sec->fdes_nr - 1; while (first <= last) { struct sframe_fde __user *mid; - u32 func_off; + s32 func_off; mid = first + ((last - first) / 2); diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c index 11aadfade005..d9cd820150c5 100644 --- a/kernel/unwind/user.c +++ b/kernel/unwind/user.c @@ -97,7 +97,7 @@ int unwind_user_start(struct unwind_user_state *state) if (current_has_sframe()) state->type = UNWIND_USER_TYPE_SFRAME; - else if (IS_ENABLED(CONFIG_UNWIND_USER_FP)) + else if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP)) state->type = UNWIND_USER_TYPE_FP; else state->type = UNWIND_USER_TYPE_NONE; @@ -138,7 +138,7 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries) static u64 ctx_to_cookie(u64 cpu, u64 ctx) { BUILD_BUG_ON(NR_CPUS > 65535); - return (ctx & ((1UL << 48) - 1)) | cpu; + return (ctx & ((1UL << 48) - 1)) | (cpu << 48); } /*
On 13.11.2024 23:13, Steven Rostedt wrote: > On Wed, 13 Nov 2024 13:15:35 -0800 > BTW, the following changes were needed to make it work for me: > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > index 933e47696e29..ca4ef0b72772 100644 > --- a/kernel/unwind/sframe.c > +++ b/kernel/unwind/sframe.c > @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip, > struct sframe_fde *fde) > { > struct sframe_fde __user *first, *last, *found = NULL; > - u32 ip_off, func_off_low = 0, func_off_high = -1; > + s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX; Coincidentally I was experimenting with exactly the same changes, except that I used S32_MIN and S32_MAX. Any preference? > > ip_off = ip - sec->sframe_addr; > > first = (void __user *)sec->fdes_addr; > - last = first + sec->fdes_nr; > + last = first + sec->fdes_nr - 1; > while (first <= last) { > struct sframe_fde __user *mid; > - u32 func_off; > + s32 func_off; > > mid = first + ((last - first) / 2); > Thanks and regards, Jens -- Jens Remus Linux on Z Development (D3303) and z/VSE Support +49-7031-16-1128 Office jremus@de.ibm.com IBM IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294 IBM Data Privacy Statement: https://www.ibm.com/privacy/
[ This reply fixes the linux-trace-kernel email :-p ] On Wed, 13 Nov 2024 17:13:26 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > BTW, the following changes were needed to make it work for me: > > -- Steve > > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c > index 434c548f0837..64cc3c1188ca 100644 > --- a/fs/binfmt_elf.c > +++ b/fs/binfmt_elf.c > @@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm) > int first_pt_load = 1; > unsigned long error; > struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; > - struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL; > + struct elf_phdr *elf_property_phdata = NULL; > + unsigned long sframe_vaddr = 0; Could not just save the pointer to the sframe phd, as it gets freed before we need it. > unsigned long elf_brk; > int retval, i; > unsigned long elf_entry; > @@ -951,7 +952,7 @@ static int load_elf_binary(struct linux_binprm *bprm) > break; > > case PT_GNU_SFRAME: > - sframe_phdr = elf_ppnt; > + sframe_vaddr = elf_ppnt->p_vaddr; > break; > > case PT_LOPROC ... PT_HIPROC: > @@ -1344,8 +1345,8 @@ static int load_elf_binary(struct linux_binprm *bprm) > task_pid_nr(current), retval); > } > > - if (sframe_phdr) > - sframe_add_section(load_bias + sframe_phdr->p_vaddr, > + if (sframe_vaddr) > + sframe_add_section(load_bias + sframe_vaddr, > start_code, end_code); > > regs = current_pt_regs(); > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > index 933e47696e29..ca4ef0b72772 100644 > --- a/kernel/unwind/sframe.c > +++ b/kernel/unwind/sframe.c > @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip, > struct sframe_fde *fde) > { > struct sframe_fde __user *first, *last, *found = NULL; > - u32 ip_off, func_off_low = 0, func_off_high = -1; > + s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX; The ip_off is a signed it. I wrote a program to dump out the sframe section of files, and I had: ffffed88: (1020) size: 16 off: 146 num: 2 info: 1 rep:16 ffffed98: (1030) size: 336 off: 154 num: 2 info:17 rep:16 ffffefe1: (1279) size: 113 off: 0 num: 4 info: 0 rep: 0 fffff052: (12ea) size: 54 off: 15 num: 3 info: 0 rep: 0 fffff088: (1320) size: 167 off: 26 num: 3 info: 0 rep: 0 fffff12f: (13c7) size: 167 off: 37 num: 4 info: 0 rep: 0 fffff1d6: (146e) size: 167 off: 52 num: 4 info: 0 rep: 0 fffff27d: (1515) size: 22 off: 67 num: 4 info: 0 rep: 0 fffff293: (152b) size: 141 off: 82 num: 4 info: 0 rep: 0 fffff320: (15b8) size: 81 off: 97 num: 4 info: 0 rep: 0 fffff371: (1609) size: 671 off: 112 num: 4 info: 1 rep: 0 fffff610: (18a8) size: 171 off: 131 num: 4 info: 0 rep: 0 The above turns was created by a loop of: fde = (void *)sframes + sizeof(*sframes) + sframes->sfh_fdeoff; for (s = 0; s < sframes->sfh_num_fdes; s++, fde++) { printf("\t%x: (%lx) size:%8u off:%8u num:%8u info:%2u rep:%2u\n", fde->sfde_func_start_address, fde->sfde_func_start_address + shdr->sh_offset, fde->sfde_func_size, fde->sfde_func_start_fre_off, fde->sfde_func_num_fres, fde->sfde_func_info, fde->sfde_func_rep_size); } As you can see, all the ip_off are negative. > > ip_off = ip - sec->sframe_addr; > > first = (void __user *)sec->fdes_addr; > - last = first + sec->fdes_nr; > + last = first + sec->fdes_nr - 1; The above was mentioned before. > while (first <= last) { > struct sframe_fde __user *mid; > - u32 func_off; > + s32 func_off; > > mid = first + ((last - first) / 2); > > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c > index 11aadfade005..d9cd820150c5 100644 > --- a/kernel/unwind/user.c > +++ b/kernel/unwind/user.c > @@ -97,7 +97,7 @@ int unwind_user_start(struct unwind_user_state *state) > > if (current_has_sframe()) > state->type = UNWIND_USER_TYPE_SFRAME; > - else if (IS_ENABLED(CONFIG_UNWIND_USER_FP)) > + else if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP)) This was mentioned too. > state->type = UNWIND_USER_TYPE_FP; > else > state->type = UNWIND_USER_TYPE_NONE; > @@ -138,7 +138,7 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries) > static u64 ctx_to_cookie(u64 cpu, u64 ctx) > { > BUILD_BUG_ON(NR_CPUS > 65535); > - return (ctx & ((1UL << 48) - 1)) | cpu; > + return (ctx & ((1UL << 48) - 1)) | (cpu << 48); And so was this. -- Steve > } > > /*
On Wed, 13 Nov 2024 17:21:18 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c > > index 434c548f0837..64cc3c1188ca 100644 > > --- a/fs/binfmt_elf.c > > +++ b/fs/binfmt_elf.c > > @@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm) > > int first_pt_load = 1; > > unsigned long error; > > struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; > > - struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL; > > + struct elf_phdr *elf_property_phdata = NULL; > > + unsigned long sframe_vaddr = 0; > > Could not just save the pointer to the sframe phd, as it gets freed before we need it. ^^^ phdr > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c > > index 933e47696e29..ca4ef0b72772 100644 > > --- a/kernel/unwind/sframe.c > > +++ b/kernel/unwind/sframe.c > > @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip, > > struct sframe_fde *fde) > > { > > struct sframe_fde __user *first, *last, *found = NULL; > > - u32 ip_off, func_off_low = 0, func_off_high = -1; > > + s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX; > > The ip_off is a signed it. I wrote a program to dump out the sframe section ^^ int > of files, and I had: > > ffffed88: (1020) size: 16 off: 146 num: 2 info: 1 rep:16 > ffffed98: (1030) size: 336 off: 154 num: 2 info:17 rep:16 > ffffefe1: (1279) size: 113 off: 0 num: 4 info: 0 rep: 0 > fffff052: (12ea) size: 54 off: 15 num: 3 info: 0 rep: 0 > fffff088: (1320) size: 167 off: 26 num: 3 info: 0 rep: 0 > fffff12f: (13c7) size: 167 off: 37 num: 4 info: 0 rep: 0 > fffff1d6: (146e) size: 167 off: 52 num: 4 info: 0 rep: 0 > fffff27d: (1515) size: 22 off: 67 num: 4 info: 0 rep: 0 > fffff293: (152b) size: 141 off: 82 num: 4 info: 0 rep: 0 > fffff320: (15b8) size: 81 off: 97 num: 4 info: 0 rep: 0 > fffff371: (1609) size: 671 off: 112 num: 4 info: 1 rep: 0 > fffff610: (18a8) size: 171 off: 131 num: 4 info: 0 rep: 0 > > The above turns was created by a loop of: ^^^^^^^^^ items were No idea why I typed that :-p I can't blame jetlag anymore. -- Steve
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c ... > +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde, > + unsigned long ip, struct unwind_user_frame *frame) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > + unsigned char offset_count, offset_size; > + s32 cfa_off, ra_off, fp_off, ip_off; > + void __user *f, *last_f = NULL; > + unsigned char addr_size; > + u32 last_fre_ip_off = 0; > + u8 fre_info = 0; > + int i; > + > + addr_size = fre_type_to_size(fre_type); > + if (!addr_size) > + return -EINVAL; > + > + ip_off = ip - (sec->sframe_addr + fde->start_addr); nit: Since we already know wether the ip_off should mask or not, I think we don't have to check fde_type and mask the ip_off everytime. ip_off = (fde_type == SFRAME_FDE_TYPE_PCINC) ? ip_off : ip_off % fde->rep_size; > + > + f = (void __user *)sec->fres_addr + fde->fres_off; > + > + for (i = 0; i < fde->fres_num; i++) { > + u32 fre_ip_off; > + > + SFRAME_GET_USER(fre_ip_off, f, addr_size); > + > + if (fre_ip_off < last_fre_ip_off) > + return -EINVAL; > + > + last_fre_ip_off = fre_ip_off; > + > + if (fde_type == SFRAME_FDE_TYPE_PCINC) { > + if (ip_off < fre_ip_off) > + break; > + } else { > + /* SFRAME_FDE_TYPE_PCMASK */ > + if (ip_off % fde->rep_size < fre_ip_off) > + break; > + } > + > + SFRAME_GET_USER(fre_info, f, 1); > + > + offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info); > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info)); > + > + if (!offset_count || !offset_size) > + return -EINVAL; > + > + last_f = f; > + f += offset_count * offset_size; > + } > + > + if (!last_f) > + return -EINVAL; > + > + f = last_f; > + > + SFRAME_GET_USER(cfa_off, f, offset_size); > + offset_count--; > + > + ra_off = sec->ra_off; > + if (!ra_off) { > + if (!offset_count--) > + return -EINVAL; > + > + SFRAME_GET_USER(ra_off, f, offset_size); > + } > + > + fp_off = sec->fp_off; > + if (!fp_off && offset_count) { > + offset_count--; > + SFRAME_GET_USER(fp_off, f, offset_size); > + } > + > + if (offset_count) > + return -EINVAL; > + > + frame->cfa_off = cfa_off; > + frame->ra_off = ra_off; > + frame->fp_off = fp_off; > + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP; > + > + return 0; > +}
On 28.10.2024 22:47, Josh Poimboeuf wrote: ... > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c ... > +#define __SFRAME_GET_USER(out, user_ptr, type) \ > +({ \ > + type __tmp; \ > + if (get_user(__tmp, (type __user *)user_ptr)) \ > + return -EFAULT; \ > + user_ptr += sizeof(__tmp); \ > + out = __tmp; \ > +}) > + > +#define SFRAME_GET_USER(out, user_ptr, size) \ > +({ \ > + switch (size) { \ > + case 1: \ > + __SFRAME_GET_USER(out, user_ptr, u8); \ > + break; \ > + case 2: \ > + __SFRAME_GET_USER(out, user_ptr, u16); \ > + break; \ > + case 4: \ > + __SFRAME_GET_USER(out, user_ptr, u32); \ > + break; \ > + default: \ > + return -EINVAL; \ > + } \ > +}) > + > +static unsigned char fre_type_to_size(unsigned char fre_type) > +{ > + if (fre_type > 2) > + return 0; > + return 1 << fre_type; > +} > + > +static unsigned char offset_size_enum_to_size(unsigned char off_size) > +{ > + if (off_size > 2) > + return 0; > + return 1 << off_size; > +} ... > +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde, > + unsigned long ip, struct unwind_user_frame *frame) > +{ > + unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info); > + unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info); > + unsigned char offset_count, offset_size; > + s32 cfa_off, ra_off, fp_off, ip_off; > + void __user *f, *last_f = NULL; > + unsigned char addr_size; > + u32 last_fre_ip_off = 0; > + u8 fre_info = 0; > + int i; > + > + addr_size = fre_type_to_size(fre_type); > + if (!addr_size) > + return -EINVAL; > + > + ip_off = ip - (sec->sframe_addr + fde->start_addr); > + > + f = (void __user *)sec->fres_addr + fde->fres_off; > + > + for (i = 0; i < fde->fres_num; i++) { > + u32 fre_ip_off; > + > + SFRAME_GET_USER(fre_ip_off, f, addr_size); > + > + if (fre_ip_off < last_fre_ip_off) > + return -EINVAL; > + > + last_fre_ip_off = fre_ip_off; > + > + if (fde_type == SFRAME_FDE_TYPE_PCINC) { > + if (ip_off < fre_ip_off) > + break; > + } else { > + /* SFRAME_FDE_TYPE_PCMASK */ > + if (ip_off % fde->rep_size < fre_ip_off) > + break; > + } > + > + SFRAME_GET_USER(fre_info, f, 1); > + > + offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info); > + offset_size = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info)); > + > + if (!offset_count || !offset_size) > + return -EINVAL; > + > + last_f = f; > + f += offset_count * offset_size; > + } > + > + if (!last_f) > + return -EINVAL; > + > + f = last_f; > + > + SFRAME_GET_USER(cfa_off, f, offset_size); SFRAME_GET_USER() does not work for the signed SFrame CFA offset. > + offset_count--; > + > + ra_off = sec->ra_off; > + if (!ra_off) { > + if (!offset_count--) > + return -EINVAL; > + > + SFRAME_GET_USER(ra_off, f, offset_size); Likewise for the signed SFrame RA offset. Excerpt from my added trace. Note the RA_off=65488 (unsigned) = 0xFFD0 = -48 (signed): unwind_user_next: WARNING: RA could not be obtained from user space (IP=0x000003ffbb5f4218, CFA=0x000003ffc22f8f10, RA_off=65488) Excerpt from perf script: 3ffbb5f4218 internal_fnwmatch+0x558 (/usr/lib64/libc.so.6) Excerpts from objdump -wt --sframe: 00000000000f3cc0 l F .text 000000000000195c internal_fnwmatch func idx [1715]: pc = 0xf3cc0, size = 6492 bytes STARTPC CFA FP RA INFO 00000000000f3cc0 sp+160 u u (1*1B) 00000000000f3cc6 sp+160 c-72 c-48 (3*1B) 00000000000f3cd0 sp+4256 c-72 c-48 (3*2B) 00000000000f3cdc sp+8352 c-72 c-48 (3*2B) 00000000000f3ce8 sp+10792 c-72 c-48 (3*2B) 00000000000f3f7e sp+160 u u (1*1B) 00000000000f3f80 sp+10792 c-72 c-48 (3*2B) > + } > + > + fp_off = sec->fp_off; > + if (!fp_off && offset_count) { > + offset_count--; > + SFRAME_GET_USER(fp_off, f, offset_size); Likewise for the signed SFrame FP offset. > + } > + > + if (offset_count) > + return -EINVAL; > + > + frame->cfa_off = cfa_off; > + frame->ra_off = ra_off; > + frame->fp_off = fp_off; > + frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP; > + > + return 0; > +} ... I have verified that reintroducing and using SFRAME_GET_USER_SIGNED() works correctly. Regards, Jens -- Jens Remus Linux on Z Development (D3303) and z/VSE Support +49-7031-16-1128 Office jremus@de.ibm.com IBM IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294 IBM Data Privacy Statement: https://www.ibm.com/privacy/
On Mon, 28 Oct 2024 14:47:56 -0700 Josh Poimboeuf <jpoimboe@kernel.org> wrote: > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c > index 06dc4a57ba78..434c548f0837 100644 > --- a/fs/binfmt_elf.c > +++ b/fs/binfmt_elf.c > @@ -47,6 +47,7 @@ > #include <linux/dax.h> > #include <linux/uaccess.h> > #include <linux/rseq.h> > +#include <linux/sframe.h> > #include <asm/param.h> > #include <asm/page.h> > > @@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > unsigned long no_base, struct elf_phdr *interp_elf_phdata, > struct arch_elf_state *arch_state) > { > - struct elf_phdr *eppnt; > + struct elf_phdr *eppnt, *sframe_phdr = NULL; > unsigned long load_addr = 0; > int load_addr_set = 0; > unsigned long error = ~0UL; > unsigned long total_size; > + unsigned long start_code = ~0UL; > + unsigned long end_code = 0; > int i; > > /* First of all, some simple consistency checks */ > @@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > > eppnt = interp_elf_phdata; > for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) { > - if (eppnt->p_type == PT_LOAD) { > + switch (eppnt->p_type) { > + case PT_LOAD: { > int elf_type = MAP_PRIVATE; > int elf_prot = make_prot(eppnt->p_flags, arch_state, > true, true); > @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > /* > * Check to see if the section's size will overflow the > * allowed task size. Note that p_filesz must always be > - * <= p_memsize so it's only necessary to check p_memsz. > + * <= p_memsz so it's only necessary to check p_memsz. > */ > k = load_addr + eppnt->p_vaddr; > if (BAD_ADDR(k) || > @@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > error = -ENOMEM; > goto out; > } > + > + if ((eppnt->p_flags & PF_X) && k < start_code) > + start_code = k; > + > + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) > + end_code = k + eppnt->p_filesz; > + break; > + } > + case PT_GNU_SFRAME: > + sframe_phdr = eppnt; > + break; > } > } > > + if (sframe_phdr) > + sframe_add_section(load_addr + sframe_phdr->p_vaddr, > + start_code, end_code); > + > error = load_addr; > out: > return error; > @@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm) > int first_pt_load = 1; > unsigned long error; > struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; > - struct elf_phdr *elf_property_phdata = NULL; > + struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL; > unsigned long elf_brk; > int retval, i; > unsigned long elf_entry; > @@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm) > executable_stack = EXSTACK_DISABLE_X; > break; > > + case PT_GNU_SFRAME: > + sframe_phdr = elf_ppnt; You need to save the p_vaddr here and not the pointer. > + break; > + > case PT_LOPROC ... PT_HIPROC: > retval = arch_elf_pt_proc(elf_ex, elf_ppnt, > bprm->file, false, > @@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm) > task_pid_nr(current), retval); > } Before this code we have: kfree(elf_phdata); And I added: if (sframe_phdr) trace_printk("after sframe vaddr=%x\n", sframe_phdr->p_vaddr); kfree(elf_phdata); if (sframe_phdr) trace_printk("after sframe vaddr=%x\n", sframe_phdr->p_vaddr); Which produced: scan-fs-940 [007] ..... 16.091081: bprint: load_elf_binary: after sframe vaddr=2298 scan-fs-940 [007] ..... 16.091083: bprint: load_elf_binary: after sframe vaddr=0 I was wondering why it wasn't working. -- Steve > > + if (sframe_phdr) > + sframe_add_section(load_bias + sframe_phdr->p_vaddr, > + start_code, end_code); > + > regs = current_pt_regs(); > #ifdef ELF_PLAT_INIT > /* > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 381d22eba088..6e7561c1a5fc 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -1052,6 +1052,9 @@ struct mm_struct { > #endif > } lru_gen; > #endif /* CONFIG_LRU_GEN_WALKS_MMU */ > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > + struct maple_tree sframe_mt; > +#endif > } __randomize_layout; > > /*
On Tue, 5 Nov 2024 12:40:53 -0500 Steven Rostedt <rostedt@goodmis.org> wrote: > On Mon, 28 Oct 2024 14:47:56 -0700 > Josh Poimboeuf <jpoimboe@kernel.org> wrote: <linux-trace-kernel@vger.kerne.org>: Host or domain name not found. Name service error for name=vger.kerne.org type=AAAA: Host not found Hmm, no wonder this didn't show up in patchwork :-/ Please fix on your next version. Thanks, -- Steve
On Mon, Oct 28, 2024 at 2:48 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > Some distros have started compiling frame pointers into all their > packages to enable the kernel to do system-wide profiling of user space. > Unfortunately that creates a runtime performance penalty across the > entire system. Using DWARF instead isn't feasible due to the complexity > it would add to the kernel. > > For in-kernel unwinding we solved this problem with the creation of the > ORC unwinder for x86_64. Similarly, for user space the GNU assembler > has created the sframe format starting with binutils 2.41 for sframe v2. > Sframe is a simpler version of .eh_frame. It gets placed in the .sframe > section. > > Add support for unwinding user space using sframe. > > More information about sframe can be found here: > > - https://lwn.net/Articles/932209/ > - https://lwn.net/Articles/940686/ > - https://sourceware.org/binutils/docs/sframe-spec.html > > Glibc support is needed to implement the prctl() calls to tell the > kernel where the .sframe segments are. > > Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org> > --- > arch/Kconfig | 4 + > arch/x86/include/asm/mmu.h | 2 +- > fs/binfmt_elf.c | 35 +++- > include/linux/mm_types.h | 3 + > include/linux/sframe.h | 41 ++++ > include/linux/unwind_user.h | 2 + > include/uapi/linux/elf.h | 1 + > include/uapi/linux/prctl.h | 3 + > kernel/fork.c | 10 + > kernel/sys.c | 11 ++ > kernel/unwind/Makefile | 3 +- > kernel/unwind/sframe.c | 380 ++++++++++++++++++++++++++++++++++++ > kernel/unwind/sframe.h | 215 ++++++++++++++++++++ > kernel/unwind/user.c | 24 ++- > mm/init-mm.c | 6 + > 15 files changed, 732 insertions(+), 8 deletions(-) > create mode 100644 include/linux/sframe.h > create mode 100644 kernel/unwind/sframe.c > create mode 100644 kernel/unwind/sframe.h > It feels like this patch is trying to do too much. There is both new UAPI introduction, and SFrame format definition, and unwinder integration, etc, etc. Do you think it can be split further into more focused smaller patches? > @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > /* > * Check to see if the section's size will overflow the > * allowed task size. Note that p_filesz must always be > - * <= p_memsize so it's only necessary to check p_memsz. > + * <= p_memsz so it's only necessary to check p_memsz. > */ > k = load_addr + eppnt->p_vaddr; > if (BAD_ADDR(k) || > @@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, > error = -ENOMEM; > goto out; > } > + > + if ((eppnt->p_flags & PF_X) && k < start_code) > + start_code = k; > + > + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) > + end_code = k + eppnt->p_filesz; > + break; > + } > + case PT_GNU_SFRAME: > + sframe_phdr = eppnt; if I understand correctly, there has to be only one sframe, is that right? Should we validate that? > + break; > } > } > > + if (sframe_phdr) > + sframe_add_section(load_addr + sframe_phdr->p_vaddr, > + start_code, end_code); > + no error checking? > error = load_addr; > out: > return error; > @@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm) > int first_pt_load = 1; > unsigned long error; > struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; > - struct elf_phdr *elf_property_phdata = NULL; > + struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL; > unsigned long elf_brk; > int retval, i; > unsigned long elf_entry; > @@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm) > executable_stack = EXSTACK_DISABLE_X; > break; > > + case PT_GNU_SFRAME: > + sframe_phdr = elf_ppnt; > + break; > + > case PT_LOPROC ... PT_HIPROC: > retval = arch_elf_pt_proc(elf_ex, elf_ppnt, > bprm->file, false, > @@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm) > task_pid_nr(current), retval); > } > > + if (sframe_phdr) > + sframe_add_section(load_bias + sframe_phdr->p_vaddr, > + start_code, end_code); > + error checking missing? > regs = current_pt_regs(); > #ifdef ELF_PLAT_INIT > /* > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 381d22eba088..6e7561c1a5fc 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -1052,6 +1052,9 @@ struct mm_struct { > #endif > } lru_gen; > #endif /* CONFIG_LRU_GEN_WALKS_MMU */ > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > + struct maple_tree sframe_mt; > +#endif > } __randomize_layout; > > /* > diff --git a/include/linux/sframe.h b/include/linux/sframe.h > new file mode 100644 > index 000000000000..d167e01817c4 > --- /dev/null > +++ b/include/linux/sframe.h > @@ -0,0 +1,41 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#ifndef _LINUX_SFRAME_H > +#define _LINUX_SFRAME_H > + > +#include <linux/mm_types.h> > + > +struct unwind_user_frame; > + > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > + > +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), > + > +extern void sframe_free_mm(struct mm_struct *mm); > + > +/* text_start, text_end, file_name are optional */ what file_name? was that an extra argument that got removed? > +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, > + unsigned long text_end); > + > +extern int sframe_remove_section(unsigned long sframe_addr); > +extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame); > + > +static inline bool current_has_sframe(void) > +{ > + struct mm_struct *mm = current->mm; > + > + return mm && !mtree_empty(&mm->sframe_mt); > +} > + [...] > diff --git a/kernel/sys.c b/kernel/sys.c > index 4da31f28fda8..7d05a67727db 100644 > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -64,6 +64,7 @@ > #include <linux/rcupdate.h> > #include <linux/uidgid.h> > #include <linux/cred.h> > +#include <linux/sframe.h> > > #include <linux/nospec.h> > > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > break; > + case PR_ADD_SFRAME: > + if (arg5) > + return -EINVAL; > + error = sframe_add_section(arg2, arg3, arg4); wouldn't it be better to make this interface extendable from the get go? Instead of passing 3 arguments with fixed meaning, why not pass a pointer to an extendable binary struct like seems to be the trend nowadays with nicely extensible APIs. See [0] for one such example (specifically, struct procmap_query). Seems more prudent, as we'll most probably will be adding flags, options, extra information, etc) [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ > + break; > + case PR_REMOVE_SFRAME: > + if (arg3 || arg4 || arg5) > + return -EINVAL; > + error = sframe_remove_section(arg2); > + break; > default: > error = -EINVAL; > break; [...] > +static unsigned char fre_type_to_size(unsigned char fre_type) > +{ > + if (fre_type > 2) > + return 0; > + return 1 << fre_type; > +} > + > +static unsigned char offset_size_enum_to_size(unsigned char off_size) > +{ > + if (off_size > 2) > + return 0; > + return 1 << off_size; > +} > + > +static int find_fde(struct sframe_section *sec, unsigned long ip, > + struct sframe_fde *fde) > +{ > + struct sframe_fde __user *first, *last, *found = NULL; > + u32 ip_off, func_off_low = 0, func_off_high = -1; > + > + ip_off = ip - sec->sframe_addr; what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? and also, does it mean that SFrame doesn't support executables with text bigger than 4GB? > + > + first = (void __user *)sec->fdes_addr; > + last = first + sec->fdes_nr; > + while (first <= last) { > + struct sframe_fde __user *mid; > + u32 func_off; > + > + mid = first + ((last - first) / 2); > + > + if (get_user(func_off, (s32 __user *)mid)) > + return -EFAULT; > + > + if (ip_off >= func_off) { > + /* validate sort order */ > + if (func_off < func_off_low) > + return -EINVAL; > + > + func_off_low = func_off; > + > + found = mid; > + first = mid + 1; > + } else { > + /* validate sort order */ > + if (func_off > func_off_high) > + return -EINVAL; > + > + func_off_high = func_off; > + > + last = mid - 1; > + } > + } > + > + if (!found) > + return -EINVAL; > + > + if (copy_from_user(fde, found, sizeof(*fde))) > + return -EFAULT; > + > + /* check for gaps */ > + if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size) > + return -EINVAL; > + > + return 0; > +} > + [...] > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, > + unsigned long text_end) > +{ > + struct mm_struct *mm = current->mm; > + struct vm_area_struct *sframe_vma; > + > + mmap_read_lock(mm); > + > + sframe_vma = vma_lookup(mm, sframe_addr); > + if (!sframe_vma) > + goto err_unlock; > + > + if (text_start && text_end) { > + struct vm_area_struct *text_vma; > + > + text_vma = vma_lookup(mm, text_start); > + if (!(text_vma->vm_flags & VM_EXEC)) > + goto err_unlock; > + > + if (PAGE_ALIGN(text_end) != text_vma->vm_end) > + goto err_unlock; > + } else { > + struct vm_area_struct *vma, *text_vma = NULL; > + VMA_ITERATOR(vmi, mm, 0); > + > + for_each_vma(vmi, vma) { > + if (vma->vm_file != sframe_vma->vm_file || > + !(vma->vm_flags & VM_EXEC)) > + continue; > + > + if (text_vma) { > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > + current->comm, current->pid); is this just something that fundamentally can't be supported by SFrame format? Or just an implementation simplification? It's not illegal to have an executable with multiple VM_EXEC segments, no? Should this be a pr_warn_once() then? > + goto err_unlock; > + } > + > + text_vma = vma; > + } > + > + if (!text_vma) > + goto err_unlock; > + > + text_start = text_vma->vm_start; > + text_end = text_vma->vm_end; > + } > + > + mmap_read_unlock(mm); > + > + return __sframe_add_section(sframe_addr, text_start, text_end); > + [...]
On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: > It feels like this patch is trying to do too much. There is both new > UAPI introduction, and SFrame format definition, and unwinder > integration, etc, etc. Do you think it can be split further into more > focused smaller patches? True, let me see if I can split it up. > > + > > + if ((eppnt->p_flags & PF_X) && k < start_code) > > + start_code = k; > > + > > + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) > > + end_code = k + eppnt->p_filesz; > > + break; > > + } > > + case PT_GNU_SFRAME: > > + sframe_phdr = eppnt; > > if I understand correctly, there has to be only one sframe, is that > right? Should we validate that? Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable itself. I can validate that. > > + break; > > } > > } > > > > + if (sframe_phdr) > > + sframe_add_section(load_addr + sframe_phdr->p_vaddr, > > + start_code, end_code); > > + > > no error checking? Good point. I remember discussing this with some people at Cauldon/LPC, I just forgot to do it! Right now it does all the validation at unwind, which could really slow things down unnecessarily if the sframe isn't valid. > > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > > + > > +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), > > + > > +extern void sframe_free_mm(struct mm_struct *mm); > > + > > +/* text_start, text_end, file_name are optional */ > > what file_name? was that an extra argument that got removed? Indeed, that was for some old code. > > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > > break; > > + case PR_ADD_SFRAME: > > + if (arg5) > > + return -EINVAL; > > + error = sframe_add_section(arg2, arg3, arg4); > > wouldn't it be better to make this interface extendable from the get > go? Instead of passing 3 arguments with fixed meaning, why not pass a > pointer to an extendable binary struct like seems to be the trend > nowadays with nicely extensible APIs. See [0] for one such example > (specifically, struct procmap_query). Seems more prudent, as we'll > most probably will be adding flags, options, extra information, etc) > > [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ This ioctl interface was admittedly hacked together. I was hoping somebody would suggest something better :-) I'll take a look. > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > + struct sframe_fde *fde) > > +{ > > + struct sframe_fde __user *first, *last, *found = NULL; > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > + > > + ip_off = ip - sec->sframe_addr; > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? That's baked into sframe v2. > and also, does it mean that SFrame doesn't support executables with > text bigger than 4GB? Yes, but is that a realistic concern? > > + } else { > > + struct vm_area_struct *vma, *text_vma = NULL; > > + VMA_ITERATOR(vmi, mm, 0); > > + > > + for_each_vma(vmi, vma) { > > + if (vma->vm_file != sframe_vma->vm_file || > > + !(vma->vm_flags & VM_EXEC)) > > + continue; > > + > > + if (text_vma) { > > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > > + current->comm, current->pid); > > is this just something that fundamentally can't be supported by SFrame > format? Or just an implementation simplification? It's a simplification I suppose. > It's not illegal to have an executable with multiple VM_EXEC segments, > no? Should this be a pr_warn_once() then? I don't know, is it allowed? I've never seen it in practice. The pr_warn_once() is not reporting that it's illegal but rather that this corner case actually exists and maybe needs to be looked at. -- Josh
On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: > > It feels like this patch is trying to do too much. There is both new > > UAPI introduction, and SFrame format definition, and unwinder > > integration, etc, etc. Do you think it can be split further into more > > focused smaller patches? > > True, let me see if I can split it up. > > > > + > > > + if ((eppnt->p_flags & PF_X) && k < start_code) > > > + start_code = k; > > > + > > > + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) > > > + end_code = k + eppnt->p_filesz; > > > + break; > > > + } > > > + case PT_GNU_SFRAME: > > > + sframe_phdr = eppnt; > > > > if I understand correctly, there has to be only one sframe, is that > > right? Should we validate that? > > Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable > itself. I can validate that. > > > > + break; > > > } > > > } > > > > > > + if (sframe_phdr) > > > + sframe_add_section(load_addr + sframe_phdr->p_vaddr, > > > + start_code, end_code); > > > + > > > > no error checking? > > Good point. I remember discussing this with some people at Cauldon/LPC, > I just forgot to do it! > > Right now it does all the validation at unwind, which could really slow > things down unnecessarily if the sframe isn't valid. > > > > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > > > + > > > +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), > > > + > > > +extern void sframe_free_mm(struct mm_struct *mm); > > > + > > > +/* text_start, text_end, file_name are optional */ > > > > what file_name? was that an extra argument that got removed? > > Indeed, that was for some old code. > > > > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > > > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > > > break; > > > + case PR_ADD_SFRAME: > > > + if (arg5) > > > + return -EINVAL; > > > + error = sframe_add_section(arg2, arg3, arg4); > > > > wouldn't it be better to make this interface extendable from the get > > go? Instead of passing 3 arguments with fixed meaning, why not pass a > > pointer to an extendable binary struct like seems to be the trend > > nowadays with nicely extensible APIs. See [0] for one such example > > (specifically, struct procmap_query). Seems more prudent, as we'll > > most probably will be adding flags, options, extra information, etc) > > > > [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ > > This ioctl interface was admittedly hacked together. I was hoping > somebody would suggest something better :-) I'll take a look. > > > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > > + struct sframe_fde *fde) > > > +{ > > > + struct sframe_fde __user *first, *last, *found = NULL; > > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > > + > > > + ip_off = ip - sec->sframe_addr; > > > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? > > That's baked into sframe v2. I believe we do have large production binaries with more than 4GB of text, what are we going to do about them? It would be interesting to hear sframe people's opinion. Adding such a far-reaching new format in 2024 with these limitations is kind of sad. At the very least maybe we should allow some form of chaining sframe definitions to cover more than 4GB segments? Please CC relevant folks, I'm wondering what they're thinking about this. > > > and also, does it mean that SFrame doesn't support executables with > > text bigger than 4GB? > > Yes, but is that a realistic concern? See above, yes. You'd be surprised. As somewhat corroborating evidence, there were tons of problems and churn (within at least Meta) with DWARF not supporting more than 2GB sizes, so yes, this is not an abstract problem for sure. Modern production applications can be ridiculously big. > > > > + } else { > > > + struct vm_area_struct *vma, *text_vma = NULL; > > > + VMA_ITERATOR(vmi, mm, 0); > > > + > > > + for_each_vma(vmi, vma) { > > > + if (vma->vm_file != sframe_vma->vm_file || > > > + !(vma->vm_flags & VM_EXEC)) > > > + continue; > > > + > > > + if (text_vma) { > > > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > > > + current->comm, current->pid); > > > > is this just something that fundamentally can't be supported by SFrame > > format? Or just an implementation simplification? > > It's a simplification I suppose. That's a rather random limitation, IMO... How hard would it be to not make that assumption? > > > It's not illegal to have an executable with multiple VM_EXEC segments, > > no? Should this be a pr_warn_once() then? > > I don't know, is it allowed? I've never seen it in practice. The I'm pretty sure you can do that with a custom linker script, at the very least. Normally this probably won't happen, but I don't think Linux dictates how many executable VMAs an application can have. And it probably just naturally happens for JIT-ted applications (Java, Go, etc). Linux kernel itself has two executable segments, for instance (though kernel is special, of course, but still). > pr_warn_once() is not reporting that it's illegal but rather that this > corner case actually exists and maybe needs to be looked at. This warn() will be logged across millions of machines in the fleet, triggering alarms, people looking at this, making custom internal patches to disable the known-to-happen warn. Why do we need all this? This is an issue that is trivial to trigger by user process that's not doing anything illegal. Why? > > -- > Josh
On Thu, Oct 31, 2024 at 01:57:10PM -0700, Andrii Nakryiko wrote: > > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? > > > > That's baked into sframe v2. > > I believe we do have large production binaries with more than 4GB of > text, what are we going to do about them? It would be interesting to > hear sframe people's opinion. Adding such a far-reaching new format in > 2024 with these limitations is kind of sad. At the very least maybe we > should allow some form of chaining sframe definitions to cover more > than 4GB segments? Please CC relevant folks, I'm wondering what > they're thinking about this. Personally I find the idea of a single 4GB+ text segment pretty surprising as I've never seen anything even close to that. Anyway it's iterative development and not everybody's requirements are clear from day 1. Which is why we're discussing it now. I think there are already plans to do an sframe v3. > > > > + if (text_vma) { > > > > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > > > > + current->comm, current->pid); > > > > > > is this just something that fundamentally can't be supported by SFrame > > > format? Or just an implementation simplification? > > > > It's a simplification I suppose. > > That's a rather random limitation, IMO... How hard would it be to not > make that assumption? It's definitely not random, there's no need to complicate the code if this condition doesn't exist. > > > It's not illegal to have an executable with multiple VM_EXEC segments, > > > no? Should this be a pr_warn_once() then? > > > > I don't know, is it allowed? I've never seen it in practice. The > > I'm pretty sure you can do that with a custom linker script, at the > very least. Normally this probably won't happen, but I don't think > Linux dictates how many executable VMAs an application can have. > And it probably just naturally happens for JIT-ted applications (Java, > Go, etc). Actually I just double checked and even the kernel's ELF loader assumes that each executable has only a single text start+end address pair. > > pr_warn_once() is not reporting that it's illegal but rather that this > > corner case actually exists and maybe needs to be looked at. > > This warn() will be logged across millions of machines in the fleet, > triggering alarms, people looking at this, making custom internal > patches to disable the known-to-happen warn. Why do we need all this? > This is an issue that is trivial to trigger by user process that's not > doing anything illegal. Why? There's no point in adding complexity to support some hypothetical. I can remove the printk though. -- Josh
On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote: > Personally I find the idea of a single 4GB+ text segment pretty > surprising as I've never seen anything even close to that. It is pretty common I'm afraid. > Actually I just double checked and even the kernel's ELF loader assumes > that each executable has only a single text start+end address pair. Huh? What makes you think that? There can be many executable PT_LOAD segments in each and every binary. Segher
On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote: > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote: > > Actually I just double checked and even the kernel's ELF loader assumes > > that each executable has only a single text start+end address pair. > > Huh? What makes you think that? There can be many executable PT_LOAD > segments in each and every binary. Right, but for executables (not shared libraries) the kernel seems to assume they're contiguous? See the 'start_code' and 'end_code' variables in load_elf_binary() load_elf_interp(). -- Josh
On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote: > On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote: > > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote: > > > Actually I just double checked and even the kernel's ELF loader assumes > > > that each executable has only a single text start+end address pair. > > > > Huh? What makes you think that? There can be many executable PT_LOAD > > segments in each and every binary. > > Right, but for executables (not shared libraries) the kernel seems to > assume they're contiguous? See the 'start_code' and 'end_code' > variables in load_elf_binary() load_elf_interp(). Typo, see load_elf_binary (not load_elf_interp). -- Josh
On Fri, Nov 01, 2024 at 12:35:13PM -0700, Josh Poimboeuf wrote: > On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote: > > On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote: > > > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote: > > > > Actually I just double checked and even the kernel's ELF loader assumes > > > > that each executable has only a single text start+end address pair. > > > > > > Huh? What makes you think that? There can be many executable PT_LOAD > > > segments in each and every binary. > > > > Right, but for executables (not shared libraries) the kernel seems to > > assume they're contiguous? See the 'start_code' and 'end_code' > > variables in load_elf_binary() load_elf_interp(). > > Typo, see load_elf_binary (not load_elf_interp). Hm, actually AFAICT that's only for reporting things in sysfs/proc. So maybe it's assumed but not really "enforced". -- Josh
Hi! On Fri, Nov 01, 2024 at 12:48:00PM -0700, Josh Poimboeuf wrote: > On Fri, Nov 01, 2024 at 12:35:13PM -0700, Josh Poimboeuf wrote: > > On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote: > > > On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote: > > > > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote: > > > > > Actually I just double checked and even the kernel's ELF loader assumes > > > > > that each executable has only a single text start+end address pair. > > > > > > > > Huh? What makes you think that? There can be many executable PT_LOAD > > > > segments in each and every binary. > > > > > > Right, but for executables (not shared libraries) the kernel seems to > > > assume they're contiguous? See the 'start_code' and 'end_code' > > > variables in load_elf_binary() load_elf_interp(). > > > > Typo, see load_elf_binary (not load_elf_interp). > > Hm, actually AFAICT that's only for reporting things in sysfs/proc. So > maybe it's assumed but not really "enforced". Yes, this is copied to mm->start_code (etc.) This isn't used for anything very important it seems, it seems to be a leftover from when we only had really simple binfmts? For a.out it did actually make sense, for example :-) Segher
+cc bpf@ where people care about stack traces and profiling as well (please cc bpf@vger.kernel.org for next revisions as well, I'm sure a bunch of folks would appreciate it and have something useful to say) On Thu, Oct 31, 2024 at 4:03 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Thu, Oct 31, 2024 at 01:57:10PM -0700, Andrii Nakryiko wrote: > > > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? > > > > > > That's baked into sframe v2. > > > > I believe we do have large production binaries with more than 4GB of > > text, what are we going to do about them? It would be interesting to > > hear sframe people's opinion. Adding such a far-reaching new format in > > 2024 with these limitations is kind of sad. At the very least maybe we > > should allow some form of chaining sframe definitions to cover more > > than 4GB segments? Please CC relevant folks, I'm wondering what > > they're thinking about this. > > Personally I find the idea of a single 4GB+ text segment pretty > surprising as I've never seen anything even close to that. I grabbed one of Meta production servers running one of the big-ish (I don't know if it's even the largest, most probably not) service. Here are first few sections from /proc/pid/maps belonging to the main binary: 00200000-170ad000 r--p 00000000 07:01 5 172ac000-498e7000 r-xp 16eac000 07:01 5 49ae7000-49b8b000 r--p 494e7000 07:01 5 49d8b000-4a228000 rw-p 4958b000 07:01 5 4a228000-4c677000 rw-p 00000000 00:00 0 4c800000-4ca00000 r-xp 49c00000 07:01 5 4ca00000-4f600000 r-xp 49e00000 07:01 5 4f600000-5b270000 r-xp 4ca00000 07:01 5 Few observations: 1) There are 4 executable segments in just the first 8 entries. 2) Their total size is already approaching 1.5GB: >>> ((0x170ad000 - 0x200000) + (0x5b270000 - 0x4f600000) + (0x498e7000 - 0x172ac000)) / 1024 / 1024 1361.34375 I don't know about you, but from my experience things like code size tend to just grow over time, rarely it shrinks (and even that usually requires tremendous and focused efforts). > > Anyway it's iterative development and not everybody's requirements are > clear from day 1. Which is why we're discussing it now. I think there > are already plans to do an sframe v3. Of course, which is why I'm providing this feedback. But it would be nice to avoid having to support a zoo of versions if we already know there are practical limitations that we are not that far from hitting. > > > > > > + if (text_vma) { > > > > > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > > > > > + current->comm, current->pid); > > > > > > > > is this just something that fundamentally can't be supported by SFrame > > > > format? Or just an implementation simplification? > > > > > > It's a simplification I suppose. > > > > That's a rather random limitation, IMO... How hard would it be to not > > make that assumption? > > It's definitely not random, there's no need to complicate the code if > this condition doesn't exist. Sorry, I'm probably dense and missing something. But from the example process above, isn't this check violated already? Or it's two different things? Not sure, honestly. > > > > > It's not illegal to have an executable with multiple VM_EXEC segments, > > > > no? Should this be a pr_warn_once() then? > > > > > > I don't know, is it allowed? I've never seen it in practice. The > > > > I'm pretty sure you can do that with a custom linker script, at the > > very least. Normally this probably won't happen, but I don't think > > Linux dictates how many executable VMAs an application can have. > > And it probably just naturally happens for JIT-ted applications (Java, > > Go, etc). > > Actually I just double checked and even the kernel's ELF loader assumes > that each executable has only a single text start+end address pair. See above, very confused by such assumptions, but I'm hoping we are talking about two different things here. > > > > pr_warn_once() is not reporting that it's illegal but rather that this > > > corner case actually exists and maybe needs to be looked at. > > > > This warn() will be logged across millions of machines in the fleet, > > triggering alarms, people looking at this, making custom internal > > patches to disable the known-to-happen warn. Why do we need all this? > > This is an issue that is trivial to trigger by user process that's not > > doing anything illegal. Why? > > There's no point in adding complexity to support some hypothetical. I > can remove the printk though. We are talking about fundamental things like format for supporting frame pointer-less stack trace capture. It will take years to adopt SFrame everywhere, so I think it's prudent to think a bit ahead beyond just saying "no real application should need more than 4GB text", IMO. > > -- > Josh
On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote: > 00200000-170ad000 r--p 00000000 07:01 5 > 172ac000-498e7000 r-xp 16eac000 07:01 5 > 49ae7000-49b8b000 r--p 494e7000 07:01 5 > 49d8b000-4a228000 rw-p 4958b000 07:01 5 > 4a228000-4c677000 rw-p 00000000 00:00 0 > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > Sorry, I'm probably dense and missing something. But from the example > process above, isn't this check violated already? Or it's two > different things? Not sure, honestly. It's hard to tell exactly what's going on, did you strip the file names? The sframe limitation is per file, not per address space. I assume these are one file: > 172ac000-498e7000 r-xp 16eac000 07:01 5 and these are another: > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > 4f600000-5b270000 r-xp 4ca00000 07:01 5 Multiple mappings for a single file is fine, as long as they're contiguous. > > Actually I just double checked and even the kernel's ELF loader assumes > > that each executable has only a single text start+end address pair. > > See above, very confused by such assumptions, but I'm hoping we are > talking about two different things here. The "contiguous text" thing seems enforced by the kernel for executables. However it doesn't manage shared libraries, those are mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2. At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces that. > > There's no point in adding complexity to support some hypothetical. I > > can remove the printk though. > > We are talking about fundamental things like format for supporting > frame pointer-less stack trace capture. It will take years to adopt > SFrame everywhere, so I think it's prudent to think a bit ahead beyond > just saying "no real application should need more than 4GB text", IMO. I don't think anybody is saying that... -- Josh
On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote: > > 00200000-170ad000 r--p 00000000 07:01 5 > > 172ac000-498e7000 r-xp 16eac000 07:01 5 > > 49ae7000-49b8b000 r--p 494e7000 07:01 5 > > 49d8b000-4a228000 rw-p 4958b000 07:01 5 > > 4a228000-4c677000 rw-p 00000000 00:00 0 > > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > > > Sorry, I'm probably dense and missing something. But from the example > > process above, isn't this check violated already? Or it's two > > different things? Not sure, honestly. > > It's hard to tell exactly what's going on, did you strip the file names? Yes, I did, of course. But as I said, they all belong to the same main binary of the process. > > The sframe limitation is per file, not per address space. I assume > these are one file: > > > 172ac000-498e7000 r-xp 16eac000 07:01 5 > > and these are another: > > > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > Multiple mappings for a single file is fine, as long as they're > contiguous. No all of what I posted above belongs to the same file (except "4a228000-4c677000 rw-p 00000000 00:00 0" which doesn't have associated file, but I suspect it originally was part of this file, we do some tricks with re-mmap()'ing stuff due to huge pages usage). > > > > Actually I just double checked and even the kernel's ELF loader assumes > > > that each executable has only a single text start+end address pair. > > > > See above, very confused by such assumptions, but I'm hoping we are > > talking about two different things here. > > The "contiguous text" thing seems enforced by the kernel for > executables. However it doesn't manage shared libraries, those are > mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2. > > At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces > that. > > > > There's no point in adding complexity to support some hypothetical. I > > > can remove the printk though. > > > > We are talking about fundamental things like format for supporting > > frame pointer-less stack trace capture. It will take years to adopt > > SFrame everywhere, so I think it's prudent to think a bit ahead beyond > > just saying "no real application should need more than 4GB text", IMO. > > I don't think anybody is saying that... > > -- > Josh
On Fri, Nov 1, 2024 at 12:44 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > > On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote: > > > 00200000-170ad000 r--p 00000000 07:01 5 > > > 172ac000-498e7000 r-xp 16eac000 07:01 5 > > > 49ae7000-49b8b000 r--p 494e7000 07:01 5 > > > 49d8b000-4a228000 rw-p 4958b000 07:01 5 > > > 4a228000-4c677000 rw-p 00000000 00:00 0 > > > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > > > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > > > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > > I should have maybe posted this in this form: 00200000-170ad000 r--p 00000000 07:01 5 /packages/obfuscated_file 172ac000-498e7000 r-xp 16eac000 07:01 5 /packages/obfuscated_file 49ae7000-49b8b000 r--p 494e7000 07:01 5 /packages/obfuscated_file 49d8b000-4a228000 rw-p 4958b000 07:01 5 /packages/obfuscated_file 4a228000-4c677000 rw-p 00000000 00:00 0 4c800000-4ca00000 r-xp 49c00000 07:01 5 /packages/obfuscated_file 4ca00000-4f600000 r-xp 49e00000 07:01 5 /packages/obfuscated_file 4f600000-5b270000 r-xp 4ca00000 07:01 5 /packages/obfuscated_file Those paths are pointing to the same binary. > > > Sorry, I'm probably dense and missing something. But from the example > > > process above, isn't this check violated already? Or it's two > > > different things? Not sure, honestly. > > > > It's hard to tell exactly what's going on, did you strip the file names? > > Yes, I did, of course. But as I said, they all belong to the same main > binary of the process. > > > > > The sframe limitation is per file, not per address space. I assume > > these are one file: > > > > > 172ac000-498e7000 r-xp 16eac000 07:01 5 > > > > and these are another: > > > > > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > > > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > > > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > > > Multiple mappings for a single file is fine, as long as they're > > contiguous. > > No all of what I posted above belongs to the same file (except > "4a228000-4c677000 rw-p 00000000 00:00 0" which doesn't have > associated file, but I suspect it originally was part of this file, we > do some tricks with re-mmap()'ing stuff due to huge pages usage). > > > > > > > Actually I just double checked and even the kernel's ELF loader assumes > > > > that each executable has only a single text start+end address pair. > > > > > > See above, very confused by such assumptions, but I'm hoping we are > > > talking about two different things here. > > > > The "contiguous text" thing seems enforced by the kernel for > > executables. However it doesn't manage shared libraries, those are > > mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2. > > > > At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces > > that. > > > > > > There's no point in adding complexity to support some hypothetical. I > > > > can remove the printk though. > > > > > > We are talking about fundamental things like format for supporting > > > frame pointer-less stack trace capture. It will take years to adopt > > > SFrame everywhere, so I think it's prudent to think a bit ahead beyond > > > just saying "no real application should need more than 4GB text", IMO. > > > > I don't think anybody is saying that... > > > > -- > > Josh
On Fri, Nov 01, 2024 at 12:46:09PM -0700, Andrii Nakryiko wrote: > On Fri, Nov 1, 2024 at 12:44 PM Andrii Nakryiko > <andrii.nakryiko@gmail.com> wrote: > > > > On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > > > > On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote: > > > > 00200000-170ad000 r--p 00000000 07:01 5 > > > > 172ac000-498e7000 r-xp 16eac000 07:01 5 > > > > 49ae7000-49b8b000 r--p 494e7000 07:01 5 > > > > 49d8b000-4a228000 rw-p 4958b000 07:01 5 > > > > 4a228000-4c677000 rw-p 00000000 00:00 0 > > > > 4c800000-4ca00000 r-xp 49c00000 07:01 5 > > > > 4ca00000-4f600000 r-xp 49e00000 07:01 5 > > > > 4f600000-5b270000 r-xp 4ca00000 07:01 5 > > > > > > I should have maybe posted this in this form: > > 00200000-170ad000 r--p 00000000 07:01 5 /packages/obfuscated_file > 172ac000-498e7000 r-xp 16eac000 07:01 5 /packages/obfuscated_file > 49ae7000-49b8b000 r--p 494e7000 07:01 5 /packages/obfuscated_file > 49d8b000-4a228000 rw-p 4958b000 07:01 5 /packages/obfuscated_file > 4a228000-4c677000 rw-p 00000000 00:00 0 > 4c800000-4ca00000 r-xp 49c00000 07:01 5 /packages/obfuscated_file > 4ca00000-4f600000 r-xp 49e00000 07:01 5 /packages/obfuscated_file > 4f600000-5b270000 r-xp 4ca00000 07:01 5 /packages/obfuscated_file > > Those paths are pointing to the same binary. Ok, thanks for sharing that. I'll add in support for noncontiguous text. -- Josh
On 10/31/24 1:57 PM, Andrii Nakryiko wrote: > On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >> >> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: >>> It feels like this patch is trying to do too much. There is both new >>> UAPI introduction, and SFrame format definition, and unwinder >>> integration, etc, etc. Do you think it can be split further into more >>> focused smaller patches? >> >> True, let me see if I can split it up. >> >>>> + >>>> + if ((eppnt->p_flags & PF_X) && k < start_code) >>>> + start_code = k; >>>> + >>>> + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) >>>> + end_code = k + eppnt->p_filesz; >>>> + break; >>>> + } >>>> + case PT_GNU_SFRAME: >>>> + sframe_phdr = eppnt; >>> >>> if I understand correctly, there has to be only one sframe, is that >>> right? Should we validate that? >> >> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable >> itself. I can validate that. >> >>>> + break; >>>> } >>>> } >>>> >>>> + if (sframe_phdr) >>>> + sframe_add_section(load_addr + sframe_phdr->p_vaddr, >>>> + start_code, end_code); >>>> + >>> >>> no error checking? >> >> Good point. I remember discussing this with some people at Cauldon/LPC, >> I just forgot to do it! >> >> Right now it does all the validation at unwind, which could really slow >> things down unnecessarily if the sframe isn't valid. >> >>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME >>>> + >>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), >>>> + >>>> +extern void sframe_free_mm(struct mm_struct *mm); >>>> + >>>> +/* text_start, text_end, file_name are optional */ >>> >>> what file_name? was that an extra argument that got removed? >> >> Indeed, that was for some old code. >> >>>> case PR_RISCV_SET_ICACHE_FLUSH_CTX: >>>> error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); >>>> break; >>>> + case PR_ADD_SFRAME: >>>> + if (arg5) >>>> + return -EINVAL; >>>> + error = sframe_add_section(arg2, arg3, arg4); >>> >>> wouldn't it be better to make this interface extendable from the get >>> go? Instead of passing 3 arguments with fixed meaning, why not pass a >>> pointer to an extendable binary struct like seems to be the trend >>> nowadays with nicely extensible APIs. See [0] for one such example >>> (specifically, struct procmap_query). Seems more prudent, as we'll >>> most probably will be adding flags, options, extra information, etc) >>> >>> [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ >> >> This ioctl interface was admittedly hacked together. I was hoping >> somebody would suggest something better :-) I'll take a look. >> >>>> +static int find_fde(struct sframe_section *sec, unsigned long ip, >>>> + struct sframe_fde *fde) >>>> +{ >>>> + struct sframe_fde __user *first, *last, *found = NULL; >>>> + u32 ip_off, func_off_low = 0, func_off_high = -1; >>>> + >>>> + ip_off = ip - sec->sframe_addr; >>> >>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? >> >> That's baked into sframe v2. > > I believe we do have large production binaries with more than 4GB of > text, what are we going to do about them? It would be interesting to > hear sframe people's opinion. Adding such a far-reaching new format in > 2024 with these limitations is kind of sad. At the very least maybe we > should allow some form of chaining sframe definitions to cover more > than 4GB segments? Please CC relevant folks, I'm wondering what > they're thinking about this. > SFrame V2 does have that limitation. We can try to have 64-bit representation for the 'ip' in the SFrame FDE and conditionalize it somehow (say, with a flag in the header) so as to not bloat the majority of applications. >> >>> and also, does it mean that SFrame doesn't support executables with >>> text bigger than 4GB? >> >> Yes, but is that a realistic concern? > > See above, yes. You'd be surprised. As somewhat corroborating > evidence, there were tons of problems and churn (within at least Meta) > with DWARF not supporting more than 2GB sizes, so yes, this is not an > abstract problem for sure. Modern production applications can be > ridiculously big. > >> >>>> + } else { >>>> + struct vm_area_struct *vma, *text_vma = NULL; >>>> + VMA_ITERATOR(vmi, mm, 0); >>>> + >>>> + for_each_vma(vmi, vma) { >>>> + if (vma->vm_file != sframe_vma->vm_file || >>>> + !(vma->vm_flags & VM_EXEC)) >>>> + continue; >>>> + >>>> + if (text_vma) { >>>> + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", >>>> + current->comm, current->pid); >>> >>> is this just something that fundamentally can't be supported by SFrame >>> format? Or just an implementation simplification? >> >> It's a simplification I suppose. > > That's a rather random limitation, IMO... How hard would it be to not > make that assumption? > >> >>> It's not illegal to have an executable with multiple VM_EXEC segments, >>> no? Should this be a pr_warn_once() then? >> >> I don't know, is it allowed? I've never seen it in practice. The > > I'm pretty sure you can do that with a custom linker script, at the > very least. Normally this probably won't happen, but I don't think > Linux dictates how many executable VMAs an application can have. And > it probably just naturally happens for JIT-ted applications (Java, Go, > etc). > > Linux kernel itself has two executable segments, for instance (though > kernel is special, of course, but still). > >> pr_warn_once() is not reporting that it's illegal but rather that this >> corner case actually exists and maybe needs to be looked at. > > This warn() will be logged across millions of machines in the fleet, > triggering alarms, people looking at this, making custom internal > patches to disable the known-to-happen warn. Why do we need all this? > This is an issue that is trivial to trigger by user process that's not > doing anything illegal. Why? > >> >> -- >> Josh
On Thu, Oct 31, 2024 at 2:38 PM Indu Bhagat <indu.bhagat@oracle.com> wrote: > > On 10/31/24 1:57 PM, Andrii Nakryiko wrote: > > On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > >> > >> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: > >>> It feels like this patch is trying to do too much. There is both new > >>> UAPI introduction, and SFrame format definition, and unwinder > >>> integration, etc, etc. Do you think it can be split further into more > >>> focused smaller patches? > >> > >> True, let me see if I can split it up. > >> > >>>> + > >>>> + if ((eppnt->p_flags & PF_X) && k < start_code) > >>>> + start_code = k; > >>>> + > >>>> + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) > >>>> + end_code = k + eppnt->p_filesz; > >>>> + break; > >>>> + } > >>>> + case PT_GNU_SFRAME: > >>>> + sframe_phdr = eppnt; > >>> > >>> if I understand correctly, there has to be only one sframe, is that > >>> right? Should we validate that? > >> > >> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable > >> itself. I can validate that. > >> > >>>> + break; > >>>> } > >>>> } > >>>> > >>>> + if (sframe_phdr) > >>>> + sframe_add_section(load_addr + sframe_phdr->p_vaddr, > >>>> + start_code, end_code); > >>>> + > >>> > >>> no error checking? > >> > >> Good point. I remember discussing this with some people at Cauldon/LPC, > >> I just forgot to do it! > >> > >> Right now it does all the validation at unwind, which could really slow > >> things down unnecessarily if the sframe isn't valid. > >> > >>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME > >>>> + > >>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), > >>>> + > >>>> +extern void sframe_free_mm(struct mm_struct *mm); > >>>> + > >>>> +/* text_start, text_end, file_name are optional */ > >>> > >>> what file_name? was that an extra argument that got removed? > >> > >> Indeed, that was for some old code. > >> > >>>> case PR_RISCV_SET_ICACHE_FLUSH_CTX: > >>>> error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > >>>> break; > >>>> + case PR_ADD_SFRAME: > >>>> + if (arg5) > >>>> + return -EINVAL; > >>>> + error = sframe_add_section(arg2, arg3, arg4); > >>> > >>> wouldn't it be better to make this interface extendable from the get > >>> go? Instead of passing 3 arguments with fixed meaning, why not pass a > >>> pointer to an extendable binary struct like seems to be the trend > >>> nowadays with nicely extensible APIs. See [0] for one such example > >>> (specifically, struct procmap_query). Seems more prudent, as we'll > >>> most probably will be adding flags, options, extra information, etc) > >>> > >>> [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ > >> > >> This ioctl interface was admittedly hacked together. I was hoping > >> somebody would suggest something better :-) I'll take a look. > >> > >>>> +static int find_fde(struct sframe_section *sec, unsigned long ip, > >>>> + struct sframe_fde *fde) > >>>> +{ > >>>> + struct sframe_fde __user *first, *last, *found = NULL; > >>>> + u32 ip_off, func_off_low = 0, func_off_high = -1; > >>>> + > >>>> + ip_off = ip - sec->sframe_addr; > >>> > >>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? > >> > >> That's baked into sframe v2. > > > > I believe we do have large production binaries with more than 4GB of > > text, what are we going to do about them? It would be interesting to > > hear sframe people's opinion. Adding such a far-reaching new format in > > 2024 with these limitations is kind of sad. At the very least maybe we > > should allow some form of chaining sframe definitions to cover more > > than 4GB segments? Please CC relevant folks, I'm wondering what > > they're thinking about this. > > > > SFrame V2 does have that limitation. We can try to have 64-bit > representation for the 'ip' in the SFrame FDE and conditionalize it > somehow (say, with a flag in the header) so as to not bloat the majority > of applications. Hi Indu, I think that's prudent if we believe that SFrame is the solution here. See my reply to Josh. Real-world already approach 4GB limits, and things are not going to shrink in the years to come. So yeah, probably we need some adjustments to the format to at least allow 64-bit offsets (though trying to stick to 32-bit as much as possible, of course, if they work). I'm not really familiar with the nuances of the format just yet, so can't really provide anything more useful at this point. What would be the sort of gold reference for Sframe format to familiarize myself thoroughly? BTW, I wanted to ask. Are there any plans to add SFrame support to Clang as well? It feels like without that there is no future for SFrame as a general-purpose solution for stack traces. > > >> > >>> and also, does it mean that SFrame doesn't support executables with > >>> text bigger than 4GB? > >> > >> Yes, but is that a realistic concern? > > > > See above, yes. You'd be surprised. As somewhat corroborating > > evidence, there were tons of problems and churn (within at least Meta) > > with DWARF not supporting more than 2GB sizes, so yes, this is not an > > abstract problem for sure. Modern production applications can be > > ridiculously big. > > > >> > >>>> + } else { > >>>> + struct vm_area_struct *vma, *text_vma = NULL; > >>>> + VMA_ITERATOR(vmi, mm, 0); > >>>> + > >>>> + for_each_vma(vmi, vma) { > >>>> + if (vma->vm_file != sframe_vma->vm_file || > >>>> + !(vma->vm_flags & VM_EXEC)) > >>>> + continue; > >>>> + > >>>> + if (text_vma) { > >>>> + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > >>>> + current->comm, current->pid); > >>> > >>> is this just something that fundamentally can't be supported by SFrame > >>> format? Or just an implementation simplification? > >> > >> It's a simplification I suppose. > > > > That's a rather random limitation, IMO... How hard would it be to not > > make that assumption? > > > >> > >>> It's not illegal to have an executable with multiple VM_EXEC segments, > >>> no? Should this be a pr_warn_once() then? > >> > >> I don't know, is it allowed? I've never seen it in practice. The > > > > I'm pretty sure you can do that with a custom linker script, at the > > very least. Normally this probably won't happen, but I don't think > > Linux dictates how many executable VMAs an application can have. And > > it probably just naturally happens for JIT-ted applications (Java, Go, > > etc). > > > > Linux kernel itself has two executable segments, for instance (though > > kernel is special, of course, but still). > > > >> pr_warn_once() is not reporting that it's illegal but rather that this > >> corner case actually exists and maybe needs to be looked at. > > > > This warn() will be logged across millions of machines in the fleet, > > triggering alarms, people looking at this, making custom internal > > patches to disable the known-to-happen warn. Why do we need all this? > > This is an issue that is trivial to trigger by user process that's not > > doing anything illegal. Why? > > > >> > >> -- > >> Josh >
On 11/1/24 11:38 AM, Andrii Nakryiko wrote: > On Thu, Oct 31, 2024 at 2:38 PM Indu Bhagat <indu.bhagat@oracle.com> wrote: >> >> On 10/31/24 1:57 PM, Andrii Nakryiko wrote: >>> On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: >>>> >>>> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: >>>>> It feels like this patch is trying to do too much. There is both new >>>>> UAPI introduction, and SFrame format definition, and unwinder >>>>> integration, etc, etc. Do you think it can be split further into more >>>>> focused smaller patches? >>>> >>>> True, let me see if I can split it up. >>>> >>>>>> + >>>>>> + if ((eppnt->p_flags & PF_X) && k < start_code) >>>>>> + start_code = k; >>>>>> + >>>>>> + if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code) >>>>>> + end_code = k + eppnt->p_filesz; >>>>>> + break; >>>>>> + } >>>>>> + case PT_GNU_SFRAME: >>>>>> + sframe_phdr = eppnt; >>>>> >>>>> if I understand correctly, there has to be only one sframe, is that >>>>> right? Should we validate that? >>>> >>>> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable >>>> itself. I can validate that. >>>> >>>>>> + break; >>>>>> } >>>>>> } >>>>>> >>>>>> + if (sframe_phdr) >>>>>> + sframe_add_section(load_addr + sframe_phdr->p_vaddr, >>>>>> + start_code, end_code); >>>>>> + >>>>> >>>>> no error checking? >>>> >>>> Good point. I remember discussing this with some people at Cauldon/LPC, >>>> I just forgot to do it! >>>> >>>> Right now it does all the validation at unwind, which could really slow >>>> things down unnecessarily if the sframe isn't valid. >>>> >>>>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME >>>>>> + >>>>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0), >>>>>> + >>>>>> +extern void sframe_free_mm(struct mm_struct *mm); >>>>>> + >>>>>> +/* text_start, text_end, file_name are optional */ >>>>> >>>>> what file_name? was that an extra argument that got removed? >>>> >>>> Indeed, that was for some old code. >>>> >>>>>> case PR_RISCV_SET_ICACHE_FLUSH_CTX: >>>>>> error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); >>>>>> break; >>>>>> + case PR_ADD_SFRAME: >>>>>> + if (arg5) >>>>>> + return -EINVAL; >>>>>> + error = sframe_add_section(arg2, arg3, arg4); >>>>> >>>>> wouldn't it be better to make this interface extendable from the get >>>>> go? Instead of passing 3 arguments with fixed meaning, why not pass a >>>>> pointer to an extendable binary struct like seems to be the trend >>>>> nowadays with nicely extensible APIs. See [0] for one such example >>>>> (specifically, struct procmap_query). Seems more prudent, as we'll >>>>> most probably will be adding flags, options, extra information, etc) >>>>> >>>>> [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/ >>>> >>>> This ioctl interface was admittedly hacked together. I was hoping >>>> somebody would suggest something better :-) I'll take a look. >>>> >>>>>> +static int find_fde(struct sframe_section *sec, unsigned long ip, >>>>>> + struct sframe_fde *fde) >>>>>> +{ >>>>>> + struct sframe_fde __user *first, *last, *found = NULL; >>>>>> + u32 ip_off, func_off_low = 0, func_off_high = -1; >>>>>> + >>>>>> + ip_off = ip - sec->sframe_addr; >>>>> >>>>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? >>>> >>>> That's baked into sframe v2. >>> >>> I believe we do have large production binaries with more than 4GB of >>> text, what are we going to do about them? It would be interesting to >>> hear sframe people's opinion. Adding such a far-reaching new format in >>> 2024 with these limitations is kind of sad. At the very least maybe we >>> should allow some form of chaining sframe definitions to cover more >>> than 4GB segments? Please CC relevant folks, I'm wondering what >>> they're thinking about this. >>> >> >> SFrame V2 does have that limitation. We can try to have 64-bit >> representation for the 'ip' in the SFrame FDE and conditionalize it >> somehow (say, with a flag in the header) so as to not bloat the majority >> of applications. > > Hi Indu, > > I think that's prudent if we believe that SFrame is the solution here. > See my reply to Josh. Real-world already approach 4GB limits, and > things are not going to shrink in the years to come. So yeah, probably > we need some adjustments to the format to at least allow 64-bit > offsets (though trying to stick to 32-bit as much as possible, of > course, if they work). > > I'm not really familiar with the nuances of the format just yet, so > can't really provide anything more useful at this point. What would be > the sort of gold reference for Sframe format to familiarize myself > thoroughly? > There are some links on the SFrame wiki that can be helpful https://sourceware.org/binutils/wiki/sframe > BTW, I wanted to ask. Are there any plans to add SFrame support to > Clang as well? It feels like without that there is no future for > SFrame as a general-purpose solution for stack traces. > >> >>>> >>>>> and also, does it mean that SFrame doesn't support executables with >>>>> text bigger than 4GB? >>>> >>>> Yes, but is that a realistic concern? >>> >>> See above, yes. You'd be surprised. As somewhat corroborating >>> evidence, there were tons of problems and churn (within at least Meta) >>> with DWARF not supporting more than 2GB sizes, so yes, this is not an >>> abstract problem for sure. Modern production applications can be >>> ridiculously big. >>> >>>> >>>>>> + } else { >>>>>> + struct vm_area_struct *vma, *text_vma = NULL; >>>>>> + VMA_ITERATOR(vmi, mm, 0); >>>>>> + >>>>>> + for_each_vma(vmi, vma) { >>>>>> + if (vma->vm_file != sframe_vma->vm_file || >>>>>> + !(vma->vm_flags & VM_EXEC)) >>>>>> + continue; >>>>>> + >>>>>> + if (text_vma) { >>>>>> + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", >>>>>> + current->comm, current->pid); >>>>> >>>>> is this just something that fundamentally can't be supported by SFrame >>>>> format? Or just an implementation simplification? >>>> >>>> It's a simplification I suppose. >>> >>> That's a rather random limitation, IMO... How hard would it be to not >>> make that assumption? >>> >>>> >>>>> It's not illegal to have an executable with multiple VM_EXEC segments, >>>>> no? Should this be a pr_warn_once() then? >>>> >>>> I don't know, is it allowed? I've never seen it in practice. The >>> >>> I'm pretty sure you can do that with a custom linker script, at the >>> very least. Normally this probably won't happen, but I don't think >>> Linux dictates how many executable VMAs an application can have. And >>> it probably just naturally happens for JIT-ted applications (Java, Go, >>> etc). >>> >>> Linux kernel itself has two executable segments, for instance (though >>> kernel is special, of course, but still). >>> >>>> pr_warn_once() is not reporting that it's illegal but rather that this >>>> corner case actually exists and maybe needs to be looked at. >>> >>> This warn() will be logged across millions of machines in the fleet, >>> triggering alarms, people looking at this, making custom internal >>> patches to disable the known-to-happen warn. Why do we need all this? >>> This is an issue that is trivial to trigger by user process that's not >>> doing anything illegal. Why? >>> >>>> >>>> -- >>>> Josh >>
On Fri, 1 Nov 2024 11:38:47 -0700 Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > BTW, I wanted to ask. Are there any plans to add SFrame support to > Clang as well? It feels like without that there is no future for > SFrame as a general-purpose solution for stack traces. We want to use SFrames inside Google, and having Clang support it is a requirement for that. I'm working on getting people to support it in Clang. -- Steve
On Fri, Nov 1, 2024 at 11:46 AM Steven Rostedt <rostedt@goodmis.org> wrote: > > On Fri, 1 Nov 2024 11:38:47 -0700 > Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > > BTW, I wanted to ask. Are there any plans to add SFrame support to > > Clang as well? It feels like without that there is no future for > > SFrame as a general-purpose solution for stack traces. > > We want to use SFrames inside Google, and having Clang support it is a > requirement for that. I'm working on getting people to support it in Clang. > Nice, good to hear! > -- Steve
On Thu, Oct 31, 2024 at 1:57 PM Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote: > > On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > > > > On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote: > > > > +static int find_fde(struct sframe_section *sec, unsigned long ip, > > > > + struct sframe_fde *fde) > > > > +{ > > > > + struct sframe_fde __user *first, *last, *found = NULL; > > > > + u32 ip_off, func_off_low = 0, func_off_high = -1; > > > > + > > > > + ip_off = ip - sec->sframe_addr; > > > > > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right? > > > > That's baked into sframe v2. > > I believe we do have large production binaries with more than 4GB of > text, what are we going to do about them? It would be interesting to > hear sframe people's opinion. Adding such a far-reaching new format in > 2024 with these limitations is kind of sad. At the very least maybe we > should allow some form of chaining sframe definitions to cover more > than 4GB segments? Please CC relevant folks, I'm wondering what > they're thinking about this. FWIW, Google has such large binaries, too. https://llvm.org/devmtg/2023-10/slides/quicktalks/Eubanks-CompromisesWithLargeX86-64Binaries.pdf -- Thanks, ~Nick Desaulniers
On Mon, Oct 28, 2024 at 02:47:36PM -0700, Josh Poimboeuf wrote: > +static int __sframe_add_section(unsigned long sframe_addr, > + unsigned long text_start, > + unsigned long text_end) > +{ > + struct maple_tree *sframe_mt = ¤t->mm->sframe_mt; > + struct sframe_section *sec; > + struct sframe_header shdr; > + unsigned long header_end; > + int ret; > + > + if (copy_from_user(&shdr, (void __user *)sframe_addr, sizeof(shdr))) > + return -EFAULT; > + > + if (shdr.preamble.magic != SFRAME_MAGIC || > + shdr.preamble.version != SFRAME_VERSION_2 || > + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) || > + shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres || > + shdr.fdes_off > shdr.fres_off) { > + return -EINVAL; > + } > + > + sec = kmalloc(sizeof(*sec), GFP_KERNEL); > + if (!sec) > + return -ENOMEM; > + > + header_end = sframe_addr + SFRAME_HDR_SIZE(shdr); > + > + sec->sframe_addr = sframe_addr; > + sec->text_addr = text_start; > + sec->fdes_addr = header_end + shdr.fdes_off; > + sec->fres_addr = header_end + shdr.fres_off; > + sec->fdes_nr = shdr.num_fdes; > + sec->ra_off = shdr.cfa_fixed_ra_offset; > + sec->fp_off = shdr.cfa_fixed_fp_offset; > + > + ret = mtree_insert_range(sframe_mt, text_start, text_end, sec, GFP_KERNEL); > + if (ret) { > + kfree(sec); > + return ret; > + } > + > + return 0; > +} > + > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, > + unsigned long text_end) > +{ > + struct mm_struct *mm = current->mm; > + struct vm_area_struct *sframe_vma; > + > + mmap_read_lock(mm); DEFINE_GUARD(mmap_read_lock, struct mm_struct *, mmap_read_lock(_T), mmap_read_unlock(_T)) in include/linux/mmap_lock.h ? > + > + sframe_vma = vma_lookup(mm, sframe_addr); > + if (!sframe_vma) > + goto err_unlock; > + > + if (text_start && text_end) { > + struct vm_area_struct *text_vma; > + > + text_vma = vma_lookup(mm, text_start); > + if (!(text_vma->vm_flags & VM_EXEC)) > + goto err_unlock; > + > + if (PAGE_ALIGN(text_end) != text_vma->vm_end) > + goto err_unlock; > + } else { > + struct vm_area_struct *vma, *text_vma = NULL; > + VMA_ITERATOR(vmi, mm, 0); > + > + for_each_vma(vmi, vma) { > + if (vma->vm_file != sframe_vma->vm_file || > + !(vma->vm_flags & VM_EXEC)) > + continue; > + > + if (text_vma) { > + pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n", > + current->comm, current->pid); > + goto err_unlock; > + } > + > + text_vma = vma; > + } > + > + if (!text_vma) > + goto err_unlock; > + > + text_start = text_vma->vm_start; > + text_end = text_vma->vm_end; > + } > + > + mmap_read_unlock(mm); > + > + return __sframe_add_section(sframe_addr, text_start, text_end); > + > +err_unlock: > + mmap_read_unlock(mm); > + return -EINVAL; > +} > +int sframe_remove_section(unsigned long sframe_addr) > +{ > + struct mm_struct *mm = current->mm; > + struct sframe_section *sec; > + unsigned long index = 0; > + > + mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) { > + if (sec->sframe_addr == sframe_addr) > + return __sframe_remove_section(mm, sec); > + } > + > + return -EINVAL; > +} > --- a/kernel/sys.c > +++ b/kernel/sys.c > @@ -64,6 +64,7 @@ > #include <linux/rcupdate.h> > #include <linux/uidgid.h> > #include <linux/cred.h> > +#include <linux/sframe.h> > > #include <linux/nospec.h> > > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > break; > + case PR_ADD_SFRAME: > + if (arg5) > + return -EINVAL; > + error = sframe_add_section(arg2, arg3, arg4); > + break; > + case PR_REMOVE_SFRAME: > + if (arg3 || arg4 || arg5) > + return -EINVAL; > + error = sframe_remove_section(arg2); > + break; > default: > error = -EINVAL; > break; So I realize that mtree has an internal lock, but are we sure we don't want a lock around those prctl()s?
On Tue, Oct 29, 2024 at 02:27:09PM +0100, Peter Zijlstra wrote: > > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, > > + unsigned long text_end) > > +{ > > + struct mm_struct *mm = current->mm; > > + struct vm_area_struct *sframe_vma; > > + > > + mmap_read_lock(mm); > > DEFINE_GUARD(mmap_read_lock, struct mm_struct *, > mmap_read_lock(_T), mmap_read_unlock(_T)) > > in include/linux/mmap_lock.h ? Will do. > > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > > break; > > + case PR_ADD_SFRAME: > > + if (arg5) > > + return -EINVAL; > > + error = sframe_add_section(arg2, arg3, arg4); > > + break; > > + case PR_REMOVE_SFRAME: > > + if (arg3 || arg4 || arg5) > > + return -EINVAL; > > + error = sframe_remove_section(arg2); > > + break; > > default: > > error = -EINVAL; > > break; > > So I realize that mtree has an internal lock, but are we sure we don't > want a lock around those prctl()s? Not that I can tell? It relies on the mtree internal locking for atomicity. For sframe_remove_section() it uses srcu to delay the freeing until all sframe_find()'s have completed. -- Josh
On Tue, Oct 29, 2024 at 09:50:18AM -0700, Josh Poimboeuf wrote: > On Tue, Oct 29, 2024 at 02:27:09PM +0100, Peter Zijlstra wrote: > > > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, > > > + unsigned long text_end) > > > +{ > > > + struct mm_struct *mm = current->mm; > > > + struct vm_area_struct *sframe_vma; > > > + > > > + mmap_read_lock(mm); > > > > DEFINE_GUARD(mmap_read_lock, struct mm_struct *, > > mmap_read_lock(_T), mmap_read_unlock(_T)) > > > > in include/linux/mmap_lock.h ? > > Will do. > > > > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, > > > case PR_RISCV_SET_ICACHE_FLUSH_CTX: > > > error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3); > > > break; > > > + case PR_ADD_SFRAME: > > > + if (arg5) > > > + return -EINVAL; > > > + error = sframe_add_section(arg2, arg3, arg4); > > > + break; > > > + case PR_REMOVE_SFRAME: > > > + if (arg3 || arg4 || arg5) > > > + return -EINVAL; > > > + error = sframe_remove_section(arg2); > > > + break; > > > default: > > > error = -EINVAL; > > > break; > > > > So I realize that mtree has an internal lock, but are we sure we don't > > want a lock around those prctl()s? > > Not that I can tell? It relies on the mtree internal locking for > atomicity. > > For sframe_remove_section() it uses srcu to delay the freeing until all > sframe_find()'s have completed. Yeah it does all that. But I was sorta looking at all that kmalloc and copy from user stuff, but I suppose you can race that without problem.
© 2016 - 2024 Red Hat, Inc.