[PATCH v3 09/19] unwind: Introduce sframe user space unwinding

Josh Poimboeuf posted 19 patches 3 weeks, 6 days ago
[PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 6 days ago
Some distros have started compiling frame pointers into all their
packages to enable the kernel to do system-wide profiling of user space.
Unfortunately that creates a runtime performance penalty across the
entire system.  Using DWARF instead isn't feasible due to the complexity
it would add to the kernel.

For in-kernel unwinding we solved this problem with the creation of the
ORC unwinder for x86_64.  Similarly, for user space the GNU assembler
has created the sframe format starting with binutils 2.41 for sframe v2.
Sframe is a simpler version of .eh_frame.  It gets placed in the .sframe
section.

Add support for unwinding user space using sframe.

More information about sframe can be found here:

  - https://lwn.net/Articles/932209/
  - https://lwn.net/Articles/940686/
  - https://sourceware.org/binutils/docs/sframe-spec.html

Glibc support is needed to implement the prctl() calls to tell the
kernel where the .sframe segments are.

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/Kconfig                |   4 +
 arch/x86/include/asm/mmu.h  |   2 +-
 fs/binfmt_elf.c             |  35 +++-
 include/linux/mm_types.h    |   3 +
 include/linux/sframe.h      |  41 ++++
 include/linux/unwind_user.h |   2 +
 include/uapi/linux/elf.h    |   1 +
 include/uapi/linux/prctl.h  |   3 +
 kernel/fork.c               |  10 +
 kernel/sys.c                |  11 ++
 kernel/unwind/Makefile      |   3 +-
 kernel/unwind/sframe.c      | 380 ++++++++++++++++++++++++++++++++++++
 kernel/unwind/sframe.h      | 215 ++++++++++++++++++++
 kernel/unwind/user.c        |  24 ++-
 mm/init-mm.c                |   6 +
 15 files changed, 732 insertions(+), 8 deletions(-)
 create mode 100644 include/linux/sframe.h
 create mode 100644 kernel/unwind/sframe.c
 create mode 100644 kernel/unwind/sframe.h

diff --git a/arch/Kconfig b/arch/Kconfig
index ee8ec97ea0ef..e769c39dd221 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -442,6 +442,10 @@ config HAVE_UNWIND_USER_FP
 	bool
 	select UNWIND_USER
 
+config HAVE_UNWIND_USER_SFRAME
+	bool
+	select UNWIND_USER
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index ce4677b8b735..12ea831978cc 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -73,7 +73,7 @@ typedef struct {
 	.context = {							\
 		.ctx_id = 1,						\
 		.lock = __MUTEX_INITIALIZER(mm.context.lock),		\
-	}
+	},
 
 void leave_mm(void);
 #define leave_mm leave_mm
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 06dc4a57ba78..434c548f0837 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
 #include <linux/dax.h>
 #include <linux/uaccess.h>
 #include <linux/rseq.h>
+#include <linux/sframe.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
@@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		unsigned long no_base, struct elf_phdr *interp_elf_phdata,
 		struct arch_elf_state *arch_state)
 {
-	struct elf_phdr *eppnt;
+	struct elf_phdr *eppnt, *sframe_phdr = NULL;
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
 	unsigned long error = ~0UL;
 	unsigned long total_size;
+	unsigned long start_code = ~0UL;
+	unsigned long end_code = 0;
 	int i;
 
 	/* First of all, some simple consistency checks */
@@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 	eppnt = interp_elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
-		if (eppnt->p_type == PT_LOAD) {
+		switch (eppnt->p_type) {
+		case PT_LOAD: {
 			int elf_type = MAP_PRIVATE;
 			int elf_prot = make_prot(eppnt->p_flags, arch_state,
 						 true, true);
@@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			/*
 			 * Check to see if the section's size will overflow the
 			 * allowed task size. Note that p_filesz must always be
-			 * <= p_memsize so it's only necessary to check p_memsz.
+			 * <= p_memsz so it's only necessary to check p_memsz.
 			 */
 			k = load_addr + eppnt->p_vaddr;
 			if (BAD_ADDR(k) ||
@@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				error = -ENOMEM;
 				goto out;
 			}
+
+			if ((eppnt->p_flags & PF_X) && k < start_code)
+				start_code = k;
+
+			if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
+				end_code = k + eppnt->p_filesz;
+			break;
+		}
+		case PT_GNU_SFRAME:
+			sframe_phdr = eppnt;
+			break;
 		}
 	}
 
+	if (sframe_phdr)
+		sframe_add_section(load_addr + sframe_phdr->p_vaddr,
+				   start_code, end_code);
+
 	error = load_addr;
 out:
 	return error;
@@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	int first_pt_load = 1;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
-	struct elf_phdr *elf_property_phdata = NULL;
+	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
 	unsigned long elf_brk;
 	int retval, i;
 	unsigned long elf_entry;
@@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 				executable_stack = EXSTACK_DISABLE_X;
 			break;
 
+		case PT_GNU_SFRAME:
+			sframe_phdr = elf_ppnt;
+			break;
+
 		case PT_LOPROC ... PT_HIPROC:
 			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
 						  bprm->file, false,
@@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 					    task_pid_nr(current), retval);
 	}
 
+	if (sframe_phdr)
+		sframe_add_section(load_bias + sframe_phdr->p_vaddr,
+				   start_code, end_code);
+
 	regs = current_pt_regs();
 #ifdef ELF_PLAT_INIT
 	/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 381d22eba088..6e7561c1a5fc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1052,6 +1052,9 @@ struct mm_struct {
 #endif
 		} lru_gen;
 #endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+		struct maple_tree sframe_mt;
+#endif
 	} __randomize_layout;
 
 	/*
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
new file mode 100644
index 000000000000..d167e01817c4
--- /dev/null
+++ b/include/linux/sframe.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SFRAME_H
+#define _LINUX_SFRAME_H
+
+#include <linux/mm_types.h>
+
+struct unwind_user_frame;
+
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
+
+extern void sframe_free_mm(struct mm_struct *mm);
+
+/* text_start, text_end, file_name are optional */
+extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
+			      unsigned long text_end);
+
+extern int sframe_remove_section(unsigned long sframe_addr);
+extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
+
+static inline bool current_has_sframe(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	return mm && !mtree_empty(&mm->sframe_mt);
+}
+
+#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+static inline void sframe_free_mm(struct mm_struct *mm) {}
+
+static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
+static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
+static inline int sframe_find(unsigned long ip, struct unwind_user_frame *frame) { return -EINVAL; }
+
+static inline bool current_has_sframe(void) { return false; }
+
+#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#endif /* _LINUX_SFRAME_H */
diff --git a/include/linux/unwind_user.h b/include/linux/unwind_user.h
index 9d28db06f33f..cde0fde4923e 100644
--- a/include/linux/unwind_user.h
+++ b/include/linux/unwind_user.h
@@ -5,7 +5,9 @@
 #include <linux/types.h>
 
 enum unwind_user_type {
+	UNWIND_USER_TYPE_NONE,
 	UNWIND_USER_TYPE_FP,
+	UNWIND_USER_TYPE_SFRAME,
 };
 
 struct unwind_stacktrace {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b9935988da5c..4dc3f0ca5af5 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -39,6 +39,7 @@ typedef __s64	Elf64_Sxword;
 #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
 #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
 #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
+#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
 
 
 /* ARM MTE memory tag segment type */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..69511077c910 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,7 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+#define PR_ADD_SFRAME			74
+#define PR_REMOVE_SFRAME		75
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index c056ea95fe8c..60f14fbab956 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -105,6 +105,7 @@
 #include <linux/rseq.h>
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
+#include <linux/sframe.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -924,6 +925,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
 	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+	sframe_free_mm(mm);
 
 	free_mm(mm);
 }
@@ -1251,6 +1253,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 #endif
 }
 
+static void mm_init_sframe(struct mm_struct *mm)
+{
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+	mt_init(&mm->sframe_mt);
+#endif
+}
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
@@ -1282,6 +1291,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->pmd_huge_pte = NULL;
 #endif
 	mm_init_uprobes_state(mm);
+	mm_init_sframe(mm);
 	hugetlb_count_init(mm);
 
 	if (current->mm) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 4da31f28fda8..7d05a67727db 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -64,6 +64,7 @@
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
+#include <linux/sframe.h>
 
 #include <linux/nospec.h>
 
@@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
 		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
 		break;
+	case PR_ADD_SFRAME:
+		if (arg5)
+			return -EINVAL;
+		error = sframe_add_section(arg2, arg3, arg4);
+		break;
+	case PR_REMOVE_SFRAME:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = sframe_remove_section(arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..f70380d7a6a6 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1,2 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER)		+= user.o
+ obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME)	+= sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..933e47696e29
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,380 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define pr_fmt(fmt)	"sframe: " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/sframe.h>
+#include <linux/unwind_user.h>
+
+#include "sframe.h"
+
+#define SFRAME_FILENAME_LEN 32
+
+struct sframe_section {
+	struct rcu_head rcu;
+
+	unsigned long sframe_addr;
+	unsigned long text_addr;
+
+	unsigned long fdes_addr;
+	unsigned long fres_addr;
+	unsigned int  fdes_nr;
+	signed char   ra_off;
+	signed char   fp_off;
+};
+
+DEFINE_STATIC_SRCU(sframe_srcu);
+
+#define __SFRAME_GET_USER(out, user_ptr, type)				\
+({									\
+	type __tmp;							\
+	if (get_user(__tmp, (type __user *)user_ptr))			\
+		return -EFAULT;						\
+	user_ptr += sizeof(__tmp);					\
+	out = __tmp;							\
+})
+
+#define SFRAME_GET_USER(out, user_ptr, size)				\
+({									\
+	switch (size) {							\
+	case 1:								\
+		__SFRAME_GET_USER(out, user_ptr, u8);			\
+		break;							\
+	case 2:								\
+		__SFRAME_GET_USER(out, user_ptr, u16);			\
+		break;							\
+	case 4:								\
+		__SFRAME_GET_USER(out, user_ptr, u32);			\
+		break;							\
+	default:							\
+		return -EINVAL;						\
+	}								\
+})
+
+static unsigned char fre_type_to_size(unsigned char fre_type)
+{
+	if (fre_type > 2)
+		return 0;
+	return 1 << fre_type;
+}
+
+static unsigned char offset_size_enum_to_size(unsigned char off_size)
+{
+	if (off_size > 2)
+		return 0;
+	return 1 << off_size;
+}
+
+static int find_fde(struct sframe_section *sec, unsigned long ip,
+		    struct sframe_fde *fde)
+{
+	struct sframe_fde __user *first, *last, *found = NULL;
+	u32 ip_off, func_off_low = 0, func_off_high = -1;
+
+	ip_off = ip - sec->sframe_addr;
+
+	first = (void __user *)sec->fdes_addr;
+	last = first + sec->fdes_nr;
+	while (first <= last) {
+		struct sframe_fde __user *mid;
+		u32 func_off;
+
+		mid = first + ((last - first) / 2);
+
+		if (get_user(func_off, (s32 __user *)mid))
+			return -EFAULT;
+
+		if (ip_off >= func_off) {
+			/* validate sort order */
+			if (func_off < func_off_low)
+				return -EINVAL;
+
+			func_off_low = func_off;
+
+			found = mid;
+			first = mid + 1;
+		} else {
+			/* validate sort order */
+			if (func_off > func_off_high)
+				return -EINVAL;
+
+			func_off_high = func_off;
+
+			last = mid - 1;
+		}
+	}
+
+	if (!found)
+		return -EINVAL;
+
+	if (copy_from_user(fde, found, sizeof(*fde)))
+		return -EFAULT;
+
+	/* check for gaps */
+	if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size)
+		return -EINVAL;
+
+	return 0;
+}
+
+static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
+		    unsigned long ip, struct unwind_user_frame *frame)
+{
+	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
+	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
+	unsigned char offset_count, offset_size;
+	s32 cfa_off, ra_off, fp_off, ip_off;
+	void __user *f, *last_f = NULL;
+	unsigned char addr_size;
+	u32 last_fre_ip_off = 0;
+	u8 fre_info = 0;
+	int i;
+
+	addr_size = fre_type_to_size(fre_type);
+	if (!addr_size)
+		return -EINVAL;
+
+	ip_off = ip - (sec->sframe_addr + fde->start_addr);
+
+	f = (void __user *)sec->fres_addr + fde->fres_off;
+
+	for (i = 0; i < fde->fres_num; i++) {
+		u32 fre_ip_off;
+
+		SFRAME_GET_USER(fre_ip_off, f, addr_size);
+
+		if (fre_ip_off < last_fre_ip_off)
+			return -EINVAL;
+
+		last_fre_ip_off = fre_ip_off;
+
+		if (fde_type == SFRAME_FDE_TYPE_PCINC) {
+			if (ip_off < fre_ip_off)
+				break;
+		} else {
+			/* SFRAME_FDE_TYPE_PCMASK */
+			if (ip_off % fde->rep_size < fre_ip_off)
+				break;
+		}
+
+		SFRAME_GET_USER(fre_info, f, 1);
+
+		offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
+		offset_size  = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
+
+		if (!offset_count || !offset_size)
+			return -EINVAL;
+
+		last_f = f;
+		f += offset_count * offset_size;
+	}
+
+	if (!last_f)
+		return -EINVAL;
+
+	f = last_f;
+
+	SFRAME_GET_USER(cfa_off, f, offset_size);
+	offset_count--;
+
+	ra_off = sec->ra_off;
+	if (!ra_off) {
+		if (!offset_count--)
+			return -EINVAL;
+
+		SFRAME_GET_USER(ra_off, f, offset_size);
+	}
+
+	fp_off = sec->fp_off;
+	if (!fp_off && offset_count) {
+		offset_count--;
+		SFRAME_GET_USER(fp_off, f, offset_size);
+	}
+
+	if (offset_count)
+		return -EINVAL;
+
+	frame->cfa_off = cfa_off;
+	frame->ra_off = ra_off;
+	frame->fp_off = fp_off;
+	frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
+
+	return 0;
+}
+
+int sframe_find(unsigned long ip, struct unwind_user_frame *frame)
+{
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	struct sframe_fde fde;
+	int ret = -EINVAL;
+
+	if (!mm)
+		return -EINVAL;
+
+	guard(srcu)(&sframe_srcu);
+
+	sec = mtree_load(&mm->sframe_mt, ip);
+	if (!sec)
+		return ret;
+
+	ret = find_fde(sec, ip, &fde);
+	if (ret)
+		return ret;
+
+	ret = find_fre(sec, &fde, ip, frame);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int __sframe_add_section(unsigned long sframe_addr,
+				unsigned long text_start,
+				unsigned long text_end)
+{
+	struct maple_tree *sframe_mt = &current->mm->sframe_mt;
+	struct sframe_section *sec;
+	struct sframe_header shdr;
+	unsigned long header_end;
+	int ret;
+
+	if (copy_from_user(&shdr, (void __user *)sframe_addr, sizeof(shdr)))
+		return -EFAULT;
+
+	if (shdr.preamble.magic != SFRAME_MAGIC ||
+	    shdr.preamble.version != SFRAME_VERSION_2 ||
+	    !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+	    shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
+	    shdr.fdes_off > shdr.fres_off) {
+		return -EINVAL;
+	}
+
+	sec = kmalloc(sizeof(*sec), GFP_KERNEL);
+	if (!sec)
+		return -ENOMEM;
+
+	header_end = sframe_addr + SFRAME_HDR_SIZE(shdr);
+
+	sec->sframe_addr	= sframe_addr;
+	sec->text_addr		= text_start;
+	sec->fdes_addr		= header_end + shdr.fdes_off;
+	sec->fres_addr		= header_end + shdr.fres_off;
+	sec->fdes_nr		= shdr.num_fdes;
+	sec->ra_off		= shdr.cfa_fixed_ra_offset;
+	sec->fp_off		= shdr.cfa_fixed_fp_offset;
+
+	ret = mtree_insert_range(sframe_mt, text_start, text_end, sec, GFP_KERNEL);
+	if (ret) {
+		kfree(sec);
+		return ret;
+	}
+
+	return 0;
+}
+
+int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
+		       unsigned long text_end)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *sframe_vma;
+
+	mmap_read_lock(mm);
+
+	sframe_vma = vma_lookup(mm, sframe_addr);
+	if (!sframe_vma)
+		goto err_unlock;
+
+	if (text_start && text_end) {
+		struct vm_area_struct *text_vma;
+
+		text_vma = vma_lookup(mm, text_start);
+		if (!(text_vma->vm_flags & VM_EXEC))
+			goto err_unlock;
+
+		if (PAGE_ALIGN(text_end) != text_vma->vm_end)
+			goto err_unlock;
+	} else {
+		struct vm_area_struct *vma, *text_vma = NULL;
+		VMA_ITERATOR(vmi, mm, 0);
+
+		for_each_vma(vmi, vma) {
+			if (vma->vm_file != sframe_vma->vm_file ||
+			    !(vma->vm_flags & VM_EXEC))
+				continue;
+
+			if (text_vma) {
+				pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
+					     current->comm, current->pid);
+				goto err_unlock;
+			}
+
+			text_vma = vma;
+		}
+
+		if (!text_vma)
+			goto err_unlock;
+
+		text_start = text_vma->vm_start;
+		text_end   = text_vma->vm_end;
+	}
+
+	mmap_read_unlock(mm);
+
+	return __sframe_add_section(sframe_addr, text_start, text_end);
+
+err_unlock:
+	mmap_read_unlock(mm);
+	return -EINVAL;
+}
+
+static void sframe_free_srcu(struct rcu_head *rcu)
+{
+	struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
+
+	kfree(sec);
+}
+
+static int __sframe_remove_section(struct mm_struct *mm,
+				   struct sframe_section *sec)
+{
+	sec = mtree_erase(&mm->sframe_mt, sec->text_addr);
+	if (!sec)
+		return -EINVAL;
+
+	call_srcu(&sframe_srcu, &sec->rcu, sframe_free_srcu);
+
+	return 0;
+}
+
+int sframe_remove_section(unsigned long sframe_addr)
+{
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	unsigned long index = 0;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
+		if (sec->sframe_addr == sframe_addr)
+			return __sframe_remove_section(mm, sec);
+	}
+
+	return -EINVAL;
+}
+
+void sframe_free_mm(struct mm_struct *mm)
+{
+	struct sframe_section *sec;
+	unsigned long index = 0;
+
+	if (!mm)
+		return;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
+		kfree(sec);
+
+	mtree_destroy(&mm->sframe_mt);
+}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
new file mode 100644
index 000000000000..11b9be7ad82e
--- /dev/null
+++ b/kernel/unwind/sframe.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023, Oracle and/or its affiliates.
+ *
+ * This file contains definitions for the SFrame stack tracing format, which is
+ * documented at https://sourceware.org/binutils/docs
+ */
+#ifndef _SFRAME_H
+#define _SFRAME_H
+
+#include <linux/types.h>
+
+#define SFRAME_VERSION_1	1
+#define SFRAME_VERSION_2	2
+#define SFRAME_MAGIC		0xdee2
+
+/* Function Descriptor Entries are sorted on PC. */
+#define SFRAME_F_FDE_SORTED	0x1
+/* Frame-pointer based stack tracing. Defined, but not set. */
+#define SFRAME_F_FRAME_POINTER	0x2
+
+#define SFRAME_CFA_FIXED_FP_INVALID 0
+#define SFRAME_CFA_FIXED_RA_INVALID 0
+
+/* Supported ABIs/Arch. */
+#define SFRAME_ABI_AARCH64_ENDIAN_BIG	    1 /* AARCH64 big endian. */
+#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE    2 /* AARCH64 little endian. */
+#define SFRAME_ABI_AMD64_ENDIAN_LITTLE	    3 /* AMD64 little endian. */
+
+/* SFrame FRE types. */
+#define SFRAME_FRE_TYPE_ADDR1	0
+#define SFRAME_FRE_TYPE_ADDR2	1
+#define SFRAME_FRE_TYPE_ADDR4	2
+
+/*
+ * SFrame Function Descriptor Entry types.
+ *
+ * The SFrame format has two possible representations for functions. The
+ * choice of which type to use is made according to the instruction patterns
+ * in the relevant program stub.
+ */
+
+/* Unwinders perform a (PC >= FRE_START_ADDR) to look up a matching FRE. */
+#define SFRAME_FDE_TYPE_PCINC	0
+/*
+ * Unwinders perform a (PC & FRE_START_ADDR_AS_MASK >= FRE_START_ADDR_AS_MASK)
+ * to look up a matching FRE. Typical usecases are pltN entries, trampolines
+ * etc.
+ */
+#define SFRAME_FDE_TYPE_PCMASK	1
+
+/**
+ * struct sframe_preamble - SFrame Preamble.
+ * @magic: Magic number (SFRAME_MAGIC).
+ * @version: Format version number (SFRAME_VERSION).
+ * @flags: Various flags.
+ */
+struct sframe_preamble {
+	u16 magic;
+	u8  version;
+	u8  flags;
+} __packed;
+
+/**
+ * struct sframe_header - SFrame Header.
+ * @preamble: SFrame preamble.
+ * @abi_arch: Identify the arch (including endianness) and ABI.
+ * @cfa_fixed_fp_offset: Offset for the Frame Pointer (FP) from CFA may be
+ *	  fixed for some ABIs ((e.g, in AMD64 when -fno-omit-frame-pointer is
+ *	  used). When fixed, this field specifies the fixed stack frame offset
+ *	  and the individual FREs do not need to track it. When not fixed, it
+ *	  is set to SFRAME_CFA_FIXED_FP_INVALID, and the individual FREs may
+ *	  provide the applicable stack frame offset, if any.
+ * @cfa_fixed_ra_offset: Offset for the Return Address from CFA is fixed for
+ *	  some ABIs. When fixed, this field specifies the fixed stack frame
+ *	  offset and the individual FREs do not need to track it. When not
+ *	  fixed, it is set to SFRAME_CFA_FIXED_FP_INVALID.
+ * @auxhdr_len: Number of bytes making up the auxiliary header, if any.
+ *	  Some ABI/arch, in the future, may use this space for extending the
+ *	  information in SFrame header. Auxiliary header is contained in bytes
+ *	  sequentially following the sframe_header.
+ * @num_fdes: Number of SFrame FDEs in this SFrame section.
+ * @num_fres: Number of SFrame Frame Row Entries.
+ * @fre_len:  Number of bytes in the SFrame Frame Row Entry section.
+ * @fdes_off: Offset of SFrame Function Descriptor Entry section.
+ * @fres_off: Offset of SFrame Frame Row Entry section.
+ */
+struct sframe_header {
+	struct sframe_preamble preamble;
+	u8  abi_arch;
+	s8  cfa_fixed_fp_offset;
+	s8  cfa_fixed_ra_offset;
+	u8  auxhdr_len;
+	u32 num_fdes;
+	u32 num_fres;
+	u32 fre_len;
+	u32 fdes_off;
+	u32 fres_off;
+} __packed;
+
+#define SFRAME_HDR_SIZE(sframe_hdr)	\
+	((sizeof(struct sframe_header) + (sframe_hdr).auxhdr_len))
+
+/* Two possible keys for executable (instruction) pointers signing. */
+#define SFRAME_AARCH64_PAUTH_KEY_A    0 /* Key A. */
+#define SFRAME_AARCH64_PAUTH_KEY_B    1 /* Key B. */
+
+/**
+ * struct sframe_fde - SFrame Function Descriptor Entry.
+ * @start_addr: Function start address. Encoded as a signed offset,
+ *	  relative to the current FDE.
+ * @size: Size of the function in bytes.
+ * @fres_off: Offset of the first SFrame Frame Row Entry of the function,
+ *	  relative to the beginning of the SFrame Frame Row Entry sub-section.
+ * @fres_num: Number of frame row entries for the function.
+ * @info: Additional information for deciphering the stack trace
+ *	  information for the function. Contains information about SFrame FRE
+ *	  type, SFrame FDE type, PAC authorization A/B key, etc.
+ * @rep_size: Block size for SFRAME_FDE_TYPE_PCMASK
+ * @padding: Unused
+ */
+struct sframe_fde {
+	s32 start_addr;
+	u32 size;
+	u32 fres_off;
+	u32 fres_num;
+	u8  info;
+	u8  rep_size;
+	u16 padding;
+} __packed;
+
+/*
+ * 'func_info' in SFrame FDE contains additional information for deciphering
+ * the stack trace information for the function. In V1, the information is
+ * organized as follows:
+ *   - 4-bits: Identify the FRE type used for the function.
+ *   - 1-bit: Identify the FDE type of the function - mask or inc.
+ *   - 1-bit: PAC authorization A/B key (aarch64).
+ *   - 2-bits: Unused.
+ * ---------------------------------------------------------------------
+ * |  Unused  |  PAC auth A/B key (aarch64) |  FDE type |   FRE type   |
+ * |          |        Unused (amd64)       |           |              |
+ * ---------------------------------------------------------------------
+ * 8          6                             5           4              0
+ */
+
+/* Note: Set PAC auth key to SFRAME_AARCH64_PAUTH_KEY_A by default.  */
+#define SFRAME_FUNC_INFO(fde_type, fre_enc_type) \
+	(((SFRAME_AARCH64_PAUTH_KEY_A & 0x1) << 5) | \
+	 (((fde_type) & 0x1) << 4) | ((fre_enc_type) & 0xf))
+
+#define SFRAME_FUNC_FRE_TYPE(data)	  ((data) & 0xf)
+#define SFRAME_FUNC_FDE_TYPE(data)	  (((data) >> 4) & 0x1)
+#define SFRAME_FUNC_PAUTH_KEY(data)	  (((data) >> 5) & 0x1)
+
+/*
+ * Size of stack frame offsets in an SFrame Frame Row Entry. A single
+ * SFrame FRE has all offsets of the same size. Offset size may vary
+ * across frame row entries.
+ */
+#define SFRAME_FRE_OFFSET_1B	  0
+#define SFRAME_FRE_OFFSET_2B	  1
+#define SFRAME_FRE_OFFSET_4B	  2
+
+/* An SFrame Frame Row Entry can be SP or FP based.  */
+#define SFRAME_BASE_REG_FP	0
+#define SFRAME_BASE_REG_SP	1
+
+/*
+ * The index at which a specific offset is presented in the variable length
+ * bytes of an FRE.
+ */
+#define SFRAME_FRE_CFA_OFFSET_IDX   0
+/*
+ * The RA stack offset, if present, will always be at index 1 in the variable
+ * length bytes of the FRE.
+ */
+#define SFRAME_FRE_RA_OFFSET_IDX    1
+/*
+ * The FP stack offset may appear at offset 1 or 2, depending on the ABI as RA
+ * may or may not be tracked.
+ */
+#define SFRAME_FRE_FP_OFFSET_IDX    2
+
+/*
+ * 'fre_info' in SFrame FRE contains information about:
+ *   - 1 bit: base reg for CFA
+ *   - 4 bits: Number of offsets (N). A value of up to 3 is allowed to track
+ *   all three of CFA, FP and RA (fixed implicit order).
+ *   - 2 bits: information about size of the offsets (S) in bytes.
+ *     Valid values are SFRAME_FRE_OFFSET_1B, SFRAME_FRE_OFFSET_2B,
+ *     SFRAME_FRE_OFFSET_4B
+ *   - 1 bit: Mangled RA state bit (aarch64 only).
+ * ---------------------------------------------------------------
+ * | Mangled-RA (aarch64) |  Size of   |   Number of  | base_reg |
+ * |  Unused (amd64)      |  offsets   |    offsets   |          |
+ * ---------------------------------------------------------------
+ * 8                      7             5             1          0
+ */
+
+/* Note: Set mangled_ra_p to zero by default. */
+#define SFRAME_FRE_INFO(base_reg_id, offset_num, offset_size) \
+	(((0 & 0x1) << 7) | (((offset_size) & 0x3) << 5) | \
+	 (((offset_num) & 0xf) << 1) | ((base_reg_id) & 0x1))
+
+/* Set the mangled_ra_p bit as indicated. */
+#define SFRAME_FRE_INFO_UPDATE_MANGLED_RA_P(mangled_ra_p, fre_info) \
+	((((mangled_ra_p) & 0x1) << 7) | ((fre_info) & 0x7f))
+
+#define SFRAME_FRE_CFA_BASE_REG_ID(data)	  ((data) & 0x1)
+#define SFRAME_FRE_OFFSET_COUNT(data)		  (((data) >> 1) & 0xf)
+#define SFRAME_FRE_OFFSET_SIZE(data)		  (((data) >> 5) & 0x3)
+#define SFRAME_FRE_MANGLED_RA_P(data)		  (((data) >> 7) & 0x1)
+
+#endif /* _SFRAME_H */
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 54b989810a0e..8e47c80e3e54 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,12 +8,17 @@
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/unwind_user.h>
+#include <linux/sframe.h>
 #include <linux/uaccess.h>
-#include <asm/unwind_user.h>
 
+#ifdef CONFIG_HAVE_UNWIND_USER_FP
+#include <asm/unwind_user.h>
 static struct unwind_user_frame fp_frame = {
 	ARCH_INIT_USER_FP_FRAME
 };
+#else
+static struct unwind_user_frame fp_frame;
+#endif
 
 int unwind_user_next(struct unwind_user_state *state)
 {
@@ -30,6 +35,16 @@ int unwind_user_next(struct unwind_user_state *state)
 	case UNWIND_USER_TYPE_FP:
 		frame = &fp_frame;
 		break;
+	case UNWIND_USER_TYPE_SFRAME:
+		if (sframe_find(state->ip, frame)) {
+			if (!IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
+				goto the_end;
+
+			frame = &fp_frame;
+		}
+		break;
+	case UNWIND_USER_TYPE_NONE:
+		goto the_end;
 	default:
 		BUG();
 	}
@@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state)
 		return -EINVAL;
 	}
 
-	state->type = UNWIND_USER_TYPE_FP;
+	if (current_has_sframe())
+		state->type = UNWIND_USER_TYPE_SFRAME;
+	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))
+		state->type = UNWIND_USER_TYPE_FP;
+	else
+		state->type = UNWIND_USER_TYPE_NONE;
 
 	state->sp = user_stack_pointer(regs);
 	state->ip = instruction_pointer(regs);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..8eb1b122b7bf 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -11,12 +11,17 @@
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
 #include <linux/iommu.h>
+#include <linux/sframe.h>
 #include <asm/mmu.h>
 
 #ifndef INIT_MM_CONTEXT
 #define INIT_MM_CONTEXT(name)
 #endif
 
+#ifndef INIT_MM_SFRAME
+#define INIT_MM_SFRAME
+#endif
+
 const struct vm_operations_struct vma_dummy_vm_ops;
 
 /*
@@ -45,6 +50,7 @@ struct mm_struct init_mm = {
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
 	INIT_MM_CONTEXT(init_mm)
+	INIT_MM_SFRAME
 };
 
 void setup_initial_init_mm(void *start_code, void *end_code,
-- 
2.47.0
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Jens Remus 1 week, 4 days ago
On 28.10.2024 22:47, Josh Poimboeuf wrote:

> diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c

> @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state)
>   		return -EINVAL;
>   	}
>   
> -	state->type = UNWIND_USER_TYPE_FP;
> +	if (current_has_sframe())
> +		state->type = UNWIND_USER_TYPE_SFRAME;
> +	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))

The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-)

> +		state->type = UNWIND_USER_TYPE_FP;
> +	else
> +		state->type = UNWIND_USER_TYPE_NONE;
>   
>   	state->sp = user_stack_pointer(regs);
>   	state->ip = instruction_pointer(regs);

Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des 
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der 
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 1 week, 4 days ago
On Wed, 13 Nov 2024 16:56:25 +0100
Jens Remus <jremus@linux.ibm.com> wrote:

> On 28.10.2024 22:47, Josh Poimboeuf wrote:
> 
> > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c  
> 
> > @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state)
> >   		return -EINVAL;
> >   	}
> >   
> > -	state->type = UNWIND_USER_TYPE_FP;
> > +	if (current_has_sframe())
> > +		state->type = UNWIND_USER_TYPE_SFRAME;
> > +	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))  
> 
> The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-)

Yep, that too.

Thanks,

-- Steve

> 
> > +		state->type = UNWIND_USER_TYPE_FP;
> > +	else
> > +		state->type = UNWIND_USER_TYPE_NONE;
> >   
> >   	state->sp = user_stack_pointer(regs);
> >   	state->ip = instruction_pointer(regs);  
> 
> Regards,
> Jens
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 1 week, 4 days ago
On Wed, Nov 13, 2024 at 03:50:58PM -0500, Steven Rostedt wrote:
> On Wed, 13 Nov 2024 16:56:25 +0100
> Jens Remus <jremus@linux.ibm.com> wrote:
> 
> > On 28.10.2024 22:47, Josh Poimboeuf wrote:
> > 
> > > diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c  
> > 
> > > @@ -68,7 +83,12 @@ int unwind_user_start(struct unwind_user_state *state)
> > >   		return -EINVAL;
> > >   	}
> > >   
> > > -	state->type = UNWIND_USER_TYPE_FP;
> > > +	if (current_has_sframe())
> > > +		state->type = UNWIND_USER_TYPE_SFRAME;
> > > +	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))  
> > 
> > The test must be for CONFIG_HAVE_UNWIND_USER_FP. :-)
> 
> Yep, that too.

I also found this one, so that makes three of us!

It's too bad IS_ENABLED() doesn't catch typos.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Jens Remus 2 weeks, 3 days ago
On 28.10.2024 22:47, Josh Poimboeuf wrote:
...
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
...
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> +		    struct sframe_fde *fde)
> +{
> +	struct sframe_fde __user *first, *last, *found = NULL;
> +	u32 ip_off, func_off_low = 0, func_off_high = -1;
> +
> +	ip_off = ip - sec->sframe_addr;
> +
> +	first = (void __user *)sec->fdes_addr;
> +	last = first + sec->fdes_nr;

Could it be that this needs to be:

	last = first + sec->fdes_nr - 1;

> +	while (first <= last) {
> +		struct sframe_fde __user *mid;
> +		u32 func_off;
> +
> +		mid = first + ((last - first) / 2);
> +
> +		if (get_user(func_off, (s32 __user *)mid))
> +			return -EFAULT;
> +
> +		if (ip_off >= func_off) {
> +			/* validate sort order */
> +			if (func_off < func_off_low)
> +				return -EINVAL;

Otherwise I run into this when the IP is within the function whose FDE 
is the last one in the .sframe section:

find_fde: IP=0x000000000110fbcc: ERROR: func_off < func_off_low 
(func_off=196608, func_off_low=4294224904)

110fbcc dump_sframe+0x2ec (/opt/binutils-sframe2/bin/objdump)

func idx [2275]: pc = 0x110f8e0, size = 3310 bytes <dump_sframe>
STARTPC         CFA       FP        RA           INFO
000000000110f8e0  sp+160    u         u            (1*1B)
000000000110f8e6  sp+160    c-72      c-48         (3*1B)
000000000110f8f6  sp+632    c-72      c-48         (3*1B)
000000000110fa82  sp+160    u         u            (1*1B)
000000000110fa88  sp+632    c-72      c-48         (3*1B)
0000000001110486  sp+160    u         u            (1*1B)
000000000111048c  sp+632    c-72      c-48         (3*1B)
0000000001110574  sp+160    u         u            (1*1B)
000000000111057a  sp+632    c-72      c-48         (3*1B)

> +
> +			func_off_low = func_off;
> +
> +			found = mid;
> +			first = mid + 1;
> +		} else {
> +			/* validate sort order */
> +			if (func_off > func_off_high)
> +				return -EINVAL;
> +
> +			func_off_high = func_off;
> +
> +			last = mid - 1;
> +		}
> +	}
> +
> +	if (!found)
> +		return -EINVAL;
> +
> +	if (copy_from_user(fde, found, sizeof(*fde)))
> +		return -EFAULT;
> +
> +	/* check for gaps */
> +	if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size)
> +		return -EINVAL;
> +
> +	return 0;
> +}

Thanks and regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des 
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der 
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 1 week, 4 days ago
On Thu, 7 Nov 2024 17:59:08 +0100
Jens Remus <jremus@linux.ibm.com> wrote:

> On 28.10.2024 22:47, Josh Poimboeuf wrote:
> ...
> > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c  
> ...
> > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > +		    struct sframe_fde *fde)
> > +{
> > +	struct sframe_fde __user *first, *last, *found = NULL;
> > +	u32 ip_off, func_off_low = 0, func_off_high = -1;
> > +
> > +	ip_off = ip - sec->sframe_addr;
> > +
> > +	first = (void __user *)sec->fdes_addr;
> > +	last = first + sec->fdes_nr;  
> 
> Could it be that this needs to be:
> 
> 	last = first + sec->fdes_nr - 1;

Yep, I discovered the same issue.

-- Steve

> 
> > +	while (first <= last) {
> > +		struct sframe_fde __user *mid;
> > +		u32 func_off;
> > +
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 1 week, 4 days ago
On Wed, Nov 13, 2024 at 03:50:40PM -0500, Steven Rostedt wrote:
> On Thu, 7 Nov 2024 17:59:08 +0100
> Jens Remus <jremus@linux.ibm.com> wrote:
> 
> > On 28.10.2024 22:47, Josh Poimboeuf wrote:
> > ...
> > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c  
> > ...
> > > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > > +		    struct sframe_fde *fde)
> > > +{
> > > +	struct sframe_fde __user *first, *last, *found = NULL;
> > > +	u32 ip_off, func_off_low = 0, func_off_high = -1;
> > > +
> > > +	ip_off = ip - sec->sframe_addr;
> > > +
> > > +	first = (void __user *)sec->fdes_addr;
> > > +	last = first + sec->fdes_nr;  
> > 
> > Could it be that this needs to be:
> > 
> > 	last = first + sec->fdes_nr - 1;
> 
> Yep, I discovered the same issue.

Indeed, thanks.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 1 week, 4 days ago
On Wed, 13 Nov 2024 13:15:35 -0800
Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> On Wed, Nov 13, 2024 at 03:50:40PM -0500, Steven Rostedt wrote:
> > On Thu, 7 Nov 2024 17:59:08 +0100
> > Jens Remus <jremus@linux.ibm.com> wrote:
> >   
> > > On 28.10.2024 22:47, Josh Poimboeuf wrote:
> > > ...  
> > > > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c    
> > > ...  
> > > > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > > > +		    struct sframe_fde *fde)
> > > > +{
> > > > +	struct sframe_fde __user *first, *last, *found = NULL;
> > > > +	u32 ip_off, func_off_low = 0, func_off_high = -1;
> > > > +
> > > > +	ip_off = ip - sec->sframe_addr;
> > > > +
> > > > +	first = (void __user *)sec->fdes_addr;
> > > > +	last = first + sec->fdes_nr;    
> > > 
> > > Could it be that this needs to be:
> > > 
> > > 	last = first + sec->fdes_nr - 1;  
> > 
> > Yep, I discovered the same issue.  
> 
> Indeed, thanks.
> 

BTW, the following changes were needed to make it work for me:

-- Steve

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 434c548f0837..64cc3c1188ca 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	int first_pt_load = 1;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
-	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
+	struct elf_phdr *elf_property_phdata = NULL;
+	unsigned long sframe_vaddr = 0;
 	unsigned long elf_brk;
 	int retval, i;
 	unsigned long elf_entry;
@@ -951,7 +952,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			break;
 
 		case PT_GNU_SFRAME:
-			sframe_phdr = elf_ppnt;
+			sframe_vaddr = elf_ppnt->p_vaddr;
 			break;
 
 		case PT_LOPROC ... PT_HIPROC:
@@ -1344,8 +1345,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
 					    task_pid_nr(current), retval);
 	}
 
-	if (sframe_phdr)
-		sframe_add_section(load_bias + sframe_phdr->p_vaddr,
+	if (sframe_vaddr)
+		sframe_add_section(load_bias + sframe_vaddr,
 				   start_code, end_code);
 
 	regs = current_pt_regs();
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
index 933e47696e29..ca4ef0b72772 100644
--- a/kernel/unwind/sframe.c
+++ b/kernel/unwind/sframe.c
@@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip,
 		    struct sframe_fde *fde)
 {
 	struct sframe_fde __user *first, *last, *found = NULL;
-	u32 ip_off, func_off_low = 0, func_off_high = -1;
+	s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX;
 
 	ip_off = ip - sec->sframe_addr;
 
 	first = (void __user *)sec->fdes_addr;
-	last = first + sec->fdes_nr;
+	last = first + sec->fdes_nr - 1;
 	while (first <= last) {
 		struct sframe_fde __user *mid;
-		u32 func_off;
+		s32 func_off;
 
 		mid = first + ((last - first) / 2);
 
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 11aadfade005..d9cd820150c5 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -97,7 +97,7 @@ int unwind_user_start(struct unwind_user_state *state)
 
 	if (current_has_sframe())
 		state->type = UNWIND_USER_TYPE_SFRAME;
-	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))
+	else if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))
 		state->type = UNWIND_USER_TYPE_FP;
 	else
 		state->type = UNWIND_USER_TYPE_NONE;
@@ -138,7 +138,7 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
 static u64 ctx_to_cookie(u64 cpu, u64 ctx)
 {
 	BUILD_BUG_ON(NR_CPUS > 65535);
-	return (ctx & ((1UL << 48) - 1)) | cpu;
+	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);
 }
 
 /*
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Jens Remus 1 week, 3 days ago
On 13.11.2024 23:13, Steven Rostedt wrote:
> On Wed, 13 Nov 2024 13:15:35 -0800

> BTW, the following changes were needed to make it work for me:

> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> index 933e47696e29..ca4ef0b72772 100644
> --- a/kernel/unwind/sframe.c
> +++ b/kernel/unwind/sframe.c
> @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip,
>   		    struct sframe_fde *fde)
>   {
>   	struct sframe_fde __user *first, *last, *found = NULL;
> -	u32 ip_off, func_off_low = 0, func_off_high = -1;
> +	s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX;

Coincidentally I was experimenting with exactly the same changes, except 
that I used S32_MIN and S32_MAX.  Any preference?

>   
>   	ip_off = ip - sec->sframe_addr;
>   
>   	first = (void __user *)sec->fdes_addr;
> -	last = first + sec->fdes_nr;
> +	last = first + sec->fdes_nr - 1;
>   	while (first <= last) {
>   		struct sframe_fde __user *mid;
> -		u32 func_off;
> +		s32 func_off;
>   
>   		mid = first + ((last - first) / 2);
>   

Thanks and regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des 
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der 
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 1 week, 4 days ago
[ This reply fixes the linux-trace-kernel email :-p ]

On Wed, 13 Nov 2024 17:13:26 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> BTW, the following changes were needed to make it work for me:
> 
> -- Steve
> 
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 434c548f0837..64cc3c1188ca 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  	int first_pt_load = 1;
>  	unsigned long error;
>  	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> -	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
> +	struct elf_phdr *elf_property_phdata = NULL;
> +	unsigned long sframe_vaddr = 0;

Could not just save the pointer to the sframe phd, as it gets freed before we need it.

>  	unsigned long elf_brk;
>  	int retval, i;
>  	unsigned long elf_entry;
> @@ -951,7 +952,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  			break;
>  
>  		case PT_GNU_SFRAME:
> -			sframe_phdr = elf_ppnt;
> +			sframe_vaddr = elf_ppnt->p_vaddr;
>  			break;
>  
>  		case PT_LOPROC ... PT_HIPROC:
> @@ -1344,8 +1345,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  					    task_pid_nr(current), retval);
>  	}
>  
> -	if (sframe_phdr)
> -		sframe_add_section(load_bias + sframe_phdr->p_vaddr,
> +	if (sframe_vaddr)
> +		sframe_add_section(load_bias + sframe_vaddr,
>  				   start_code, end_code);
>  
>  	regs = current_pt_regs();
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> index 933e47696e29..ca4ef0b72772 100644
> --- a/kernel/unwind/sframe.c
> +++ b/kernel/unwind/sframe.c
> @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip,
>  		    struct sframe_fde *fde)
>  {
>  	struct sframe_fde __user *first, *last, *found = NULL;
> -	u32 ip_off, func_off_low = 0, func_off_high = -1;
> +	s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX;

The ip_off is a signed it. I wrote a program to dump out the sframe section
of files, and I had:

	ffffed88: (1020) size:      16 off:     146 num:       2 info: 1 rep:16
	ffffed98: (1030) size:     336 off:     154 num:       2 info:17 rep:16
	ffffefe1: (1279) size:     113 off:       0 num:       4 info: 0 rep: 0
	fffff052: (12ea) size:      54 off:      15 num:       3 info: 0 rep: 0
	fffff088: (1320) size:     167 off:      26 num:       3 info: 0 rep: 0
	fffff12f: (13c7) size:     167 off:      37 num:       4 info: 0 rep: 0
	fffff1d6: (146e) size:     167 off:      52 num:       4 info: 0 rep: 0
	fffff27d: (1515) size:      22 off:      67 num:       4 info: 0 rep: 0
	fffff293: (152b) size:     141 off:      82 num:       4 info: 0 rep: 0
	fffff320: (15b8) size:      81 off:      97 num:       4 info: 0 rep: 0
	fffff371: (1609) size:     671 off:     112 num:       4 info: 1 rep: 0
	fffff610: (18a8) size:     171 off:     131 num:       4 info: 0 rep: 0

The above turns was created by a loop of:

	fde = (void *)sframes + sizeof(*sframes) + sframes->sfh_fdeoff;
	for (s = 0; s < sframes->sfh_num_fdes; s++, fde++) {
		printf("\t%x: (%lx) size:%8u off:%8u num:%8u info:%2u rep:%2u\n",
		       fde->sfde_func_start_address,
		       fde->sfde_func_start_address + shdr->sh_offset,
		       fde->sfde_func_size,
		       fde->sfde_func_start_fre_off,
		       fde->sfde_func_num_fres,
		       fde->sfde_func_info,
		       fde->sfde_func_rep_size);
	}

As you can see, all the ip_off are negative.

>  
>  	ip_off = ip - sec->sframe_addr;
>  
>  	first = (void __user *)sec->fdes_addr;
> -	last = first + sec->fdes_nr;
> +	last = first + sec->fdes_nr - 1;

The above was mentioned before.

>  	while (first <= last) {
>  		struct sframe_fde __user *mid;
> -		u32 func_off;
> +		s32 func_off;
>  
>  		mid = first + ((last - first) / 2);
>  
> diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
> index 11aadfade005..d9cd820150c5 100644
> --- a/kernel/unwind/user.c
> +++ b/kernel/unwind/user.c
> @@ -97,7 +97,7 @@ int unwind_user_start(struct unwind_user_state *state)
>  
>  	if (current_has_sframe())
>  		state->type = UNWIND_USER_TYPE_SFRAME;
> -	else if (IS_ENABLED(CONFIG_UNWIND_USER_FP))
> +	else if (IS_ENABLED(CONFIG_HAVE_UNWIND_USER_FP))

This was mentioned too.

>  		state->type = UNWIND_USER_TYPE_FP;
>  	else
>  		state->type = UNWIND_USER_TYPE_NONE;
> @@ -138,7 +138,7 @@ int unwind_user(struct unwind_stacktrace *trace, unsigned int max_entries)
>  static u64 ctx_to_cookie(u64 cpu, u64 ctx)
>  {
>  	BUILD_BUG_ON(NR_CPUS > 65535);
> -	return (ctx & ((1UL << 48) - 1)) | cpu;
> +	return (ctx & ((1UL << 48) - 1)) | (cpu << 48);

And so was this.

-- Steve

>  }
>  
>  /*
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 1 week, 4 days ago
On Wed, 13 Nov 2024 17:21:18 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > index 434c548f0837..64cc3c1188ca 100644
> > --- a/fs/binfmt_elf.c
> > +++ b/fs/binfmt_elf.c
> > @@ -842,7 +842,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
> >  	int first_pt_load = 1;
> >  	unsigned long error;
> >  	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> > -	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
> > +	struct elf_phdr *elf_property_phdata = NULL;
> > +	unsigned long sframe_vaddr = 0;  
> 
> Could not just save the pointer to the sframe phd, as it gets freed before we need it.
                                                ^^^
                                                phdr


> > diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> > index 933e47696e29..ca4ef0b72772 100644
> > --- a/kernel/unwind/sframe.c
> > +++ b/kernel/unwind/sframe.c
> > @@ -73,15 +73,15 @@ static int find_fde(struct sframe_section *sec, unsigned long ip,
> >  		    struct sframe_fde *fde)
> >  {
> >  	struct sframe_fde __user *first, *last, *found = NULL;
> > -	u32 ip_off, func_off_low = 0, func_off_high = -1;
> > +	s32 ip_off, func_off_low = INT_MIN, func_off_high = INT_MAX;  
> 
> The ip_off is a signed it. I wrote a program to dump out the sframe section
                         ^^
                         int


> of files, and I had:
> 
> 	ffffed88: (1020) size:      16 off:     146 num:       2 info: 1 rep:16
> 	ffffed98: (1030) size:     336 off:     154 num:       2 info:17 rep:16
> 	ffffefe1: (1279) size:     113 off:       0 num:       4 info: 0 rep: 0
> 	fffff052: (12ea) size:      54 off:      15 num:       3 info: 0 rep: 0
> 	fffff088: (1320) size:     167 off:      26 num:       3 info: 0 rep: 0
> 	fffff12f: (13c7) size:     167 off:      37 num:       4 info: 0 rep: 0
> 	fffff1d6: (146e) size:     167 off:      52 num:       4 info: 0 rep: 0
> 	fffff27d: (1515) size:      22 off:      67 num:       4 info: 0 rep: 0
> 	fffff293: (152b) size:     141 off:      82 num:       4 info: 0 rep: 0
> 	fffff320: (15b8) size:      81 off:      97 num:       4 info: 0 rep: 0
> 	fffff371: (1609) size:     671 off:     112 num:       4 info: 1 rep: 0
> 	fffff610: (18a8) size:     171 off:     131 num:       4 info: 0 rep: 0
> 
> The above turns was created by a loop of:
            ^^^^^^^^^
            items were

No idea why I typed that :-p


I can't blame jetlag anymore.

-- Steve
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Weinan Liu 2 weeks, 3 days ago
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
...
> +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> +		    unsigned long ip, struct unwind_user_frame *frame)
> +{
> +	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> +	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> +	unsigned char offset_count, offset_size;
> +	s32 cfa_off, ra_off, fp_off, ip_off;
> +	void __user *f, *last_f = NULL;
> +	unsigned char addr_size;
> +	u32 last_fre_ip_off = 0;
> +	u8 fre_info = 0;
> +	int i;
> +
> +	addr_size = fre_type_to_size(fre_type);
> +	if (!addr_size)
> +		return -EINVAL;
> +
> +	ip_off = ip - (sec->sframe_addr + fde->start_addr);

nit: Since we already know wether the ip_off should mask or not, I think we don't have to check fde_type and mask the ip_off everytime.
	ip_off = (fde_type == SFRAME_FDE_TYPE_PCINC) ? ip_off : ip_off % fde->rep_size;

> +
> +	f = (void __user *)sec->fres_addr + fde->fres_off;
> +
> +	for (i = 0; i < fde->fres_num; i++) {
> +		u32 fre_ip_off;
> +
> +		SFRAME_GET_USER(fre_ip_off, f, addr_size);
> +
> +		if (fre_ip_off < last_fre_ip_off)
> +			return -EINVAL;
> +
> +		last_fre_ip_off = fre_ip_off;
> +
> +		if (fde_type == SFRAME_FDE_TYPE_PCINC) {
> +			if (ip_off < fre_ip_off)
> +				break;
> +		} else {
> +			/* SFRAME_FDE_TYPE_PCMASK */
> +			if (ip_off % fde->rep_size < fre_ip_off)
> +				break;
> +		}
> +
> +		SFRAME_GET_USER(fre_info, f, 1);
> +
> +		offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
> +		offset_size  = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
> +
> +		if (!offset_count || !offset_size)
> +			return -EINVAL;
> +
> +		last_f = f;
> +		f += offset_count * offset_size;
> +	}
> +
> +	if (!last_f)
> +		return -EINVAL;
> +
> +	f = last_f;
> +
> +	SFRAME_GET_USER(cfa_off, f, offset_size);
> +	offset_count--;
> +
> +	ra_off = sec->ra_off;
> +	if (!ra_off) {
> +		if (!offset_count--)
> +			return -EINVAL;
> +
> +		SFRAME_GET_USER(ra_off, f, offset_size);
> +	}
> +
> +	fp_off = sec->fp_off;
> +	if (!fp_off && offset_count) {
> +		offset_count--;
> +		SFRAME_GET_USER(fp_off, f, offset_size);
> +	}
> +
> +	if (offset_count)
> +		return -EINVAL;
> +
> +	frame->cfa_off = cfa_off;
> +	frame->ra_off = ra_off;
> +	frame->fp_off = fp_off;
> +	frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
> +
> +	return 0;
> +}
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Jens Remus 2 weeks, 4 days ago
On 28.10.2024 22:47, Josh Poimboeuf wrote:
...

> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
...
> +#define __SFRAME_GET_USER(out, user_ptr, type)				\
> +({									\
> +	type __tmp;							\
> +	if (get_user(__tmp, (type __user *)user_ptr))			\
> +		return -EFAULT;						\
> +	user_ptr += sizeof(__tmp);					\
> +	out = __tmp;							\
> +})
> +
> +#define SFRAME_GET_USER(out, user_ptr, size)				\
> +({									\
> +	switch (size) {							\
> +	case 1:								\
> +		__SFRAME_GET_USER(out, user_ptr, u8);			\
> +		break;							\
> +	case 2:								\
> +		__SFRAME_GET_USER(out, user_ptr, u16);			\
> +		break;							\
> +	case 4:								\
> +		__SFRAME_GET_USER(out, user_ptr, u32);			\
> +		break;							\
> +	default:							\
> +		return -EINVAL;						\
> +	}								\
> +})
> +
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> +	if (fre_type > 2)
> +		return 0;
> +	return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> +	if (off_size > 2)
> +		return 0;
> +	return 1 << off_size;
> +}
...
> +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> +		    unsigned long ip, struct unwind_user_frame *frame)
> +{
> +	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> +	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> +	unsigned char offset_count, offset_size;
> +	s32 cfa_off, ra_off, fp_off, ip_off;
> +	void __user *f, *last_f = NULL;
> +	unsigned char addr_size;
> +	u32 last_fre_ip_off = 0;
> +	u8 fre_info = 0;
> +	int i;
> +
> +	addr_size = fre_type_to_size(fre_type);
> +	if (!addr_size)
> +		return -EINVAL;
> +
> +	ip_off = ip - (sec->sframe_addr + fde->start_addr);
> +
> +	f = (void __user *)sec->fres_addr + fde->fres_off;
> +
> +	for (i = 0; i < fde->fres_num; i++) {
> +		u32 fre_ip_off;
> +
> +		SFRAME_GET_USER(fre_ip_off, f, addr_size);
> +
> +		if (fre_ip_off < last_fre_ip_off)
> +			return -EINVAL;
> +
> +		last_fre_ip_off = fre_ip_off;
> +
> +		if (fde_type == SFRAME_FDE_TYPE_PCINC) {
> +			if (ip_off < fre_ip_off)
> +				break;
> +		} else {
> +			/* SFRAME_FDE_TYPE_PCMASK */
> +			if (ip_off % fde->rep_size < fre_ip_off)
> +				break;
> +		}
> +
> +		SFRAME_GET_USER(fre_info, f, 1);
> +
> +		offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
> +		offset_size  = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
> +
> +		if (!offset_count || !offset_size)
> +			return -EINVAL;
> +
> +		last_f = f;
> +		f += offset_count * offset_size;
> +	}
> +
> +	if (!last_f)
> +		return -EINVAL;
> +
> +	f = last_f;
> +
> +	SFRAME_GET_USER(cfa_off, f, offset_size);

SFRAME_GET_USER() does not work for the signed SFrame CFA offset.

> +	offset_count--;
> +
> +	ra_off = sec->ra_off;
> +	if (!ra_off) {
> +		if (!offset_count--)
> +			return -EINVAL;
> +
> +		SFRAME_GET_USER(ra_off, f, offset_size);

Likewise for the signed SFrame RA offset.

Excerpt from my added trace. Note the RA_off=65488 (unsigned) = 0xFFD0 = 
-48 (signed):

unwind_user_next: WARNING: RA could not be obtained from user space 
(IP=0x000003ffbb5f4218, CFA=0x000003ffc22f8f10, RA_off=65488)

Excerpt from perf script:

3ffbb5f4218 internal_fnwmatch+0x558 (/usr/lib64/libc.so.6)

Excerpts from objdump -wt --sframe:

00000000000f3cc0 l     F .text  000000000000195c 
internal_fnwmatch

func idx [1715]: pc = 0xf3cc0, size = 6492 bytes
STARTPC         CFA       FP        RA           INFO
00000000000f3cc0  sp+160    u         u            (1*1B)
00000000000f3cc6  sp+160    c-72      c-48         (3*1B)
00000000000f3cd0  sp+4256   c-72      c-48         (3*2B)
00000000000f3cdc  sp+8352   c-72      c-48         (3*2B)
00000000000f3ce8  sp+10792  c-72      c-48         (3*2B)
00000000000f3f7e  sp+160    u         u            (1*1B)
00000000000f3f80  sp+10792  c-72      c-48         (3*2B)

> +	}
> +
> +	fp_off = sec->fp_off;
> +	if (!fp_off && offset_count) {
> +		offset_count--;
> +		SFRAME_GET_USER(fp_off, f, offset_size);

Likewise for the signed SFrame FP offset.

> +	}
> +
> +	if (offset_count)
> +		return -EINVAL;
> +
> +	frame->cfa_off = cfa_off;
> +	frame->ra_off = ra_off;
> +	frame->fp_off = fp_off;
> +	frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
> +
> +	return 0;
> +}
...

I have verified that reintroducing and using SFRAME_GET_USER_SIGNED() 
works correctly.

Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des 
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der 
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 2 weeks, 5 days ago
On Mon, 28 Oct 2024 14:47:56 -0700
Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 06dc4a57ba78..434c548f0837 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -47,6 +47,7 @@
>  #include <linux/dax.h>
>  #include <linux/uaccess.h>
>  #include <linux/rseq.h>
> +#include <linux/sframe.h>
>  #include <asm/param.h>
>  #include <asm/page.h>
>  
> @@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>  		unsigned long no_base, struct elf_phdr *interp_elf_phdata,
>  		struct arch_elf_state *arch_state)
>  {
> -	struct elf_phdr *eppnt;
> +	struct elf_phdr *eppnt, *sframe_phdr = NULL;
>  	unsigned long load_addr = 0;
>  	int load_addr_set = 0;
>  	unsigned long error = ~0UL;
>  	unsigned long total_size;
> +	unsigned long start_code = ~0UL;
> +	unsigned long end_code = 0;
>  	int i;
>  
>  	/* First of all, some simple consistency checks */
> @@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>  
>  	eppnt = interp_elf_phdata;
>  	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
> -		if (eppnt->p_type == PT_LOAD) {
> +		switch (eppnt->p_type) {
> +		case PT_LOAD: {
>  			int elf_type = MAP_PRIVATE;
>  			int elf_prot = make_prot(eppnt->p_flags, arch_state,
>  						 true, true);
> @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>  			/*
>  			 * Check to see if the section's size will overflow the
>  			 * allowed task size. Note that p_filesz must always be
> -			 * <= p_memsize so it's only necessary to check p_memsz.
> +			 * <= p_memsz so it's only necessary to check p_memsz.
>  			 */
>  			k = load_addr + eppnt->p_vaddr;
>  			if (BAD_ADDR(k) ||
> @@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>  				error = -ENOMEM;
>  				goto out;
>  			}
> +
> +			if ((eppnt->p_flags & PF_X) && k < start_code)
> +				start_code = k;
> +
> +			if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> +				end_code = k + eppnt->p_filesz;
> +			break;
> +		}
> +		case PT_GNU_SFRAME:
> +			sframe_phdr = eppnt;
> +			break;
>  		}
>  	}
>  
> +	if (sframe_phdr)
> +		sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> +				   start_code, end_code);
> +
>  	error = load_addr;
>  out:
>  	return error;
> @@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  	int first_pt_load = 1;
>  	unsigned long error;
>  	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> -	struct elf_phdr *elf_property_phdata = NULL;
> +	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
>  	unsigned long elf_brk;
>  	int retval, i;
>  	unsigned long elf_entry;
> @@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  				executable_stack = EXSTACK_DISABLE_X;
>  			break;
>  
> +		case PT_GNU_SFRAME:
> +			sframe_phdr = elf_ppnt;

You need to save the p_vaddr here and not the pointer.

> +			break;
> +
>  		case PT_LOPROC ... PT_HIPROC:
>  			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
>  						  bprm->file, false,
> @@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  					    task_pid_nr(current), retval);
>  	}

Before this code we have:

	kfree(elf_phdata);

And I added:

	if (sframe_phdr)
		trace_printk("after sframe vaddr=%x\n", sframe_phdr->p_vaddr);
	kfree(elf_phdata);
	if (sframe_phdr)
		trace_printk("after sframe vaddr=%x\n", sframe_phdr->p_vaddr);

Which produced:

         scan-fs-940   [007] .....    16.091081: bprint:               load_elf_binary: after sframe vaddr=2298
         scan-fs-940   [007] .....    16.091083: bprint:               load_elf_binary: after sframe vaddr=0

I was wondering why it wasn't working.

-- Steve

>  
> +	if (sframe_phdr)
> +		sframe_add_section(load_bias + sframe_phdr->p_vaddr,
> +				   start_code, end_code);
> +
>  	regs = current_pt_regs();
>  #ifdef ELF_PLAT_INIT
>  	/*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 381d22eba088..6e7561c1a5fc 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1052,6 +1052,9 @@ struct mm_struct {
>  #endif
>  		} lru_gen;
>  #endif /* CONFIG_LRU_GEN_WALKS_MMU */
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +		struct maple_tree sframe_mt;
> +#endif
>  	} __randomize_layout;
>  
>  	/*
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 2 weeks, 5 days ago
On Tue, 5 Nov 2024 12:40:53 -0500
Steven Rostedt <rostedt@goodmis.org> wrote:

> On Mon, 28 Oct 2024 14:47:56 -0700
> Josh Poimboeuf <jpoimboe@kernel.org> wrote:

<linux-trace-kernel@vger.kerne.org>: Host or domain name not found. Name
    service error for name=vger.kerne.org type=AAAA: Host not found

Hmm, no wonder this didn't show up in patchwork :-/

Please fix on your next version.

Thanks,

-- Steve
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 2:48 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> Some distros have started compiling frame pointers into all their
> packages to enable the kernel to do system-wide profiling of user space.
> Unfortunately that creates a runtime performance penalty across the
> entire system.  Using DWARF instead isn't feasible due to the complexity
> it would add to the kernel.
>
> For in-kernel unwinding we solved this problem with the creation of the
> ORC unwinder for x86_64.  Similarly, for user space the GNU assembler
> has created the sframe format starting with binutils 2.41 for sframe v2.
> Sframe is a simpler version of .eh_frame.  It gets placed in the .sframe
> section.
>
> Add support for unwinding user space using sframe.
>
> More information about sframe can be found here:
>
>   - https://lwn.net/Articles/932209/
>   - https://lwn.net/Articles/940686/
>   - https://sourceware.org/binutils/docs/sframe-spec.html
>
> Glibc support is needed to implement the prctl() calls to tell the
> kernel where the .sframe segments are.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>  arch/Kconfig                |   4 +
>  arch/x86/include/asm/mmu.h  |   2 +-
>  fs/binfmt_elf.c             |  35 +++-
>  include/linux/mm_types.h    |   3 +
>  include/linux/sframe.h      |  41 ++++
>  include/linux/unwind_user.h |   2 +
>  include/uapi/linux/elf.h    |   1 +
>  include/uapi/linux/prctl.h  |   3 +
>  kernel/fork.c               |  10 +
>  kernel/sys.c                |  11 ++
>  kernel/unwind/Makefile      |   3 +-
>  kernel/unwind/sframe.c      | 380 ++++++++++++++++++++++++++++++++++++
>  kernel/unwind/sframe.h      | 215 ++++++++++++++++++++
>  kernel/unwind/user.c        |  24 ++-
>  mm/init-mm.c                |   6 +
>  15 files changed, 732 insertions(+), 8 deletions(-)
>  create mode 100644 include/linux/sframe.h
>  create mode 100644 kernel/unwind/sframe.c
>  create mode 100644 kernel/unwind/sframe.h
>

It feels like this patch is trying to do too much. There is both new
UAPI introduction, and SFrame format definition, and unwinder
integration, etc, etc. Do you think it can be split further into more
focused smaller patches?


> @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>                         /*
>                          * Check to see if the section's size will overflow the
>                          * allowed task size. Note that p_filesz must always be
> -                        * <= p_memsize so it's only necessary to check p_memsz.
> +                        * <= p_memsz so it's only necessary to check p_memsz.
>                          */
>                         k = load_addr + eppnt->p_vaddr;
>                         if (BAD_ADDR(k) ||
> @@ -698,9 +702,24 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>                                 error = -ENOMEM;
>                                 goto out;
>                         }
> +
> +                       if ((eppnt->p_flags & PF_X) && k < start_code)
> +                               start_code = k;
> +
> +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> +                               end_code = k + eppnt->p_filesz;
> +                       break;
> +               }
> +               case PT_GNU_SFRAME:
> +                       sframe_phdr = eppnt;

if I understand correctly, there has to be only one sframe, is that
right? Should we validate that?

> +                       break;
>                 }
>         }
>
> +       if (sframe_phdr)
> +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> +                                  start_code, end_code);
> +

no error checking?

>         error = load_addr;
>  out:
>         return error;
> @@ -823,7 +842,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
>         int first_pt_load = 1;
>         unsigned long error;
>         struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> -       struct elf_phdr *elf_property_phdata = NULL;
> +       struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
>         unsigned long elf_brk;
>         int retval, i;
>         unsigned long elf_entry;
> @@ -931,6 +950,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
>                                 executable_stack = EXSTACK_DISABLE_X;
>                         break;
>
> +               case PT_GNU_SFRAME:
> +                       sframe_phdr = elf_ppnt;
> +                       break;
> +
>                 case PT_LOPROC ... PT_HIPROC:
>                         retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
>                                                   bprm->file, false,
> @@ -1321,6 +1344,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
>                                             task_pid_nr(current), retval);
>         }
>
> +       if (sframe_phdr)
> +               sframe_add_section(load_bias + sframe_phdr->p_vaddr,
> +                                  start_code, end_code);
> +

error checking missing?

>         regs = current_pt_regs();
>  #ifdef ELF_PLAT_INIT
>         /*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 381d22eba088..6e7561c1a5fc 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1052,6 +1052,9 @@ struct mm_struct {
>  #endif
>                 } lru_gen;
>  #endif /* CONFIG_LRU_GEN_WALKS_MMU */
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +               struct maple_tree sframe_mt;
> +#endif
>         } __randomize_layout;
>
>         /*
> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> new file mode 100644
> index 000000000000..d167e01817c4
> --- /dev/null
> +++ b/include/linux/sframe.h
> @@ -0,0 +1,41 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_SFRAME_H
> +#define _LINUX_SFRAME_H
> +
> +#include <linux/mm_types.h>
> +
> +struct unwind_user_frame;
> +
> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> +
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> +
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
> +/* text_start, text_end, file_name are optional */

what file_name? was that an extra argument that got removed?

> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> +                             unsigned long text_end);
> +
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +extern int sframe_find(unsigned long ip, struct unwind_user_frame *frame);
> +
> +static inline bool current_has_sframe(void)
> +{
> +       struct mm_struct *mm = current->mm;
> +
> +       return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +

[...]

> diff --git a/kernel/sys.c b/kernel/sys.c
> index 4da31f28fda8..7d05a67727db 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/uidgid.h>
>  #include <linux/cred.h>
> +#include <linux/sframe.h>
>
>  #include <linux/nospec.h>
>
> @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>         case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>                 error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>                 break;
> +       case PR_ADD_SFRAME:
> +               if (arg5)
> +                       return -EINVAL;
> +               error = sframe_add_section(arg2, arg3, arg4);

wouldn't it be better to make this interface extendable from the get
go? Instead of passing 3 arguments with fixed meaning, why not pass a
pointer to an extendable binary struct like seems to be the trend
nowadays with nicely extensible APIs. See [0] for one such example
(specifically, struct procmap_query). Seems more prudent, as we'll
most probably will be adding flags, options, extra information, etc)

  [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/

> +               break;
> +       case PR_REMOVE_SFRAME:
> +               if (arg3 || arg4 || arg5)
> +                       return -EINVAL;
> +               error = sframe_remove_section(arg2);
> +               break;
>         default:
>                 error = -EINVAL;
>                 break;

[...]

> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> +       if (fre_type > 2)
> +               return 0;
> +       return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> +       if (off_size > 2)
> +               return 0;
> +       return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> +                   struct sframe_fde *fde)
> +{
> +       struct sframe_fde __user *first, *last, *found = NULL;
> +       u32 ip_off, func_off_low = 0, func_off_high = -1;
> +
> +       ip_off = ip - sec->sframe_addr;

what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?

and also, does it mean that SFrame doesn't support executables with
text bigger than 4GB?

> +
> +       first = (void __user *)sec->fdes_addr;
> +       last = first + sec->fdes_nr;
> +       while (first <= last) {
> +               struct sframe_fde __user *mid;
> +               u32 func_off;
> +
> +               mid = first + ((last - first) / 2);
> +
> +               if (get_user(func_off, (s32 __user *)mid))
> +                       return -EFAULT;
> +
> +               if (ip_off >= func_off) {
> +                       /* validate sort order */
> +                       if (func_off < func_off_low)
> +                               return -EINVAL;
> +
> +                       func_off_low = func_off;
> +
> +                       found = mid;
> +                       first = mid + 1;
> +               } else {
> +                       /* validate sort order */
> +                       if (func_off > func_off_high)
> +                               return -EINVAL;
> +
> +                       func_off_high = func_off;
> +
> +                       last = mid - 1;
> +               }
> +       }
> +
> +       if (!found)
> +               return -EINVAL;
> +
> +       if (copy_from_user(fde, found, sizeof(*fde)))
> +               return -EFAULT;
> +
> +       /* check for gaps */
> +       if (ip_off < fde->start_addr || ip_off >= fde->start_addr + fde->size)
> +               return -EINVAL;
> +
> +       return 0;
> +}
> +

[...]

> +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> +                      unsigned long text_end)
> +{
> +       struct mm_struct *mm = current->mm;
> +       struct vm_area_struct *sframe_vma;
> +
> +       mmap_read_lock(mm);
> +
> +       sframe_vma = vma_lookup(mm, sframe_addr);
> +       if (!sframe_vma)
> +               goto err_unlock;
> +
> +       if (text_start && text_end) {
> +               struct vm_area_struct *text_vma;
> +
> +               text_vma = vma_lookup(mm, text_start);
> +               if (!(text_vma->vm_flags & VM_EXEC))
> +                       goto err_unlock;
> +
> +               if (PAGE_ALIGN(text_end) != text_vma->vm_end)
> +                       goto err_unlock;
> +       } else {
> +               struct vm_area_struct *vma, *text_vma = NULL;
> +               VMA_ITERATOR(vmi, mm, 0);
> +
> +               for_each_vma(vmi, vma) {
> +                       if (vma->vm_file != sframe_vma->vm_file ||
> +                           !(vma->vm_flags & VM_EXEC))
> +                               continue;
> +
> +                       if (text_vma) {
> +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> +                                            current->comm, current->pid);

is this just something that fundamentally can't be supported by SFrame
format? Or just an implementation simplification?

It's not illegal to have an executable with multiple VM_EXEC segments,
no? Should this be a pr_warn_once() then?

> +                               goto err_unlock;
> +                       }
> +
> +                       text_vma = vma;
> +               }
> +
> +               if (!text_vma)
> +                       goto err_unlock;
> +
> +               text_start = text_vma->vm_start;
> +               text_end   = text_vma->vm_end;
> +       }
> +
> +       mmap_read_unlock(mm);
> +
> +       return __sframe_add_section(sframe_addr, text_start, text_end);
> +

[...]
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
> It feels like this patch is trying to do too much. There is both new
> UAPI introduction, and SFrame format definition, and unwinder
> integration, etc, etc. Do you think it can be split further into more
> focused smaller patches?

True, let me see if I can split it up.

> > +
> > +                       if ((eppnt->p_flags & PF_X) && k < start_code)
> > +                               start_code = k;
> > +
> > +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> > +                               end_code = k + eppnt->p_filesz;
> > +                       break;
> > +               }
> > +               case PT_GNU_SFRAME:
> > +                       sframe_phdr = eppnt;
> 
> if I understand correctly, there has to be only one sframe, is that
> right? Should we validate that?

Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable
itself.  I can validate that.

> > +                       break;
> >                 }
> >         }
> >
> > +       if (sframe_phdr)
> > +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> > +                                  start_code, end_code);
> > +
> 
> no error checking?

Good point.  I remember discussing this with some people at Cauldon/LPC,
I just forgot to do it!

Right now it does all the validation at unwind, which could really slow
things down unnecessarily if the sframe isn't valid.

> > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> > +
> > +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> > +
> > +extern void sframe_free_mm(struct mm_struct *mm);
> > +
> > +/* text_start, text_end, file_name are optional */
> 
> what file_name? was that an extra argument that got removed?

Indeed, that was for some old code.

> >         case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> >                 error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> >                 break;
> > +       case PR_ADD_SFRAME:
> > +               if (arg5)
> > +                       return -EINVAL;
> > +               error = sframe_add_section(arg2, arg3, arg4);
> 
> wouldn't it be better to make this interface extendable from the get
> go? Instead of passing 3 arguments with fixed meaning, why not pass a
> pointer to an extendable binary struct like seems to be the trend
> nowadays with nicely extensible APIs. See [0] for one such example
> (specifically, struct procmap_query). Seems more prudent, as we'll
> most probably will be adding flags, options, extra information, etc)
> 
>   [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/

This ioctl interface was admittedly hacked together.  I was hoping
somebody would suggest something better :-)  I'll take a look.

> > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > +                   struct sframe_fde *fde)
> > +{
> > +       struct sframe_fde __user *first, *last, *found = NULL;
> > +       u32 ip_off, func_off_low = 0, func_off_high = -1;
> > +
> > +       ip_off = ip - sec->sframe_addr;
> 
> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?

That's baked into sframe v2.

> and also, does it mean that SFrame doesn't support executables with
> text bigger than 4GB?

Yes, but is that a realistic concern?

> > +       } else {
> > +               struct vm_area_struct *vma, *text_vma = NULL;
> > +               VMA_ITERATOR(vmi, mm, 0);
> > +
> > +               for_each_vma(vmi, vma) {
> > +                       if (vma->vm_file != sframe_vma->vm_file ||
> > +                           !(vma->vm_flags & VM_EXEC))
> > +                               continue;
> > +
> > +                       if (text_vma) {
> > +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> > +                                            current->comm, current->pid);
> 
> is this just something that fundamentally can't be supported by SFrame
> format? Or just an implementation simplification?

It's a simplification I suppose.

> It's not illegal to have an executable with multiple VM_EXEC segments,
> no? Should this be a pr_warn_once() then?

I don't know, is it allowed?  I've never seen it in practice.  The
pr_warn_once() is not reporting that it's illegal but rather that this
corner case actually exists and maybe needs to be looked at.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 3 days ago
On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
> > It feels like this patch is trying to do too much. There is both new
> > UAPI introduction, and SFrame format definition, and unwinder
> > integration, etc, etc. Do you think it can be split further into more
> > focused smaller patches?
>
> True, let me see if I can split it up.
>
> > > +
> > > +                       if ((eppnt->p_flags & PF_X) && k < start_code)
> > > +                               start_code = k;
> > > +
> > > +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> > > +                               end_code = k + eppnt->p_filesz;
> > > +                       break;
> > > +               }
> > > +               case PT_GNU_SFRAME:
> > > +                       sframe_phdr = eppnt;
> >
> > if I understand correctly, there has to be only one sframe, is that
> > right? Should we validate that?
>
> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable
> itself.  I can validate that.
>
> > > +                       break;
> > >                 }
> > >         }
> > >
> > > +       if (sframe_phdr)
> > > +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> > > +                                  start_code, end_code);
> > > +
> >
> > no error checking?
>
> Good point.  I remember discussing this with some people at Cauldon/LPC,
> I just forgot to do it!
>
> Right now it does all the validation at unwind, which could really slow
> things down unnecessarily if the sframe isn't valid.
>
> > > +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> > > +
> > > +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> > > +
> > > +extern void sframe_free_mm(struct mm_struct *mm);
> > > +
> > > +/* text_start, text_end, file_name are optional */
> >
> > what file_name? was that an extra argument that got removed?
>
> Indeed, that was for some old code.
>
> > >         case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> > >                 error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> > >                 break;
> > > +       case PR_ADD_SFRAME:
> > > +               if (arg5)
> > > +                       return -EINVAL;
> > > +               error = sframe_add_section(arg2, arg3, arg4);
> >
> > wouldn't it be better to make this interface extendable from the get
> > go? Instead of passing 3 arguments with fixed meaning, why not pass a
> > pointer to an extendable binary struct like seems to be the trend
> > nowadays with nicely extensible APIs. See [0] for one such example
> > (specifically, struct procmap_query). Seems more prudent, as we'll
> > most probably will be adding flags, options, extra information, etc)
> >
> >   [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/
>
> This ioctl interface was admittedly hacked together.  I was hoping
> somebody would suggest something better :-)  I'll take a look.
>
> > > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > > +                   struct sframe_fde *fde)
> > > +{
> > > +       struct sframe_fde __user *first, *last, *found = NULL;
> > > +       u32 ip_off, func_off_low = 0, func_off_high = -1;
> > > +
> > > +       ip_off = ip - sec->sframe_addr;
> >
> > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
>
> That's baked into sframe v2.

I believe we do have large production binaries with more than 4GB of
text, what are we going to do about them? It would be interesting to
hear sframe people's opinion. Adding such a far-reaching new format in
2024 with these limitations is kind of sad. At the very least maybe we
should allow some form of chaining sframe definitions to cover more
than 4GB segments? Please CC relevant folks, I'm wondering what
they're thinking about this.

>
> > and also, does it mean that SFrame doesn't support executables with
> > text bigger than 4GB?
>
> Yes, but is that a realistic concern?

See above, yes. You'd be surprised. As somewhat corroborating
evidence, there were tons of problems and churn (within at least Meta)
with DWARF not supporting more than 2GB sizes, so yes, this is not an
abstract problem for sure. Modern production applications can be
ridiculously big.

>
> > > +       } else {
> > > +               struct vm_area_struct *vma, *text_vma = NULL;
> > > +               VMA_ITERATOR(vmi, mm, 0);
> > > +
> > > +               for_each_vma(vmi, vma) {
> > > +                       if (vma->vm_file != sframe_vma->vm_file ||
> > > +                           !(vma->vm_flags & VM_EXEC))
> > > +                               continue;
> > > +
> > > +                       if (text_vma) {
> > > +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> > > +                                            current->comm, current->pid);
> >
> > is this just something that fundamentally can't be supported by SFrame
> > format? Or just an implementation simplification?
>
> It's a simplification I suppose.

That's a rather random limitation, IMO... How hard would it be to not
make that assumption?

>
> > It's not illegal to have an executable with multiple VM_EXEC segments,
> > no? Should this be a pr_warn_once() then?
>
> I don't know, is it allowed?  I've never seen it in practice.  The

I'm pretty sure you can do that with a custom linker script, at the
very least. Normally this probably won't happen, but I don't think
Linux dictates how many executable VMAs an application can have. And
it probably just naturally happens for JIT-ted applications (Java, Go,
etc).

Linux kernel itself has two executable segments, for instance (though
kernel is special, of course, but still).

> pr_warn_once() is not reporting that it's illegal but rather that this
> corner case actually exists and maybe needs to be looked at.

This warn() will be logged across millions of machines in the fleet,
triggering alarms, people looking at this, making custom internal
patches to disable the known-to-happen warn. Why do we need all this?
This is an issue that is trivial to trigger by user process that's not
doing anything illegal. Why?

>
> --
> Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 3 days ago
On Thu, Oct 31, 2024 at 01:57:10PM -0700, Andrii Nakryiko wrote:
> > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
> >
> > That's baked into sframe v2.
> 
> I believe we do have large production binaries with more than 4GB of
> text, what are we going to do about them? It would be interesting to
> hear sframe people's opinion. Adding such a far-reaching new format in
> 2024 with these limitations is kind of sad. At the very least maybe we
> should allow some form of chaining sframe definitions to cover more
> than 4GB segments? Please CC relevant folks, I'm wondering what
> they're thinking about this.

Personally I find the idea of a single 4GB+ text segment pretty
surprising as I've never seen anything even close to that.

Anyway it's iterative development and not everybody's requirements are
clear from day 1.  Which is why we're discussing it now.  I think there
are already plans to do an sframe v3.

> > > > +                       if (text_vma) {
> > > > +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> > > > +                                            current->comm, current->pid);
> > >
> > > is this just something that fundamentally can't be supported by SFrame
> > > format? Or just an implementation simplification?
> >
> > It's a simplification I suppose.
> 
> That's a rather random limitation, IMO... How hard would it be to not
> make that assumption?

It's definitely not random, there's no need to complicate the code if
this condition doesn't exist.

> > > It's not illegal to have an executable with multiple VM_EXEC segments,
> > > no? Should this be a pr_warn_once() then?
> >
> > I don't know, is it allowed?  I've never seen it in practice.  The
> 
> I'm pretty sure you can do that with a custom linker script, at the
> very least. Normally this probably won't happen, but I don't think
> Linux dictates how many executable VMAs an application can have.
> And it probably just naturally happens for JIT-ted applications (Java,
> Go, etc).

Actually I just double checked and even the kernel's ELF loader assumes
that each executable has only a single text start+end address pair.

> > pr_warn_once() is not reporting that it's illegal but rather that this
> > corner case actually exists and maybe needs to be looked at.
> 
> This warn() will be logged across millions of machines in the fleet,
> triggering alarms, people looking at this, making custom internal
> patches to disable the known-to-happen warn. Why do we need all this?
> This is an issue that is trivial to trigger by user process that's not
> doing anything illegal. Why?

There's no point in adding complexity to support some hypothetical.  I
can remove the printk though.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Segher Boessenkool 3 weeks, 2 days ago
On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote:
> Personally I find the idea of a single 4GB+ text segment pretty
> surprising as I've never seen anything even close to that.

It is pretty common I'm afraid.

> Actually I just double checked and even the kernel's ELF loader assumes
> that each executable has only a single text start+end address pair.

Huh?  What makes you think that?  There can be many executable PT_LOAD
segments in each and every binary.


Segher
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote:
> On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote:
> > Actually I just double checked and even the kernel's ELF loader assumes
> > that each executable has only a single text start+end address pair.
> 
> Huh?  What makes you think that?  There can be many executable PT_LOAD
> segments in each and every binary.

Right, but for executables (not shared libraries) the kernel seems to
assume they're contiguous?  See the 'start_code' and 'end_code'
variables in load_elf_binary() load_elf_interp().

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote:
> On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote:
> > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote:
> > > Actually I just double checked and even the kernel's ELF loader assumes
> > > that each executable has only a single text start+end address pair.
> > 
> > Huh?  What makes you think that?  There can be many executable PT_LOAD
> > segments in each and every binary.
> 
> Right, but for executables (not shared libraries) the kernel seems to
> assume they're contiguous?  See the 'start_code' and 'end_code'
> variables in load_elf_binary() load_elf_interp().

Typo, see load_elf_binary (not load_elf_interp).

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 12:35:13PM -0700, Josh Poimboeuf wrote:
> On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote:
> > On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote:
> > > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote:
> > > > Actually I just double checked and even the kernel's ELF loader assumes
> > > > that each executable has only a single text start+end address pair.
> > > 
> > > Huh?  What makes you think that?  There can be many executable PT_LOAD
> > > segments in each and every binary.
> > 
> > Right, but for executables (not shared libraries) the kernel seems to
> > assume they're contiguous?  See the 'start_code' and 'end_code'
> > variables in load_elf_binary() load_elf_interp().
> 
> Typo, see load_elf_binary (not load_elf_interp).

Hm, actually AFAICT that's only for reporting things in sysfs/proc.  So
maybe it's assumed but not really "enforced".

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Segher Boessenkool 3 weeks, 2 days ago
Hi!

On Fri, Nov 01, 2024 at 12:48:00PM -0700, Josh Poimboeuf wrote:
> On Fri, Nov 01, 2024 at 12:35:13PM -0700, Josh Poimboeuf wrote:
> > On Fri, Nov 01, 2024 at 12:33:07PM -0700, Josh Poimboeuf wrote:
> > > On Fri, Nov 01, 2024 at 02:09:08PM -0500, Segher Boessenkool wrote:
> > > > On Thu, Oct 31, 2024 at 04:03:13PM -0700, Josh Poimboeuf wrote:
> > > > > Actually I just double checked and even the kernel's ELF loader assumes
> > > > > that each executable has only a single text start+end address pair.
> > > > 
> > > > Huh?  What makes you think that?  There can be many executable PT_LOAD
> > > > segments in each and every binary.
> > > 
> > > Right, but for executables (not shared libraries) the kernel seems to
> > > assume they're contiguous?  See the 'start_code' and 'end_code'
> > > variables in load_elf_binary() load_elf_interp().
> > 
> > Typo, see load_elf_binary (not load_elf_interp).
> 
> Hm, actually AFAICT that's only for reporting things in sysfs/proc.  So
> maybe it's assumed but not really "enforced".

Yes, this is copied to mm->start_code (etc.)  This isn't used for
anything very important it seems, it seems to be a leftover from when we
only had really simple binfmts?  For a.out it did actually make sense,
for example :-)


Segher
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 2 days ago
+cc bpf@ where people care about stack traces and profiling as well
(please cc bpf@vger.kernel.org for next revisions as well, I'm sure a
bunch of folks would appreciate it and have something useful to say)

On Thu, Oct 31, 2024 at 4:03 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Thu, Oct 31, 2024 at 01:57:10PM -0700, Andrii Nakryiko wrote:
> > > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
> > >
> > > That's baked into sframe v2.
> >
> > I believe we do have large production binaries with more than 4GB of
> > text, what are we going to do about them? It would be interesting to
> > hear sframe people's opinion. Adding such a far-reaching new format in
> > 2024 with these limitations is kind of sad. At the very least maybe we
> > should allow some form of chaining sframe definitions to cover more
> > than 4GB segments? Please CC relevant folks, I'm wondering what
> > they're thinking about this.
>
> Personally I find the idea of a single 4GB+ text segment pretty
> surprising as I've never seen anything even close to that.

I grabbed one of Meta production servers running one of the big-ish (I
don't know if it's even the largest, most probably not) service. Here
are first few sections from /proc/pid/maps belonging to the main
binary:

00200000-170ad000 r--p 00000000 07:01 5
172ac000-498e7000 r-xp 16eac000 07:01 5
49ae7000-49b8b000 r--p 494e7000 07:01 5
49d8b000-4a228000 rw-p 4958b000 07:01 5
4a228000-4c677000 rw-p 00000000 00:00 0
4c800000-4ca00000 r-xp 49c00000 07:01 5
4ca00000-4f600000 r-xp 49e00000 07:01 5
4f600000-5b270000 r-xp 4ca00000 07:01 5

Few observations:

1) There are 4 executable segments in just the first 8 entries.
2) Their total size is already approaching 1.5GB:

>>> ((0x170ad000 - 0x200000) + (0x5b270000 - 0x4f600000) + (0x498e7000 - 0x172ac000)) / 1024 / 1024
1361.34375

I don't know about you, but from my experience things like code size
tend to just grow over time, rarely it shrinks (and even that usually
requires tremendous and focused efforts).

>
> Anyway it's iterative development and not everybody's requirements are
> clear from day 1.  Which is why we're discussing it now.  I think there
> are already plans to do an sframe v3.

Of course, which is why I'm providing this feedback. But it would be
nice to avoid having to support a zoo of versions if we already know
there are practical limitations that we are not that far from hitting.

>
> > > > > +                       if (text_vma) {
> > > > > +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> > > > > +                                            current->comm, current->pid);
> > > >
> > > > is this just something that fundamentally can't be supported by SFrame
> > > > format? Or just an implementation simplification?
> > >
> > > It's a simplification I suppose.
> >
> > That's a rather random limitation, IMO... How hard would it be to not
> > make that assumption?
>
> It's definitely not random, there's no need to complicate the code if
> this condition doesn't exist.

Sorry, I'm probably dense and missing something. But from the example
process above, isn't this check violated already? Or it's two
different things? Not sure, honestly.

>
> > > > It's not illegal to have an executable with multiple VM_EXEC segments,
> > > > no? Should this be a pr_warn_once() then?
> > >
> > > I don't know, is it allowed?  I've never seen it in practice.  The
> >
> > I'm pretty sure you can do that with a custom linker script, at the
> > very least. Normally this probably won't happen, but I don't think
> > Linux dictates how many executable VMAs an application can have.
> > And it probably just naturally happens for JIT-ted applications (Java,
> > Go, etc).
>
> Actually I just double checked and even the kernel's ELF loader assumes
> that each executable has only a single text start+end address pair.

See above, very confused by such assumptions, but I'm hoping we are
talking about two different things here.

>
> > > pr_warn_once() is not reporting that it's illegal but rather that this
> > > corner case actually exists and maybe needs to be looked at.
> >
> > This warn() will be logged across millions of machines in the fleet,
> > triggering alarms, people looking at this, making custom internal
> > patches to disable the known-to-happen warn. Why do we need all this?
> > This is an issue that is trivial to trigger by user process that's not
> > doing anything illegal. Why?
>
> There's no point in adding complexity to support some hypothetical.  I
> can remove the printk though.

We are talking about fundamental things like format for supporting
frame pointer-less stack trace capture. It will take years to adopt
SFrame everywhere, so I think it's prudent to think a bit ahead beyond
just saying "no real application should need more than 4GB text", IMO.

>
> --
> Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote:
> 00200000-170ad000 r--p 00000000 07:01 5
> 172ac000-498e7000 r-xp 16eac000 07:01 5
> 49ae7000-49b8b000 r--p 494e7000 07:01 5
> 49d8b000-4a228000 rw-p 4958b000 07:01 5
> 4a228000-4c677000 rw-p 00000000 00:00 0
> 4c800000-4ca00000 r-xp 49c00000 07:01 5
> 4ca00000-4f600000 r-xp 49e00000 07:01 5
> 4f600000-5b270000 r-xp 4ca00000 07:01 5
>
> Sorry, I'm probably dense and missing something. But from the example
> process above, isn't this check violated already? Or it's two
> different things? Not sure, honestly.

It's hard to tell exactly what's going on, did you strip the file names?

The sframe limitation is per file, not per address space.  I assume
these are one file:

> 172ac000-498e7000 r-xp 16eac000 07:01 5

and these are another:

> 4c800000-4ca00000 r-xp 49c00000 07:01 5
> 4ca00000-4f600000 r-xp 49e00000 07:01 5
> 4f600000-5b270000 r-xp 4ca00000 07:01 5

Multiple mappings for a single file is fine, as long as they're
contiguous.

> > Actually I just double checked and even the kernel's ELF loader assumes
> > that each executable has only a single text start+end address pair.
> 
> See above, very confused by such assumptions, but I'm hoping we are
> talking about two different things here.

The "contiguous text" thing seems enforced by the kernel for
executables.  However it doesn't manage shared libraries, those are
mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2.

At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces
that.

> > There's no point in adding complexity to support some hypothetical.  I
> > can remove the printk though.
> 
> We are talking about fundamental things like format for supporting
> frame pointer-less stack trace capture. It will take years to adopt
> SFrame everywhere, so I think it's prudent to think a bit ahead beyond
> just saying "no real application should need more than 4GB text", IMO.

I don't think anybody is saying that...

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 2 days ago
On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote:
> > 00200000-170ad000 r--p 00000000 07:01 5
> > 172ac000-498e7000 r-xp 16eac000 07:01 5
> > 49ae7000-49b8b000 r--p 494e7000 07:01 5
> > 49d8b000-4a228000 rw-p 4958b000 07:01 5
> > 4a228000-4c677000 rw-p 00000000 00:00 0
> > 4c800000-4ca00000 r-xp 49c00000 07:01 5
> > 4ca00000-4f600000 r-xp 49e00000 07:01 5
> > 4f600000-5b270000 r-xp 4ca00000 07:01 5
> >
> > Sorry, I'm probably dense and missing something. But from the example
> > process above, isn't this check violated already? Or it's two
> > different things? Not sure, honestly.
>
> It's hard to tell exactly what's going on, did you strip the file names?

Yes, I did, of course. But as I said, they all belong to the same main
binary of the process.

>
> The sframe limitation is per file, not per address space.  I assume
> these are one file:
>
> > 172ac000-498e7000 r-xp 16eac000 07:01 5
>
> and these are another:
>
> > 4c800000-4ca00000 r-xp 49c00000 07:01 5
> > 4ca00000-4f600000 r-xp 49e00000 07:01 5
> > 4f600000-5b270000 r-xp 4ca00000 07:01 5
>
> Multiple mappings for a single file is fine, as long as they're
> contiguous.

No all of what I posted above belongs to the same file (except
"4a228000-4c677000 rw-p 00000000 00:00 0" which doesn't have
associated file, but I suspect it originally was part of this file, we
do some tricks with re-mmap()'ing stuff due to huge pages usage).

>
> > > Actually I just double checked and even the kernel's ELF loader assumes
> > > that each executable has only a single text start+end address pair.
> >
> > See above, very confused by such assumptions, but I'm hoping we are
> > talking about two different things here.
>
> The "contiguous text" thing seems enforced by the kernel for
> executables.  However it doesn't manage shared libraries, those are
> mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2.
>
> At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces
> that.
>
> > > There's no point in adding complexity to support some hypothetical.  I
> > > can remove the printk though.
> >
> > We are talking about fundamental things like format for supporting
> > frame pointer-less stack trace capture. It will take years to adopt
> > SFrame everywhere, so I think it's prudent to think a bit ahead beyond
> > just saying "no real application should need more than 4GB text", IMO.
>
> I don't think anybody is saying that...
>
> --
> Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 2 days ago
On Fri, Nov 1, 2024 at 12:44 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote:
> > > 00200000-170ad000 r--p 00000000 07:01 5
> > > 172ac000-498e7000 r-xp 16eac000 07:01 5
> > > 49ae7000-49b8b000 r--p 494e7000 07:01 5
> > > 49d8b000-4a228000 rw-p 4958b000 07:01 5
> > > 4a228000-4c677000 rw-p 00000000 00:00 0
> > > 4c800000-4ca00000 r-xp 49c00000 07:01 5
> > > 4ca00000-4f600000 r-xp 49e00000 07:01 5
> > > 4f600000-5b270000 r-xp 4ca00000 07:01 5
> > >

I should have maybe posted this in this form:

00200000-170ad000 r--p 00000000 07:01 5  /packages/obfuscated_file
172ac000-498e7000 r-xp 16eac000 07:01 5  /packages/obfuscated_file
49ae7000-49b8b000 r--p 494e7000 07:01 5  /packages/obfuscated_file
49d8b000-4a228000 rw-p 4958b000 07:01 5  /packages/obfuscated_file
4a228000-4c677000 rw-p 00000000 00:00 0
4c800000-4ca00000 r-xp 49c00000 07:01 5  /packages/obfuscated_file
4ca00000-4f600000 r-xp 49e00000 07:01 5  /packages/obfuscated_file
4f600000-5b270000 r-xp 4ca00000 07:01 5  /packages/obfuscated_file

Those paths are pointing to the same binary.


> > > Sorry, I'm probably dense and missing something. But from the example
> > > process above, isn't this check violated already? Or it's two
> > > different things? Not sure, honestly.
> >
> > It's hard to tell exactly what's going on, did you strip the file names?
>
> Yes, I did, of course. But as I said, they all belong to the same main
> binary of the process.
>
> >
> > The sframe limitation is per file, not per address space.  I assume
> > these are one file:
> >
> > > 172ac000-498e7000 r-xp 16eac000 07:01 5
> >
> > and these are another:
> >
> > > 4c800000-4ca00000 r-xp 49c00000 07:01 5
> > > 4ca00000-4f600000 r-xp 49e00000 07:01 5
> > > 4f600000-5b270000 r-xp 4ca00000 07:01 5
> >
> > Multiple mappings for a single file is fine, as long as they're
> > contiguous.
>
> No all of what I posted above belongs to the same file (except
> "4a228000-4c677000 rw-p 00000000 00:00 0" which doesn't have
> associated file, but I suspect it originally was part of this file, we
> do some tricks with re-mmap()'ing stuff due to huge pages usage).
>
> >
> > > > Actually I just double checked and even the kernel's ELF loader assumes
> > > > that each executable has only a single text start+end address pair.
> > >
> > > See above, very confused by such assumptions, but I'm hoping we are
> > > talking about two different things here.
> >
> > The "contiguous text" thing seems enforced by the kernel for
> > executables.  However it doesn't manage shared libraries, those are
> > mapped by the loader, e.g. /lib64/ld-linux-x86-64.so.2.
> >
> > At a quick glance I can't tell if /lib64/ld-linux-x86-64.so.2 enforces
> > that.
> >
> > > > There's no point in adding complexity to support some hypothetical.  I
> > > > can remove the printk though.
> > >
> > > We are talking about fundamental things like format for supporting
> > > frame pointer-less stack trace capture. It will take years to adopt
> > > SFrame everywhere, so I think it's prudent to think a bit ahead beyond
> > > just saying "no real application should need more than 4GB text", IMO.
> >
> > I don't think anybody is saying that...
> >
> > --
> > Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 2 days ago
On Fri, Nov 01, 2024 at 12:46:09PM -0700, Andrii Nakryiko wrote:
> On Fri, Nov 1, 2024 at 12:44 PM Andrii Nakryiko
> <andrii.nakryiko@gmail.com> wrote:
> >
> > On Fri, Nov 1, 2024 at 12:29 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > >
> > > On Fri, Nov 01, 2024 at 11:34:48AM -0700, Andrii Nakryiko wrote:
> > > > 00200000-170ad000 r--p 00000000 07:01 5
> > > > 172ac000-498e7000 r-xp 16eac000 07:01 5
> > > > 49ae7000-49b8b000 r--p 494e7000 07:01 5
> > > > 49d8b000-4a228000 rw-p 4958b000 07:01 5
> > > > 4a228000-4c677000 rw-p 00000000 00:00 0
> > > > 4c800000-4ca00000 r-xp 49c00000 07:01 5
> > > > 4ca00000-4f600000 r-xp 49e00000 07:01 5
> > > > 4f600000-5b270000 r-xp 4ca00000 07:01 5
> > > >
> 
> I should have maybe posted this in this form:
> 
> 00200000-170ad000 r--p 00000000 07:01 5  /packages/obfuscated_file
> 172ac000-498e7000 r-xp 16eac000 07:01 5  /packages/obfuscated_file
> 49ae7000-49b8b000 r--p 494e7000 07:01 5  /packages/obfuscated_file
> 49d8b000-4a228000 rw-p 4958b000 07:01 5  /packages/obfuscated_file
> 4a228000-4c677000 rw-p 00000000 00:00 0
> 4c800000-4ca00000 r-xp 49c00000 07:01 5  /packages/obfuscated_file
> 4ca00000-4f600000 r-xp 49e00000 07:01 5  /packages/obfuscated_file
> 4f600000-5b270000 r-xp 4ca00000 07:01 5  /packages/obfuscated_file
> 
> Those paths are pointing to the same binary.

Ok, thanks for sharing that.  I'll add in support for noncontiguous
text.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Indu Bhagat 3 weeks, 3 days ago
On 10/31/24 1:57 PM, Andrii Nakryiko wrote:
> On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>>
>> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
>>> It feels like this patch is trying to do too much. There is both new
>>> UAPI introduction, and SFrame format definition, and unwinder
>>> integration, etc, etc. Do you think it can be split further into more
>>> focused smaller patches?
>>
>> True, let me see if I can split it up.
>>
>>>> +
>>>> +                       if ((eppnt->p_flags & PF_X) && k < start_code)
>>>> +                               start_code = k;
>>>> +
>>>> +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
>>>> +                               end_code = k + eppnt->p_filesz;
>>>> +                       break;
>>>> +               }
>>>> +               case PT_GNU_SFRAME:
>>>> +                       sframe_phdr = eppnt;
>>>
>>> if I understand correctly, there has to be only one sframe, is that
>>> right? Should we validate that?
>>
>> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable
>> itself.  I can validate that.
>>
>>>> +                       break;
>>>>                  }
>>>>          }
>>>>
>>>> +       if (sframe_phdr)
>>>> +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
>>>> +                                  start_code, end_code);
>>>> +
>>>
>>> no error checking?
>>
>> Good point.  I remember discussing this with some people at Cauldon/LPC,
>> I just forgot to do it!
>>
>> Right now it does all the validation at unwind, which could really slow
>> things down unnecessarily if the sframe isn't valid.
>>
>>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
>>>> +
>>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
>>>> +
>>>> +extern void sframe_free_mm(struct mm_struct *mm);
>>>> +
>>>> +/* text_start, text_end, file_name are optional */
>>>
>>> what file_name? was that an extra argument that got removed?
>>
>> Indeed, that was for some old code.
>>
>>>>          case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>>>>                  error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>>>>                  break;
>>>> +       case PR_ADD_SFRAME:
>>>> +               if (arg5)
>>>> +                       return -EINVAL;
>>>> +               error = sframe_add_section(arg2, arg3, arg4);
>>>
>>> wouldn't it be better to make this interface extendable from the get
>>> go? Instead of passing 3 arguments with fixed meaning, why not pass a
>>> pointer to an extendable binary struct like seems to be the trend
>>> nowadays with nicely extensible APIs. See [0] for one such example
>>> (specifically, struct procmap_query). Seems more prudent, as we'll
>>> most probably will be adding flags, options, extra information, etc)
>>>
>>>    [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/
>>
>> This ioctl interface was admittedly hacked together.  I was hoping
>> somebody would suggest something better :-)  I'll take a look.
>>
>>>> +static int find_fde(struct sframe_section *sec, unsigned long ip,
>>>> +                   struct sframe_fde *fde)
>>>> +{
>>>> +       struct sframe_fde __user *first, *last, *found = NULL;
>>>> +       u32 ip_off, func_off_low = 0, func_off_high = -1;
>>>> +
>>>> +       ip_off = ip - sec->sframe_addr;
>>>
>>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
>>
>> That's baked into sframe v2.
> 
> I believe we do have large production binaries with more than 4GB of
> text, what are we going to do about them? It would be interesting to
> hear sframe people's opinion. Adding such a far-reaching new format in
> 2024 with these limitations is kind of sad. At the very least maybe we
> should allow some form of chaining sframe definitions to cover more
> than 4GB segments? Please CC relevant folks, I'm wondering what
> they're thinking about this.
> 

SFrame V2 does have that limitation. We can try to have 64-bit 
representation for the 'ip' in the SFrame FDE and conditionalize it 
somehow (say, with a flag in the header) so as to not bloat the majority 
of applications.

>>
>>> and also, does it mean that SFrame doesn't support executables with
>>> text bigger than 4GB?
>>
>> Yes, but is that a realistic concern?
> 
> See above, yes. You'd be surprised. As somewhat corroborating
> evidence, there were tons of problems and churn (within at least Meta)
> with DWARF not supporting more than 2GB sizes, so yes, this is not an
> abstract problem for sure. Modern production applications can be
> ridiculously big.
> 
>>
>>>> +       } else {
>>>> +               struct vm_area_struct *vma, *text_vma = NULL;
>>>> +               VMA_ITERATOR(vmi, mm, 0);
>>>> +
>>>> +               for_each_vma(vmi, vma) {
>>>> +                       if (vma->vm_file != sframe_vma->vm_file ||
>>>> +                           !(vma->vm_flags & VM_EXEC))
>>>> +                               continue;
>>>> +
>>>> +                       if (text_vma) {
>>>> +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
>>>> +                                            current->comm, current->pid);
>>>
>>> is this just something that fundamentally can't be supported by SFrame
>>> format? Or just an implementation simplification?
>>
>> It's a simplification I suppose.
> 
> That's a rather random limitation, IMO... How hard would it be to not
> make that assumption?
> 
>>
>>> It's not illegal to have an executable with multiple VM_EXEC segments,
>>> no? Should this be a pr_warn_once() then?
>>
>> I don't know, is it allowed?  I've never seen it in practice.  The
> 
> I'm pretty sure you can do that with a custom linker script, at the
> very least. Normally this probably won't happen, but I don't think
> Linux dictates how many executable VMAs an application can have. And
> it probably just naturally happens for JIT-ted applications (Java, Go,
> etc).
> 
> Linux kernel itself has two executable segments, for instance (though
> kernel is special, of course, but still).
> 
>> pr_warn_once() is not reporting that it's illegal but rather that this
>> corner case actually exists and maybe needs to be looked at.
> 
> This warn() will be logged across millions of machines in the fleet,
> triggering alarms, people looking at this, making custom internal
> patches to disable the known-to-happen warn. Why do we need all this?
> This is an issue that is trivial to trigger by user process that's not
> doing anything illegal. Why?
> 
>>
>> --
>> Josh

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 2 days ago
On Thu, Oct 31, 2024 at 2:38 PM Indu Bhagat <indu.bhagat@oracle.com> wrote:
>
> On 10/31/24 1:57 PM, Andrii Nakryiko wrote:
> > On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >>
> >> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
> >>> It feels like this patch is trying to do too much. There is both new
> >>> UAPI introduction, and SFrame format definition, and unwinder
> >>> integration, etc, etc. Do you think it can be split further into more
> >>> focused smaller patches?
> >>
> >> True, let me see if I can split it up.
> >>
> >>>> +
> >>>> +                       if ((eppnt->p_flags & PF_X) && k < start_code)
> >>>> +                               start_code = k;
> >>>> +
> >>>> +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> >>>> +                               end_code = k + eppnt->p_filesz;
> >>>> +                       break;
> >>>> +               }
> >>>> +               case PT_GNU_SFRAME:
> >>>> +                       sframe_phdr = eppnt;
> >>>
> >>> if I understand correctly, there has to be only one sframe, is that
> >>> right? Should we validate that?
> >>
> >> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable
> >> itself.  I can validate that.
> >>
> >>>> +                       break;
> >>>>                  }
> >>>>          }
> >>>>
> >>>> +       if (sframe_phdr)
> >>>> +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
> >>>> +                                  start_code, end_code);
> >>>> +
> >>>
> >>> no error checking?
> >>
> >> Good point.  I remember discussing this with some people at Cauldon/LPC,
> >> I just forgot to do it!
> >>
> >> Right now it does all the validation at unwind, which could really slow
> >> things down unnecessarily if the sframe isn't valid.
> >>
> >>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
> >>>> +
> >>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
> >>>> +
> >>>> +extern void sframe_free_mm(struct mm_struct *mm);
> >>>> +
> >>>> +/* text_start, text_end, file_name are optional */
> >>>
> >>> what file_name? was that an extra argument that got removed?
> >>
> >> Indeed, that was for some old code.
> >>
> >>>>          case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> >>>>                  error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> >>>>                  break;
> >>>> +       case PR_ADD_SFRAME:
> >>>> +               if (arg5)
> >>>> +                       return -EINVAL;
> >>>> +               error = sframe_add_section(arg2, arg3, arg4);
> >>>
> >>> wouldn't it be better to make this interface extendable from the get
> >>> go? Instead of passing 3 arguments with fixed meaning, why not pass a
> >>> pointer to an extendable binary struct like seems to be the trend
> >>> nowadays with nicely extensible APIs. See [0] for one such example
> >>> (specifically, struct procmap_query). Seems more prudent, as we'll
> >>> most probably will be adding flags, options, extra information, etc)
> >>>
> >>>    [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/
> >>
> >> This ioctl interface was admittedly hacked together.  I was hoping
> >> somebody would suggest something better :-)  I'll take a look.
> >>
> >>>> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> >>>> +                   struct sframe_fde *fde)
> >>>> +{
> >>>> +       struct sframe_fde __user *first, *last, *found = NULL;
> >>>> +       u32 ip_off, func_off_low = 0, func_off_high = -1;
> >>>> +
> >>>> +       ip_off = ip - sec->sframe_addr;
> >>>
> >>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
> >>
> >> That's baked into sframe v2.
> >
> > I believe we do have large production binaries with more than 4GB of
> > text, what are we going to do about them? It would be interesting to
> > hear sframe people's opinion. Adding such a far-reaching new format in
> > 2024 with these limitations is kind of sad. At the very least maybe we
> > should allow some form of chaining sframe definitions to cover more
> > than 4GB segments? Please CC relevant folks, I'm wondering what
> > they're thinking about this.
> >
>
> SFrame V2 does have that limitation. We can try to have 64-bit
> representation for the 'ip' in the SFrame FDE and conditionalize it
> somehow (say, with a flag in the header) so as to not bloat the majority
> of applications.

Hi Indu,

I think that's prudent if we believe that SFrame is the solution here.
See my reply to Josh. Real-world already approach 4GB limits, and
things are not going to shrink in the years to come. So yeah, probably
we need some adjustments to the format to at least allow 64-bit
offsets (though trying to stick to 32-bit as much as possible, of
course, if they work).

I'm not really familiar with the nuances of the format just yet, so
can't really provide anything more useful at this point. What would be
the sort of gold reference for Sframe format to familiarize myself
thoroughly?

BTW, I wanted to ask. Are there any plans to add SFrame support to
Clang as well? It feels like without that there is no future for
SFrame as a general-purpose solution for stack traces.

>
> >>
> >>> and also, does it mean that SFrame doesn't support executables with
> >>> text bigger than 4GB?
> >>
> >> Yes, but is that a realistic concern?
> >
> > See above, yes. You'd be surprised. As somewhat corroborating
> > evidence, there were tons of problems and churn (within at least Meta)
> > with DWARF not supporting more than 2GB sizes, so yes, this is not an
> > abstract problem for sure. Modern production applications can be
> > ridiculously big.
> >
> >>
> >>>> +       } else {
> >>>> +               struct vm_area_struct *vma, *text_vma = NULL;
> >>>> +               VMA_ITERATOR(vmi, mm, 0);
> >>>> +
> >>>> +               for_each_vma(vmi, vma) {
> >>>> +                       if (vma->vm_file != sframe_vma->vm_file ||
> >>>> +                           !(vma->vm_flags & VM_EXEC))
> >>>> +                               continue;
> >>>> +
> >>>> +                       if (text_vma) {
> >>>> +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> >>>> +                                            current->comm, current->pid);
> >>>
> >>> is this just something that fundamentally can't be supported by SFrame
> >>> format? Or just an implementation simplification?
> >>
> >> It's a simplification I suppose.
> >
> > That's a rather random limitation, IMO... How hard would it be to not
> > make that assumption?
> >
> >>
> >>> It's not illegal to have an executable with multiple VM_EXEC segments,
> >>> no? Should this be a pr_warn_once() then?
> >>
> >> I don't know, is it allowed?  I've never seen it in practice.  The
> >
> > I'm pretty sure you can do that with a custom linker script, at the
> > very least. Normally this probably won't happen, but I don't think
> > Linux dictates how many executable VMAs an application can have. And
> > it probably just naturally happens for JIT-ted applications (Java, Go,
> > etc).
> >
> > Linux kernel itself has two executable segments, for instance (though
> > kernel is special, of course, but still).
> >
> >> pr_warn_once() is not reporting that it's illegal but rather that this
> >> corner case actually exists and maybe needs to be looked at.
> >
> > This warn() will be logged across millions of machines in the fleet,
> > triggering alarms, people looking at this, making custom internal
> > patches to disable the known-to-happen warn. Why do we need all this?
> > This is an issue that is trivial to trigger by user process that's not
> > doing anything illegal. Why?
> >
> >>
> >> --
> >> Josh
>
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Indu Bhagat 3 weeks, 1 day ago
On 11/1/24 11:38 AM, Andrii Nakryiko wrote:
> On Thu, Oct 31, 2024 at 2:38 PM Indu Bhagat <indu.bhagat@oracle.com> wrote:
>>
>> On 10/31/24 1:57 PM, Andrii Nakryiko wrote:
>>> On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>>>>
>>>> On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
>>>>> It feels like this patch is trying to do too much. There is both new
>>>>> UAPI introduction, and SFrame format definition, and unwinder
>>>>> integration, etc, etc. Do you think it can be split further into more
>>>>> focused smaller patches?
>>>>
>>>> True, let me see if I can split it up.
>>>>
>>>>>> +
>>>>>> +                       if ((eppnt->p_flags & PF_X) && k < start_code)
>>>>>> +                               start_code = k;
>>>>>> +
>>>>>> +                       if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
>>>>>> +                               end_code = k + eppnt->p_filesz;
>>>>>> +                       break;
>>>>>> +               }
>>>>>> +               case PT_GNU_SFRAME:
>>>>>> +                       sframe_phdr = eppnt;
>>>>>
>>>>> if I understand correctly, there has to be only one sframe, is that
>>>>> right? Should we validate that?
>>>>
>>>> Yes, there shouldn't be more than one PT_GNU_SFRAME for the executable
>>>> itself.  I can validate that.
>>>>
>>>>>> +                       break;
>>>>>>                   }
>>>>>>           }
>>>>>>
>>>>>> +       if (sframe_phdr)
>>>>>> +               sframe_add_section(load_addr + sframe_phdr->p_vaddr,
>>>>>> +                                  start_code, end_code);
>>>>>> +
>>>>>
>>>>> no error checking?
>>>>
>>>> Good point.  I remember discussing this with some people at Cauldon/LPC,
>>>> I just forgot to do it!
>>>>
>>>> Right now it does all the validation at unwind, which could really slow
>>>> things down unnecessarily if the sframe isn't valid.
>>>>
>>>>>> +#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
>>>>>> +
>>>>>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0),
>>>>>> +
>>>>>> +extern void sframe_free_mm(struct mm_struct *mm);
>>>>>> +
>>>>>> +/* text_start, text_end, file_name are optional */
>>>>>
>>>>> what file_name? was that an extra argument that got removed?
>>>>
>>>> Indeed, that was for some old code.
>>>>
>>>>>>           case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>>>>>>                   error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>>>>>>                   break;
>>>>>> +       case PR_ADD_SFRAME:
>>>>>> +               if (arg5)
>>>>>> +                       return -EINVAL;
>>>>>> +               error = sframe_add_section(arg2, arg3, arg4);
>>>>>
>>>>> wouldn't it be better to make this interface extendable from the get
>>>>> go? Instead of passing 3 arguments with fixed meaning, why not pass a
>>>>> pointer to an extendable binary struct like seems to be the trend
>>>>> nowadays with nicely extensible APIs. See [0] for one such example
>>>>> (specifically, struct procmap_query). Seems more prudent, as we'll
>>>>> most probably will be adding flags, options, extra information, etc)
>>>>>
>>>>>     [0] https://lore.kernel.org/linux-mm/20240627170900.1672542-3-andrii@kernel.org/
>>>>
>>>> This ioctl interface was admittedly hacked together.  I was hoping
>>>> somebody would suggest something better :-)  I'll take a look.
>>>>
>>>>>> +static int find_fde(struct sframe_section *sec, unsigned long ip,
>>>>>> +                   struct sframe_fde *fde)
>>>>>> +{
>>>>>> +       struct sframe_fde __user *first, *last, *found = NULL;
>>>>>> +       u32 ip_off, func_off_low = 0, func_off_high = -1;
>>>>>> +
>>>>>> +       ip_off = ip - sec->sframe_addr;
>>>>>
>>>>> what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
>>>>
>>>> That's baked into sframe v2.
>>>
>>> I believe we do have large production binaries with more than 4GB of
>>> text, what are we going to do about them? It would be interesting to
>>> hear sframe people's opinion. Adding such a far-reaching new format in
>>> 2024 with these limitations is kind of sad. At the very least maybe we
>>> should allow some form of chaining sframe definitions to cover more
>>> than 4GB segments? Please CC relevant folks, I'm wondering what
>>> they're thinking about this.
>>>
>>
>> SFrame V2 does have that limitation. We can try to have 64-bit
>> representation for the 'ip' in the SFrame FDE and conditionalize it
>> somehow (say, with a flag in the header) so as to not bloat the majority
>> of applications.
> 
> Hi Indu,
> 
> I think that's prudent if we believe that SFrame is the solution here.
> See my reply to Josh. Real-world already approach 4GB limits, and
> things are not going to shrink in the years to come. So yeah, probably
> we need some adjustments to the format to at least allow 64-bit
> offsets (though trying to stick to 32-bit as much as possible, of
> course, if they work).
> 
> I'm not really familiar with the nuances of the format just yet, so
> can't really provide anything more useful at this point. What would be
> the sort of gold reference for Sframe format to familiarize myself
> thoroughly?
> 

There are some links on the SFrame wiki that can be helpful
https://sourceware.org/binutils/wiki/sframe

> BTW, I wanted to ask. Are there any plans to add SFrame support to
> Clang as well? It feels like without that there is no future for
> SFrame as a general-purpose solution for stack traces.
> 
>>
>>>>
>>>>> and also, does it mean that SFrame doesn't support executables with
>>>>> text bigger than 4GB?
>>>>
>>>> Yes, but is that a realistic concern?
>>>
>>> See above, yes. You'd be surprised. As somewhat corroborating
>>> evidence, there were tons of problems and churn (within at least Meta)
>>> with DWARF not supporting more than 2GB sizes, so yes, this is not an
>>> abstract problem for sure. Modern production applications can be
>>> ridiculously big.
>>>
>>>>
>>>>>> +       } else {
>>>>>> +               struct vm_area_struct *vma, *text_vma = NULL;
>>>>>> +               VMA_ITERATOR(vmi, mm, 0);
>>>>>> +
>>>>>> +               for_each_vma(vmi, vma) {
>>>>>> +                       if (vma->vm_file != sframe_vma->vm_file ||
>>>>>> +                           !(vma->vm_flags & VM_EXEC))
>>>>>> +                               continue;
>>>>>> +
>>>>>> +                       if (text_vma) {
>>>>>> +                               pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
>>>>>> +                                            current->comm, current->pid);
>>>>>
>>>>> is this just something that fundamentally can't be supported by SFrame
>>>>> format? Or just an implementation simplification?
>>>>
>>>> It's a simplification I suppose.
>>>
>>> That's a rather random limitation, IMO... How hard would it be to not
>>> make that assumption?
>>>
>>>>
>>>>> It's not illegal to have an executable with multiple VM_EXEC segments,
>>>>> no? Should this be a pr_warn_once() then?
>>>>
>>>> I don't know, is it allowed?  I've never seen it in practice.  The
>>>
>>> I'm pretty sure you can do that with a custom linker script, at the
>>> very least. Normally this probably won't happen, but I don't think
>>> Linux dictates how many executable VMAs an application can have. And
>>> it probably just naturally happens for JIT-ted applications (Java, Go,
>>> etc).
>>>
>>> Linux kernel itself has two executable segments, for instance (though
>>> kernel is special, of course, but still).
>>>
>>>> pr_warn_once() is not reporting that it's illegal but rather that this
>>>> corner case actually exists and maybe needs to be looked at.
>>>
>>> This warn() will be logged across millions of machines in the fleet,
>>> triggering alarms, people looking at this, making custom internal
>>> patches to disable the known-to-happen warn. Why do we need all this?
>>> This is an issue that is trivial to trigger by user process that's not
>>> doing anything illegal. Why?
>>>
>>>>
>>>> --
>>>> Josh
>>

Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Steven Rostedt 3 weeks, 2 days ago
On Fri, 1 Nov 2024 11:38:47 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> BTW, I wanted to ask. Are there any plans to add SFrame support to
> Clang as well? It feels like without that there is no future for
> SFrame as a general-purpose solution for stack traces.

We want to use SFrames inside Google, and having Clang support it is a
requirement for that. I'm working on getting people to support it in Clang.

-- Steve
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Andrii Nakryiko 3 weeks, 2 days ago
On Fri, Nov 1, 2024 at 11:46 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Fri, 1 Nov 2024 11:38:47 -0700
> Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:
>
> > BTW, I wanted to ask. Are there any plans to add SFrame support to
> > Clang as well? It feels like without that there is no future for
> > SFrame as a general-purpose solution for stack traces.
>
> We want to use SFrames inside Google, and having Clang support it is a
> requirement for that. I'm working on getting people to support it in Clang.
>

Nice, good to hear!

> -- Steve
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Nick Desaulniers 3 weeks, 3 days ago
On Thu, Oct 31, 2024 at 1:57 PM Andrii Nakryiko
<andrii.nakryiko@gmail.com> wrote:
>
> On Tue, Oct 29, 2024 at 10:53 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >
> > On Tue, Oct 29, 2024 at 04:32:40PM -0700, Andrii Nakryiko wrote:
> > > > +static int find_fde(struct sframe_section *sec, unsigned long ip,
> > > > +                   struct sframe_fde *fde)
> > > > +{
> > > > +       struct sframe_fde __user *first, *last, *found = NULL;
> > > > +       u32 ip_off, func_off_low = 0, func_off_high = -1;
> > > > +
> > > > +       ip_off = ip - sec->sframe_addr;
> > >
> > > what if ip_off is larger than 4GB? ELF section can be bigger than 4GB, right?
> >
> > That's baked into sframe v2.
>
> I believe we do have large production binaries with more than 4GB of
> text, what are we going to do about them? It would be interesting to
> hear sframe people's opinion. Adding such a far-reaching new format in
> 2024 with these limitations is kind of sad. At the very least maybe we
> should allow some form of chaining sframe definitions to cover more
> than 4GB segments? Please CC relevant folks, I'm wondering what
> they're thinking about this.

FWIW, Google has such large binaries, too.
https://llvm.org/devmtg/2023-10/slides/quicktalks/Eubanks-CompromisesWithLargeX86-64Binaries.pdf
-- 
Thanks,
~Nick Desaulniers
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Mon, Oct 28, 2024 at 02:47:36PM -0700, Josh Poimboeuf wrote:

> +static int __sframe_add_section(unsigned long sframe_addr,
> +				unsigned long text_start,
> +				unsigned long text_end)
> +{
> +	struct maple_tree *sframe_mt = &current->mm->sframe_mt;
> +	struct sframe_section *sec;
> +	struct sframe_header shdr;
> +	unsigned long header_end;
> +	int ret;
> +
> +	if (copy_from_user(&shdr, (void __user *)sframe_addr, sizeof(shdr)))
> +		return -EFAULT;
> +
> +	if (shdr.preamble.magic != SFRAME_MAGIC ||
> +	    shdr.preamble.version != SFRAME_VERSION_2 ||
> +	    !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> +	    shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
> +	    shdr.fdes_off > shdr.fres_off) {
> +		return -EINVAL;
> +	}
> +
> +	sec = kmalloc(sizeof(*sec), GFP_KERNEL);
> +	if (!sec)
> +		return -ENOMEM;
> +
> +	header_end = sframe_addr + SFRAME_HDR_SIZE(shdr);
> +
> +	sec->sframe_addr	= sframe_addr;
> +	sec->text_addr		= text_start;
> +	sec->fdes_addr		= header_end + shdr.fdes_off;
> +	sec->fres_addr		= header_end + shdr.fres_off;
> +	sec->fdes_nr		= shdr.num_fdes;
> +	sec->ra_off		= shdr.cfa_fixed_ra_offset;
> +	sec->fp_off		= shdr.cfa_fixed_fp_offset;
> +
> +	ret = mtree_insert_range(sframe_mt, text_start, text_end, sec, GFP_KERNEL);
> +	if (ret) {
> +		kfree(sec);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> +		       unsigned long text_end)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *sframe_vma;
> +
> +	mmap_read_lock(mm);

DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
	     mmap_read_lock(_T), mmap_read_unlock(_T))

in include/linux/mmap_lock.h ?

> +
> +	sframe_vma = vma_lookup(mm, sframe_addr);
> +	if (!sframe_vma)
> +		goto err_unlock;
> +
> +	if (text_start && text_end) {
> +		struct vm_area_struct *text_vma;
> +
> +		text_vma = vma_lookup(mm, text_start);
> +		if (!(text_vma->vm_flags & VM_EXEC))
> +			goto err_unlock;
> +
> +		if (PAGE_ALIGN(text_end) != text_vma->vm_end)
> +			goto err_unlock;
> +	} else {
> +		struct vm_area_struct *vma, *text_vma = NULL;
> +		VMA_ITERATOR(vmi, mm, 0);
> +
> +		for_each_vma(vmi, vma) {
> +			if (vma->vm_file != sframe_vma->vm_file ||
> +			    !(vma->vm_flags & VM_EXEC))
> +				continue;
> +
> +			if (text_vma) {
> +				pr_warn_once("%s[%d]: multiple EXEC segments unsupported\n",
> +					     current->comm, current->pid);
> +				goto err_unlock;
> +			}
> +
> +			text_vma = vma;
> +		}
> +
> +		if (!text_vma)
> +			goto err_unlock;
> +
> +		text_start = text_vma->vm_start;
> +		text_end   = text_vma->vm_end;
> +	}
> +
> +	mmap_read_unlock(mm);
> +
> +	return __sframe_add_section(sframe_addr, text_start, text_end);
> +
> +err_unlock:
> +	mmap_read_unlock(mm);
> +	return -EINVAL;
> +}

> +int sframe_remove_section(unsigned long sframe_addr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct sframe_section *sec;
> +	unsigned long index = 0;
> +
> +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> +		if (sec->sframe_addr == sframe_addr)
> +			return __sframe_remove_section(mm, sec);
> +	}
> +
> +	return -EINVAL;
> +}

> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/uidgid.h>
>  #include <linux/cred.h>
> +#include <linux/sframe.h>
>  
>  #include <linux/nospec.h>
>  
> @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>  		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>  		break;
> +	case PR_ADD_SFRAME:
> +		if (arg5)
> +			return -EINVAL;
> +		error = sframe_add_section(arg2, arg3, arg4);
> +		break;
> +	case PR_REMOVE_SFRAME:
> +		if (arg3 || arg4 || arg5)
> +			return -EINVAL;
> +		error = sframe_remove_section(arg2);
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;

So I realize that mtree has an internal lock, but are we sure we don't
want a lock around those prctl()s?
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Josh Poimboeuf 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 02:27:09PM +0100, Peter Zijlstra wrote:
> > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> > +		       unsigned long text_end)
> > +{
> > +	struct mm_struct *mm = current->mm;
> > +	struct vm_area_struct *sframe_vma;
> > +
> > +	mmap_read_lock(mm);
> 
> DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
> 	     mmap_read_lock(_T), mmap_read_unlock(_T))
> 
> in include/linux/mmap_lock.h ?

Will do.

> > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> >  	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> >  		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> >  		break;
> > +	case PR_ADD_SFRAME:
> > +		if (arg5)
> > +			return -EINVAL;
> > +		error = sframe_add_section(arg2, arg3, arg4);
> > +		break;
> > +	case PR_REMOVE_SFRAME:
> > +		if (arg3 || arg4 || arg5)
> > +			return -EINVAL;
> > +		error = sframe_remove_section(arg2);
> > +		break;
> >  	default:
> >  		error = -EINVAL;
> >  		break;
> 
> So I realize that mtree has an internal lock, but are we sure we don't
> want a lock around those prctl()s?

Not that I can tell?  It relies on the mtree internal locking for
atomicity.

For sframe_remove_section() it uses srcu to delay the freeing until all
sframe_find()'s have completed.

-- 
Josh
Re: [PATCH v3 09/19] unwind: Introduce sframe user space unwinding
Posted by Peter Zijlstra 3 weeks, 5 days ago
On Tue, Oct 29, 2024 at 09:50:18AM -0700, Josh Poimboeuf wrote:
> On Tue, Oct 29, 2024 at 02:27:09PM +0100, Peter Zijlstra wrote:
> > > +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start,
> > > +		       unsigned long text_end)
> > > +{
> > > +	struct mm_struct *mm = current->mm;
> > > +	struct vm_area_struct *sframe_vma;
> > > +
> > > +	mmap_read_lock(mm);
> > 
> > DEFINE_GUARD(mmap_read_lock, struct mm_struct *,
> > 	     mmap_read_lock(_T), mmap_read_unlock(_T))
> > 
> > in include/linux/mmap_lock.h ?
> 
> Will do.
> 
> > > @@ -2784,6 +2785,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
> > >  	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
> > >  		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
> > >  		break;
> > > +	case PR_ADD_SFRAME:
> > > +		if (arg5)
> > > +			return -EINVAL;
> > > +		error = sframe_add_section(arg2, arg3, arg4);
> > > +		break;
> > > +	case PR_REMOVE_SFRAME:
> > > +		if (arg3 || arg4 || arg5)
> > > +			return -EINVAL;
> > > +		error = sframe_remove_section(arg2);
> > > +		break;
> > >  	default:
> > >  		error = -EINVAL;
> > >  		break;
> > 
> > So I realize that mtree has an internal lock, but are we sure we don't
> > want a lock around those prctl()s?
> 
> Not that I can tell?  It relies on the mtree internal locking for
> atomicity.
> 
> For sframe_remove_section() it uses srcu to delay the freeing until all
> sframe_find()'s have completed.

Yeah it does all that. But I was sorta looking at all that kmalloc and
copy from user stuff, but I suppose you can race that without problem.