[PATCH v2 03/11] unwind: Introduce SFrame user space unwinding

Josh Poimboeuf posted 11 patches 2 months, 2 weeks ago
[PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Josh Poimboeuf 2 months, 2 weeks ago
Some distros have started compiling frame pointers into all their
packages to enable the kernel to do system-wide profiling of user space.
Unfortunately that creates a runtime performance penalty across the
entire system.  Using DWARF instead isn't feasible due to the complexity
it would add to the kernel.

For in-kernel unwinding we solved this problem with the creation of the
ORC unwinder for x86_64.  Similarly, for user space the GNU assembler
has created the SFrame format starting with binutils 2.41 for SFrame v2.
SFrame is a simpler version of .eh_frame which gets placed in the
.sframe section.

Add support for unwinding user space using SFrame.

More information about SFrame can be found here:

  - https://lwn.net/Articles/932209/
  - https://lwn.net/Articles/940686/
  - https://sourceware.org/binutils/docs/sframe-spec.html

Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
 arch/Kconfig                |   3 +
 fs/binfmt_elf.c             |  47 +++-
 include/linux/mm_types.h    |   3 +
 include/linux/sframe.h      |  46 ++++
 include/linux/user_unwind.h |   1 +
 include/uapi/linux/elf.h    |   1 +
 include/uapi/linux/prctl.h  |   3 +
 kernel/fork.c               |  10 +
 kernel/sys.c                |  11 +
 kernel/unwind/Makefile      |   1 +
 kernel/unwind/sframe.c      | 420 ++++++++++++++++++++++++++++++++++++
 kernel/unwind/sframe.h      | 215 ++++++++++++++++++
 kernel/unwind/user.c        |  14 ++
 mm/init-mm.c                |   4 +-
 14 files changed, 774 insertions(+), 5 deletions(-)
 create mode 100644 include/linux/sframe.h
 create mode 100644 kernel/unwind/sframe.c
 create mode 100644 kernel/unwind/sframe.h

diff --git a/arch/Kconfig b/arch/Kconfig
index b1002b2da331..ff5d5bc5f947 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -428,6 +428,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
 config HAVE_USER_UNWIND
 	bool
 
+config HAVE_USER_UNWIND_SFRAME
+	bool
+
 config HAVE_PERF_REGS
 	bool
 	help
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 19fa49cd9907..923aed390f2e 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
 #include <linux/dax.h>
 #include <linux/uaccess.h>
 #include <linux/rseq.h>
+#include <linux/sframe.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
@@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		unsigned long no_base, struct elf_phdr *interp_elf_phdata,
 		struct arch_elf_state *arch_state)
 {
-	struct elf_phdr *eppnt;
+	struct elf_phdr *eppnt, *sframe_phdr = NULL;
 	unsigned long load_addr = 0;
 	int load_addr_set = 0;
 	unsigned long error = ~0UL;
 	unsigned long total_size;
+	unsigned long start_code = ~0UL;
+	unsigned long end_code = 0;
 	int i;
 
 	/* First of all, some simple consistency checks */
@@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 
 	eppnt = interp_elf_phdata;
 	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
-		if (eppnt->p_type == PT_LOAD) {
+		switch (eppnt->p_type) {
+		case PT_LOAD: {
 			int elf_type = MAP_PRIVATE;
 			int elf_prot = make_prot(eppnt->p_flags, arch_state,
 						 true, true);
@@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 			/*
 			 * Check to see if the section's size will overflow the
 			 * allowed task size. Note that p_filesz must always be
-			 * <= p_memsize so it's only necessary to check p_memsz.
+			 * <= p_memsz so it's only necessary to check p_memsz.
 			 */
 			k = load_addr + eppnt->p_vaddr;
 			if (BAD_ADDR(k) ||
@@ -698,7 +702,28 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 				error = -ENOMEM;
 				goto out;
 			}
+
+			if ((eppnt->p_flags & PF_X) && k < start_code)
+				start_code = k;
+
+			if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
+				end_code = k + eppnt->p_filesz;
+			break;
 		}
+		case PT_GNU_SFRAME:
+			sframe_phdr = eppnt;
+			break;
+		}
+	}
+
+	if (sframe_phdr) {
+		struct sframe_file sfile = {
+			.sframe_addr	= load_addr + sframe_phdr->p_vaddr,
+			.text_start	= start_code,
+			.text_end	= end_code,
+		};
+
+		__sframe_add_section(&sfile);
 	}
 
 	error = load_addr;
@@ -823,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	int first_pt_load = 1;
 	unsigned long error;
 	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
-	struct elf_phdr *elf_property_phdata = NULL;
+	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
 	unsigned long elf_brk;
 	int retval, i;
 	unsigned long elf_entry;
@@ -931,6 +956,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 				executable_stack = EXSTACK_DISABLE_X;
 			break;
 
+		case PT_GNU_SFRAME:
+			sframe_phdr = elf_ppnt;
+			break;
+
 		case PT_LOPROC ... PT_HIPROC:
 			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
 						  bprm->file, false,
@@ -1316,6 +1345,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
 				MAP_FIXED | MAP_PRIVATE, 0);
 	}
 
+	if (sframe_phdr) {
+		struct sframe_file sfile = {
+			.sframe_addr	= load_bias + sframe_phdr->p_vaddr,
+			.text_start	= start_code,
+			.text_end	= end_code,
+		};
+
+		__sframe_add_section(&sfile);
+	}
+
 	regs = current_pt_regs();
 #ifdef ELF_PLAT_INIT
 	/*
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 485424979254..1aee78cbea33 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1019,6 +1019,9 @@ struct mm_struct {
 #endif
 		} lru_gen;
 #endif /* CONFIG_LRU_GEN_WALKS_MMU */
+#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
+		struct maple_tree sframe_mt;
+#endif
 	} __randomize_layout;
 
 	/*
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
new file mode 100644
index 000000000000..3a44f76929e2
--- /dev/null
+++ b/include/linux/sframe.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SFRAME_H
+#define _LINUX_SFRAME_H
+
+#include <linux/mm_types.h>
+
+struct sframe_file {
+	unsigned long sframe_addr, text_start, text_end;
+};
+
+struct user_unwind_frame;
+
+#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
+
+#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0)
+
+extern void sframe_free_mm(struct mm_struct *mm);
+
+extern int __sframe_add_section(struct sframe_file *file);
+extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end);
+extern int sframe_remove_section(unsigned long sframe_addr);
+extern int sframe_find(unsigned long ip, struct user_unwind_frame *frame);
+
+static inline bool current_has_sframe(void)
+{
+	struct mm_struct *mm = current->mm;
+
+	return mm && !mtree_empty(&mm->sframe_mt);
+}
+
+#else /* !CONFIG_HAVE_USER_UNWIND_SFRAME */
+
+#define INIT_MM_SFRAME
+
+static inline void sframe_free_mm(struct mm_struct *mm) {}
+
+static inline int __sframe_add_section(struct sframe_file *file) { return -EINVAL; }
+static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
+static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
+static inline int sframe_find(unsigned long ip, struct user_unwind_frame *frame) { return -EINVAL; }
+
+static inline bool current_has_sframe(void) { return false; }
+
+#endif /* CONFIG_HAVE_USER_UNWIND_SFRAME */
+
+#endif /* _LINUX_SFRAME_H */
diff --git a/include/linux/user_unwind.h b/include/linux/user_unwind.h
index 0a19ac6c92b2..8003f9d35405 100644
--- a/include/linux/user_unwind.h
+++ b/include/linux/user_unwind.h
@@ -7,6 +7,7 @@
 enum user_unwind_type {
 	USER_UNWIND_TYPE_AUTO,
 	USER_UNWIND_TYPE_FP,
+	USER_UNWIND_TYPE_SFRAME,
 };
 
 struct user_unwind_frame {
diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
index b54b313bcf07..b2aca31e1a49 100644
--- a/include/uapi/linux/elf.h
+++ b/include/uapi/linux/elf.h
@@ -39,6 +39,7 @@ typedef __s64	Elf64_Sxword;
 #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
 #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
 #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
+#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
 
 
 /* ARM MTE memory tag segment type */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 35791791a879..69511077c910 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -328,4 +328,7 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+#define PR_ADD_SFRAME			74
+#define PR_REMOVE_SFRAME		75
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index cc760491f201..a216f091edfb 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@
 #include <linux/rseq.h>
 #include <uapi/linux/pidfd.h>
 #include <linux/pidfs.h>
+#include <linux/sframe.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -923,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
 	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
+	sframe_free_mm(mm);
 
 	free_mm(mm);
 }
@@ -1249,6 +1251,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 #endif
 }
 
+static void mm_init_sframe(struct mm_struct *mm)
+{
+#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
+	mt_init(&mm->sframe_mt);
+#endif
+}
+
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
@@ -1280,6 +1289,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	mm->pmd_huge_pte = NULL;
 #endif
 	mm_init_uprobes_state(mm);
+	mm_init_sframe(mm);
 	hugetlb_count_init(mm);
 
 	if (current->mm) {
diff --git a/kernel/sys.c b/kernel/sys.c
index 3a2df1bd9f64..e4d2b64f4ae4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -64,6 +64,7 @@
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
+#include <linux/sframe.h>
 
 #include <linux/nospec.h>
 
@@ -2782,6 +2783,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
 		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
 		break;
+	case PR_ADD_SFRAME:
+		if (arg5)
+			return -EINVAL;
+		error = sframe_add_section(arg2, arg3, arg4);
+		break;
+	case PR_REMOVE_SFRAME:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		error = sframe_remove_section(arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index eb466d6a3295..6f202c5840cf 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1,2 @@
 obj-$(CONFIG_HAVE_USER_UNWIND) += user.o
+obj-$(CONFIG_HAVE_USER_UNWIND_SFRAME) += sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..3e4d29e737a1
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/sframe.h>
+#include <linux/user_unwind.h>
+
+#include "sframe.h"
+
+#define SFRAME_FILENAME_LEN 32
+
+struct sframe_section {
+	struct rcu_head rcu;
+
+	unsigned long sframe_addr;
+	unsigned long text_addr;
+
+	unsigned long fdes_addr;
+	unsigned long fres_addr;
+	unsigned int  fdes_nr;
+	signed char ra_off, fp_off;
+};
+
+DEFINE_STATIC_SRCU(sframe_srcu);
+
+#define __SFRAME_GET_USER(out, user_ptr, type)				\
+({									\
+	type __tmp;							\
+	if (get_user(__tmp, (type *)user_ptr))				\
+		return -EFAULT;						\
+	user_ptr += sizeof(__tmp);					\
+	out = __tmp;							\
+})
+
+#define SFRAME_GET_USER_SIGNED(out, user_ptr, size)			\
+({									\
+	switch (size) {							\
+	case 1:								\
+		__SFRAME_GET_USER(out, user_ptr, s8);			\
+		break;							\
+	case 2:								\
+		__SFRAME_GET_USER(out, user_ptr, s16);			\
+		break;							\
+	case 4:								\
+		__SFRAME_GET_USER(out, user_ptr, s32);			\
+		break;							\
+	default:							\
+		return -EINVAL;						\
+	}								\
+})
+
+#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size)			\
+({									\
+	switch (size) {							\
+	case 1:								\
+		__SFRAME_GET_USER(out, user_ptr, u8);			\
+		break;							\
+	case 2:								\
+		__SFRAME_GET_USER(out, user_ptr, u16);			\
+		break;							\
+	case 4:								\
+		__SFRAME_GET_USER(out, user_ptr, u32);			\
+		break;							\
+	default:							\
+		return -EINVAL;						\
+	}								\
+})
+
+static unsigned char fre_type_to_size(unsigned char fre_type)
+{
+	if (fre_type > 2)
+		return 0;
+	return 1 << fre_type;
+}
+
+static unsigned char offset_size_enum_to_size(unsigned char off_size)
+{
+	if (off_size > 2)
+		return 0;
+	return 1 << off_size;
+}
+
+static int find_fde(struct sframe_section *sec, unsigned long ip,
+		    struct sframe_fde *fde)
+{
+	s32 func_off, ip_off;
+	struct sframe_fde __user *first, *last, *mid, *found;
+
+	ip_off = ip - sec->sframe_addr;
+
+	first = (void *)sec->fdes_addr;
+	last = first + sec->fdes_nr;
+	while (first <= last) {
+		mid = first + ((last - first) / 2);
+		if (get_user(func_off, (s32 *)mid))
+			return -EFAULT;
+		if (ip_off >= func_off) {
+			found = mid;
+			first = mid + 1;
+		} else
+			last = mid - 1;
+	}
+
+	if (!found)
+		return -EINVAL;
+
+	if (copy_from_user(fde, found, sizeof(*fde)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
+		    unsigned long ip, struct user_unwind_frame *frame)
+{
+	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
+	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
+	s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;
+	unsigned char offset_count, offset_size;
+	unsigned char addr_size;
+	void __user *f, *last_f;
+	u8 fre_info;
+	int i;
+
+	addr_size = fre_type_to_size(fre_type);
+	if (!addr_size)
+		return -EINVAL;
+
+	ip_off = ip - sec->sframe_addr - fde->start_addr;
+
+	f = (void *)sec->fres_addr + fde->fres_off;
+
+	for (i = 0; i < fde->fres_num; i++) {
+
+		SFRAME_GET_USER_UNSIGNED(fre_ip_off, f, addr_size);
+
+		if (fde_type == SFRAME_FDE_TYPE_PCINC) {
+			if (fre_ip_off > ip_off)
+				break;
+		} else {
+			/* SFRAME_FDE_TYPE_PCMASK */
+			if (ip_off % fde->rep_size < fre_ip_off)
+				break;
+		}
+
+		SFRAME_GET_USER_UNSIGNED(fre_info, f, 1);
+
+		offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
+		offset_size  = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
+
+		if (!offset_count || !offset_size)
+			return -EINVAL;
+
+		last_f = f;
+		f += offset_count * offset_size;
+	}
+
+	if (!last_f)
+		return -EINVAL;
+
+	f = last_f;
+
+	SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);
+	offset_count--;
+
+	ra_off = sec->ra_off;
+	if (!ra_off) {
+		if (!offset_count--)
+			return -EINVAL;
+		SFRAME_GET_USER_SIGNED(ra_off, f, offset_size);
+	}
+
+	fp_off = sec->fp_off;
+	if (!fp_off && offset_count) {
+		offset_count--;
+		SFRAME_GET_USER_SIGNED(fp_off, f, offset_size);
+	}
+
+	if (offset_count)
+		return -EINVAL;
+
+	frame->cfa_off = cfa_off;
+	frame->ra_off = ra_off;
+	frame->fp_off = fp_off;
+	frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
+
+	return 0;
+}
+
+int sframe_find(unsigned long ip, struct user_unwind_frame *frame)
+{
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	struct sframe_fde fde;
+	int srcu_idx;
+	int ret = -EINVAL;
+
+	srcu_idx = srcu_read_lock(&sframe_srcu);
+
+	sec = mtree_load(&mm->sframe_mt, ip);
+	if (!sec) {
+		srcu_read_unlock(&sframe_srcu, srcu_idx);
+		return -EINVAL;
+	}
+
+
+	ret = find_fde(sec, ip, &fde);
+	if (ret)
+		goto err_unlock;
+
+	ret = find_fre(sec, &fde, ip, frame);
+	if (ret)
+		goto err_unlock;
+
+	srcu_read_unlock(&sframe_srcu, srcu_idx);
+	return 0;
+
+err_unlock:
+	srcu_read_unlock(&sframe_srcu, srcu_idx);
+	return ret;
+}
+
+static int get_sframe_file(unsigned long sframe_addr, struct sframe_file *file)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *sframe_vma, *text_vma, *vma;
+	VMA_ITERATOR(vmi, mm, 0);
+
+	mmap_read_lock(mm);
+
+	sframe_vma = vma_lookup(mm, sframe_addr);
+	if (!sframe_vma || !sframe_vma->vm_file)
+		goto err_unlock;
+
+	text_vma = NULL;
+
+	for_each_vma(vmi, vma) {
+		if (vma->vm_file != sframe_vma->vm_file)
+			continue;
+		if (vma->vm_flags & VM_EXEC) {
+			if (text_vma) {
+				/*
+				 * Multiple EXEC segments in a single file
+				 * aren't currently supported, is that a thing?
+				 */
+				mmap_read_unlock(mm);
+				pr_warn_once("unsupported multiple EXEC segments in task %s[%d]\n",
+					     current->comm, current->pid);
+				return -EINVAL;
+			}
+			text_vma = vma;
+		}
+	}
+
+	file->sframe_addr	= sframe_addr;
+	file->text_start	= text_vma->vm_start;
+	file->text_end		= text_vma->vm_end;
+
+	mmap_read_unlock(mm);
+	return 0;
+
+err_unlock:
+	mmap_read_unlock(mm);
+	return -EINVAL;
+}
+
+static int validate_sframe_addrs(struct sframe_file *file)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *text_vma;
+
+	mmap_read_lock(mm);
+
+	if (!vma_lookup(mm, file->sframe_addr))
+		goto err_unlock;
+
+	text_vma = vma_lookup(mm, file->text_start);
+	if (!(text_vma->vm_flags & VM_EXEC))
+		goto err_unlock;
+
+	if (vma_lookup(mm, file->text_end-1) != text_vma)
+		goto err_unlock;
+
+	mmap_read_unlock(mm);
+	return 0;
+
+err_unlock:
+	mmap_read_unlock(mm);
+	return -EINVAL;
+}
+
+int __sframe_add_section(struct sframe_file *file)
+{
+	struct maple_tree *sframe_mt = &current->mm->sframe_mt;
+	struct sframe_section *sec;
+	struct sframe_header shdr;
+	unsigned long header_end;
+	int ret;
+
+	if (copy_from_user(&shdr, (void *)file->sframe_addr, sizeof(shdr)))
+		return -EFAULT;
+
+	if (shdr.preamble.magic != SFRAME_MAGIC ||
+	    shdr.preamble.version != SFRAME_VERSION_2 ||
+	    !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+	    shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
+	    shdr.fdes_off > shdr.fres_off) {
+		/*
+		 * Either binutils < 2.41, corrupt sframe header, or
+		 * unsupported feature.
+		 * */
+		pr_warn_once("bad sframe header in task %s[%d]\n",
+			     current->comm, current->pid);
+		return -EINVAL;
+	}
+
+	header_end = file->sframe_addr + SFRAME_HDR_SIZE(shdr);
+
+	sec = kmalloc(sizeof(*sec), GFP_KERNEL);
+	if (!sec)
+		return -ENOMEM;
+
+	sec->sframe_addr	= file->sframe_addr;
+	sec->text_addr		= file->text_start;
+	sec->fdes_addr		= header_end + shdr.fdes_off;
+	sec->fres_addr		= header_end + shdr.fres_off;
+	sec->fdes_nr		= shdr.num_fdes;
+	sec->ra_off		= shdr.cfa_fixed_ra_offset;
+	sec->fp_off		= shdr.cfa_fixed_fp_offset;
+
+	ret = mtree_insert_range(sframe_mt, file->text_start, file->text_end,
+				 sec, GFP_KERNEL);
+	if (ret) {
+		kfree(sec);
+		return ret;
+	}
+
+	return 0;
+}
+
+int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end)
+{
+	struct sframe_file file;
+	int ret;
+
+	if (!text_start || !text_end) {
+		ret = get_sframe_file(sframe_addr, &file);
+		if (ret)
+			return ret;
+	} else {
+		/*
+		 * This is mainly for generated code, for which the text isn't
+		 * file-backed so the user has to give the text bounds.
+		 */
+		file.sframe_addr	= sframe_addr;
+		file.text_start		= text_start;
+		file.text_end		= text_end;
+		ret = validate_sframe_addrs(&file);
+		if (ret)
+			return ret;
+	}
+
+	return __sframe_add_section(&file);
+}
+
+static void sframe_free_rcu(struct rcu_head *rcu)
+{
+	struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
+
+	kfree(sec);
+}
+
+static int __sframe_remove_section(struct mm_struct *mm,
+				   struct sframe_section *sec)
+{
+	struct sframe_section *s;
+
+	s = mtree_erase(&mm->sframe_mt, sec->text_addr);
+	if (!s || WARN_ON_ONCE(s != sec))
+		return -EINVAL;
+
+	call_srcu(&sframe_srcu, &sec->rcu, sframe_free_rcu);
+
+	return 0;
+}
+
+int sframe_remove_section(unsigned long sframe_addr)
+{
+	struct mm_struct *mm = current->mm;
+	struct sframe_section *sec;
+	unsigned long index = 0;
+
+	sec = mtree_load(&mm->sframe_mt, sframe_addr);
+	if (!sec)
+		return -EINVAL;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
+		if (sec->sframe_addr == sframe_addr)
+			return __sframe_remove_section(mm, sec);
+	}
+
+	return -EINVAL;
+}
+
+void sframe_free_mm(struct mm_struct *mm)
+{
+	struct sframe_section *sec;
+	unsigned long index = 0;
+
+	if (!mm)
+		return;
+
+	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
+		kfree(sec);
+
+	mtree_destroy(&mm->sframe_mt);
+}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
new file mode 100644
index 000000000000..aa468d6f1f4a
--- /dev/null
+++ b/kernel/unwind/sframe.h
@@ -0,0 +1,215 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2023, Oracle and/or its affiliates.
+ *
+ * This file contains definitions for the SFrame stack tracing format, which is
+ * documented at https://sourceware.org/binutils/docs
+ */
+#ifndef _SFRAME_H
+#define _SFRAME_H
+
+#include <linux/types.h>
+
+#define SFRAME_VERSION_1	1
+#define SFRAME_VERSION_2	2
+#define SFRAME_MAGIC		0xdee2
+
+/* Function Descriptor Entries are sorted on PC. */
+#define SFRAME_F_FDE_SORTED	0x1
+/* Frame-pointer based stack tracing. Defined, but not set. */
+#define SFRAME_F_FRAME_POINTER	0x2
+
+#define SFRAME_CFA_FIXED_FP_INVALID 0
+#define SFRAME_CFA_FIXED_RA_INVALID 0
+
+/* Supported ABIs/Arch. */
+#define SFRAME_ABI_AARCH64_ENDIAN_BIG	    1 /* AARCH64 big endian. */
+#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE    2 /* AARCH64 little endian. */
+#define SFRAME_ABI_AMD64_ENDIAN_LITTLE	    3 /* AMD64 little endian. */
+
+/* SFrame FRE types. */
+#define SFRAME_FRE_TYPE_ADDR1	0
+#define SFRAME_FRE_TYPE_ADDR2	1
+#define SFRAME_FRE_TYPE_ADDR4	2
+
+/*
+ * SFrame Function Descriptor Entry types.
+ *
+ * The SFrame format has two possible representations for functions. The
+ * choice of which type to use is made according to the instruction patterns
+ * in the relevant program stub.
+ */
+
+/* Unwinders perform a (PC >= FRE_START_ADDR) to look up a matching FRE. */
+#define SFRAME_FDE_TYPE_PCINC	0
+/*
+ * Unwinders perform a (PC & FRE_START_ADDR_AS_MASK >= FRE_START_ADDR_AS_MASK)
+ * to look up a matching FRE. Typical usecases are pltN entries, trampolines
+ * etc.
+ */
+#define SFRAME_FDE_TYPE_PCMASK	1
+
+/**
+ * struct sframe_preamble - SFrame Preamble.
+ * @magic: Magic number (SFRAME_MAGIC).
+ * @version: Format version number (SFRAME_VERSION).
+ * @flags: Various flags.
+ */
+struct sframe_preamble {
+	u16 magic;
+	u8  version;
+	u8  flags;
+} __packed;
+
+/**
+ * struct sframe_header - SFrame Header.
+ * @preamble: SFrame preamble.
+ * @abi_arch: Identify the arch (including endianness) and ABI.
+ * @cfa_fixed_fp_offset: Offset for the Frame Pointer (FP) from CFA may be
+ *	  fixed  for some ABIs ((e.g, in AMD64 when -fno-omit-frame-pointer is
+ *	  used). When fixed, this field specifies the fixed stack frame offset
+ *	  and the individual FREs do not need to track it. When not fixed, it
+ *	  is set to SFRAME_CFA_FIXED_FP_INVALID, and the individual FREs may
+ *	  provide the applicable stack frame offset, if any.
+ * @cfa_fixed_ra_offset: Offset for the Return Address from CFA is fixed for
+ *	  some ABIs. When fixed, this field specifies the fixed stack frame
+ *	  offset and the individual FREs do not need to track it. When not
+ *	  fixed, it is set to SFRAME_CFA_FIXED_FP_INVALID.
+ * @auxhdr_len: Number of bytes making up the auxiliary header, if any.
+ *	  Some ABI/arch, in the future, may use this space for extending the
+ *	  information in SFrame header. Auxiliary header is contained in bytes
+ *	  sequentially following the sframe_header.
+ * @num_fdes: Number of SFrame FDEs in this SFrame section.
+ * @num_fres: Number of SFrame Frame Row Entries.
+ * @fre_len:  Number of bytes in the SFrame Frame Row Entry section.
+ * @fdes_off: Offset of SFrame Function Descriptor Entry section.
+ * @fres_off: Offset of SFrame Frame Row Entry section.
+ */
+struct sframe_header {
+	struct sframe_preamble preamble;
+	u8  abi_arch;
+	s8  cfa_fixed_fp_offset;
+	s8  cfa_fixed_ra_offset;
+	u8  auxhdr_len;
+	u32 num_fdes;
+	u32 num_fres;
+	u32 fre_len;
+	u32 fdes_off;
+	u32 fres_off;
+} __packed;
+
+#define SFRAME_HDR_SIZE(sframe_hdr)	\
+	((sizeof(struct sframe_header) + (sframe_hdr).auxhdr_len))
+
+/* Two possible keys for executable (instruction) pointers signing. */
+#define SFRAME_AARCH64_PAUTH_KEY_A    0 /* Key A. */
+#define SFRAME_AARCH64_PAUTH_KEY_B    1 /* Key B. */
+
+/**
+ * struct sframe_fde - SFrame Function Descriptor Entry.
+ * @start_addr: Function start address. Encoded as a signed offset,
+ *	  relative to the current FDE.
+ * @size: Size of the function in bytes.
+ * @fres_off: Offset of the first SFrame Frame Row Entry of the function,
+ *	  relative to the beginning of the SFrame Frame Row Entry sub-section.
+ * @fres_num: Number of frame row entries for the function.
+ * @info: Additional information for deciphering the stack trace
+ *	  information for the function. Contains information about SFrame FRE
+ *	  type, SFrame FDE type, PAC authorization A/B key, etc.
+ * @rep_size: Block size for SFRAME_FDE_TYPE_PCMASK
+ * @padding: Unused
+ */
+struct sframe_fde {
+	s32 start_addr;
+	u32 size;
+	u32 fres_off;
+	u32 fres_num;
+	u8  info;
+	u8  rep_size;
+	u16 padding;
+} __packed;
+
+/*
+ * 'func_info' in SFrame FDE contains additional information for deciphering
+ * the stack trace information for the function. In V1, the information is
+ * organized as follows:
+ *   - 4-bits: Identify the FRE type used for the function.
+ *   - 1-bit: Identify the FDE type of the function - mask or inc.
+ *   - 1-bit: PAC authorization A/B key (aarch64).
+ *   - 2-bits: Unused.
+ * ---------------------------------------------------------------------
+ * |  Unused  |  PAC auth A/B key (aarch64) |  FDE type |   FRE type   |
+ * |          |        Unused (amd64)       |           |              |
+ * ---------------------------------------------------------------------
+ * 8          6                             5           4              0
+ */
+
+/* Note: Set PAC auth key to SFRAME_AARCH64_PAUTH_KEY_A by default.  */
+#define SFRAME_FUNC_INFO(fde_type, fre_enc_type) \
+	(((SFRAME_AARCH64_PAUTH_KEY_A & 0x1) << 5) | \
+	 (((fde_type) & 0x1) << 4) | ((fre_enc_type) & 0xf))
+
+#define SFRAME_FUNC_FRE_TYPE(data)	  ((data) & 0xf)
+#define SFRAME_FUNC_FDE_TYPE(data)	  (((data) >> 4) & 0x1)
+#define SFRAME_FUNC_PAUTH_KEY(data)	  (((data) >> 5) & 0x1)
+
+/*
+ * Size of stack frame offsets in an SFrame Frame Row Entry. A single
+ * SFrame FRE has all offsets of the same size. Offset size may vary
+ * across frame row entries.
+ */
+#define SFRAME_FRE_OFFSET_1B	  0
+#define SFRAME_FRE_OFFSET_2B	  1
+#define SFRAME_FRE_OFFSET_4B	  2
+
+/* An SFrame Frame Row Entry can be SP or FP based.  */
+#define SFRAME_BASE_REG_FP	0
+#define SFRAME_BASE_REG_SP	1
+
+/*
+ * The index at which a specific offset is presented in the variable length
+ * bytes of an FRE.
+ */
+#define SFRAME_FRE_CFA_OFFSET_IDX   0
+/*
+ * The RA stack offset, if present, will always be at index 1 in the variable
+ * length bytes of the FRE.
+ */
+#define SFRAME_FRE_RA_OFFSET_IDX    1
+/*
+ * The FP stack offset may appear at offset 1 or 2, depending on the ABI as RA
+ * may or may not be tracked.
+ */
+#define SFRAME_FRE_FP_OFFSET_IDX    2
+
+/*
+ * 'fre_info' in SFrame FRE contains information about:
+ *   - 1 bit: base reg for CFA
+ *   - 4 bits: Number of offsets (N). A value of up to 3 is allowed to track
+ *   all three of CFA, FP and RA (fixed implicit order).
+ *   - 2 bits: information about size of the offsets (S) in bytes.
+ *     Valid values are SFRAME_FRE_OFFSET_1B, SFRAME_FRE_OFFSET_2B,
+ *     SFRAME_FRE_OFFSET_4B
+ *   - 1 bit: Mangled RA state bit (aarch64 only).
+ * ---------------------------------------------------------------
+ * | Mangled-RA (aarch64) |  Size of   |   Number of  | base_reg |
+ * |  Unused (amd64)      |  offsets   |    offsets   |          |
+ * ---------------------------------------------------------------
+ * 8                      7             5             1          0
+ */
+
+/* Note: Set mangled_ra_p to zero by default. */
+#define SFRAME_FRE_INFO(base_reg_id, offset_num, offset_size) \
+	(((0 & 0x1) << 7) | (((offset_size) & 0x3) << 5) | \
+	 (((offset_num) & 0xf) << 1) | ((base_reg_id) & 0x1))
+
+/* Set the mangled_ra_p bit as indicated. */
+#define SFRAME_FRE_INFO_UPDATE_MANGLED_RA_P(mangled_ra_p, fre_info) \
+	((((mangled_ra_p) & 0x1) << 7) | ((fre_info) & 0x7f))
+
+#define SFRAME_FRE_CFA_BASE_REG_ID(data)	  ((data) & 0x1)
+#define SFRAME_FRE_OFFSET_COUNT(data)		  (((data) >> 1) & 0xf)
+#define SFRAME_FRE_OFFSET_SIZE(data)		  (((data) >> 5) & 0x3)
+#define SFRAME_FRE_MANGLED_RA_P(data)		  (((data) >> 7) & 0x1)
+
+#endif /* _SFRAME_H */
diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
index 5d16f9604a61..3a7b14cf522b 100644
--- a/kernel/unwind/user.c
+++ b/kernel/unwind/user.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/sched/task_stack.h>
 #include <linux/user_unwind.h>
+#include <linux/sframe.h>
 #include <linux/uaccess.h>
 #include <asm/user_unwind.h>
 
@@ -29,6 +30,11 @@ int user_unwind_next(struct user_unwind_state *state)
 	case USER_UNWIND_TYPE_FP:
 		frame = &fp_frame;
 		break;
+	case USER_UNWIND_TYPE_SFRAME:
+		ret = sframe_find(state->ip, frame);
+		if (ret)
+			goto the_end;
+		break;
 	default:
 		BUG();
 	}
@@ -57,6 +63,7 @@ int user_unwind_start(struct user_unwind_state *state,
 		      enum user_unwind_type type)
 {
 	struct pt_regs *regs = task_pt_regs(current);
+	bool sframe_possible = current_has_sframe();
 
 	memset(state, 0, sizeof(*state));
 
@@ -67,6 +74,13 @@ int user_unwind_start(struct user_unwind_state *state,
 
 	switch (type) {
 	case USER_UNWIND_TYPE_AUTO:
+		state->type = sframe_possible ? USER_UNWIND_TYPE_SFRAME :
+						USER_UNWIND_TYPE_FP;
+		break;
+	case USER_UNWIND_TYPE_SFRAME:
+		if (!sframe_possible)
+			return -EINVAL;
+		break;
 	case USER_UNWIND_TYPE_FP:
 		break;
 	default:
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 24c809379274..c4c6af046778 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -11,6 +11,7 @@
 #include <linux/atomic.h>
 #include <linux/user_namespace.h>
 #include <linux/iommu.h>
+#include <linux/sframe.h>
 #include <asm/mmu.h>
 
 #ifndef INIT_MM_CONTEXT
@@ -44,7 +45,8 @@ struct mm_struct init_mm = {
 #endif
 	.user_ns	= &init_user_ns,
 	.cpu_bitmap	= CPU_BITS_NONE,
-	INIT_MM_CONTEXT(init_mm)
+	INIT_MM_CONTEXT(init_mm),
+	INIT_MM_SFRAME,
 };
 
 void setup_initial_init_mm(void *start_code, void *end_code,
-- 
2.46.0
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Jens Remus 1 month, 1 week ago
Hello Josh!

On 14.09.2024 01:02, Josh Poimboeuf wrote:
> Some distros have started compiling frame pointers into all their
> packages to enable the kernel to do system-wide profiling of user space.
> Unfortunately that creates a runtime performance penalty across the
> entire system.  Using DWARF instead isn't feasible due to the complexity
> it would add to the kernel.
> 
> For in-kernel unwinding we solved this problem with the creation of the
> ORC unwinder for x86_64.  Similarly, for user space the GNU assembler
> has created the SFrame format starting with binutils 2.41 for SFrame v2.
> SFrame is a simpler version of .eh_frame which gets placed in the
> .sframe section.
> 
> Add support for unwinding user space using SFrame.
> 
> More information about SFrame can be found here:
> 
>    - https://lwn.net/Articles/932209/
>    - https://lwn.net/Articles/940686/
>    - https://sourceware.org/binutils/docs/sframe-spec.html
> 
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
>   arch/Kconfig                |   3 +
>   fs/binfmt_elf.c             |  47 +++-
>   include/linux/mm_types.h    |   3 +
>   include/linux/sframe.h      |  46 ++++
>   include/linux/user_unwind.h |   1 +
>   include/uapi/linux/elf.h    |   1 +
>   include/uapi/linux/prctl.h  |   3 +
>   kernel/fork.c               |  10 +
>   kernel/sys.c                |  11 +
>   kernel/unwind/Makefile      |   1 +
>   kernel/unwind/sframe.c      | 420 ++++++++++++++++++++++++++++++++++++
>   kernel/unwind/sframe.h      | 215 ++++++++++++++++++
>   kernel/unwind/user.c        |  14 ++
>   mm/init-mm.c                |   4 +-
>   14 files changed, 774 insertions(+), 5 deletions(-)
>   create mode 100644 include/linux/sframe.h
>   create mode 100644 kernel/unwind/sframe.c
>   create mode 100644 kernel/unwind/sframe.h
> 
> diff --git a/arch/Kconfig b/arch/Kconfig
> index b1002b2da331..ff5d5bc5f947 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -428,6 +428,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
>   config HAVE_USER_UNWIND
>   	bool
>   
> +config HAVE_USER_UNWIND_SFRAME
> +	bool

I found this unwinder of userspace using SFrame to depend on your 
generic unwinder of userspace framework. Does this warrant to add:

	depends on HAVE_USER_UNWIND

> +
>   config HAVE_PERF_REGS
>   	bool
>   	help
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 19fa49cd9907..923aed390f2e 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -47,6 +47,7 @@
>   #include <linux/dax.h>
>   #include <linux/uaccess.h>
>   #include <linux/rseq.h>
> +#include <linux/sframe.h>
>   #include <asm/param.h>
>   #include <asm/page.h>
>   
> @@ -633,11 +634,13 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   		unsigned long no_base, struct elf_phdr *interp_elf_phdata,
>   		struct arch_elf_state *arch_state)
>   {
> -	struct elf_phdr *eppnt;
> +	struct elf_phdr *eppnt, *sframe_phdr = NULL;
>   	unsigned long load_addr = 0;
>   	int load_addr_set = 0;
>   	unsigned long error = ~0UL;
>   	unsigned long total_size;
> +	unsigned long start_code = ~0UL;
> +	unsigned long end_code = 0;
>   	int i;
>   
>   	/* First of all, some simple consistency checks */
> @@ -659,7 +662,8 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   
>   	eppnt = interp_elf_phdata;
>   	for (i = 0; i < interp_elf_ex->e_phnum; i++, eppnt++) {
> -		if (eppnt->p_type == PT_LOAD) {
> +		switch (eppnt->p_type) {
> +		case PT_LOAD: {
>   			int elf_type = MAP_PRIVATE;
>   			int elf_prot = make_prot(eppnt->p_flags, arch_state,
>   						 true, true);
> @@ -688,7 +692,7 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   			/*
>   			 * Check to see if the section's size will overflow the
>   			 * allowed task size. Note that p_filesz must always be
> -			 * <= p_memsize so it's only necessary to check p_memsz.
> +			 * <= p_memsz so it's only necessary to check p_memsz.
>   			 */
>   			k = load_addr + eppnt->p_vaddr;
>   			if (BAD_ADDR(k) ||
> @@ -698,7 +702,28 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
>   				error = -ENOMEM;
>   				goto out;
>   			}
> +
> +			if ((eppnt->p_flags & PF_X) && k < start_code)
> +				start_code = k;
> +
> +			if ((eppnt->p_flags & PF_X) && k + eppnt->p_filesz > end_code)
> +				end_code = k + eppnt->p_filesz;
> +			break;
>   		}
> +		case PT_GNU_SFRAME:
> +			sframe_phdr = eppnt;
> +			break;
> +		}
> +	}
> +
> +	if (sframe_phdr) {
> +		struct sframe_file sfile = {
> +			.sframe_addr	= load_addr + sframe_phdr->p_vaddr,
> +			.text_start	= start_code,
> +			.text_end	= end_code,
> +		};
> +
> +		__sframe_add_section(&sfile);
>   	}
>   
>   	error = load_addr;
> @@ -823,7 +848,7 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   	int first_pt_load = 1;
>   	unsigned long error;
>   	struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
> -	struct elf_phdr *elf_property_phdata = NULL;
> +	struct elf_phdr *elf_property_phdata = NULL, *sframe_phdr = NULL;
>   	unsigned long elf_brk;
>   	int retval, i;
>   	unsigned long elf_entry;
> @@ -931,6 +956,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   				executable_stack = EXSTACK_DISABLE_X;
>   			break;
>   
> +		case PT_GNU_SFRAME:
> +			sframe_phdr = elf_ppnt;
> +			break;
> +
>   		case PT_LOPROC ... PT_HIPROC:
>   			retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
>   						  bprm->file, false,
> @@ -1316,6 +1345,16 @@ static int load_elf_binary(struct linux_binprm *bprm)
>   				MAP_FIXED | MAP_PRIVATE, 0);
>   	}
>   
> +	if (sframe_phdr) {
> +		struct sframe_file sfile = {
> +			.sframe_addr	= load_bias + sframe_phdr->p_vaddr,
> +			.text_start	= start_code,
> +			.text_end	= end_code,
> +		};
> +
> +		__sframe_add_section(&sfile);
> +	}
> +
>   	regs = current_pt_regs();
>   #ifdef ELF_PLAT_INIT
>   	/*
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 485424979254..1aee78cbea33 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -1019,6 +1019,9 @@ struct mm_struct {
>   #endif
>   		} lru_gen;
>   #endif /* CONFIG_LRU_GEN_WALKS_MMU */
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +		struct maple_tree sframe_mt;
> +#endif
>   	} __randomize_layout;
>   
>   	/*
> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> new file mode 100644
> index 000000000000..3a44f76929e2
> --- /dev/null
> +++ b/include/linux/sframe.h
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_SFRAME_H
> +#define _LINUX_SFRAME_H
> +
> +#include <linux/mm_types.h>
> +
> +struct sframe_file {
> +	unsigned long sframe_addr, text_start, text_end;
> +};
> +
> +struct user_unwind_frame;
> +
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0)
> +
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
> +extern int __sframe_add_section(struct sframe_file *file);
> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end);
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +extern int sframe_find(unsigned long ip, struct user_unwind_frame *frame);
> +
> +static inline bool current_has_sframe(void)
> +{
> +	struct mm_struct *mm = current->mm;
> +
> +	return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +
> +#else /* !CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#define INIT_MM_SFRAME
> +
> +static inline void sframe_free_mm(struct mm_struct *mm) {}
> +
> +static inline int __sframe_add_section(struct sframe_file *file) { return -EINVAL; }
> +static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
> +static inline int sframe_find(unsigned long ip, struct user_unwind_frame *frame) { return -EINVAL; }
> +
> +static inline bool current_has_sframe(void) { return false; }
> +
> +#endif /* CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#endif /* _LINUX_SFRAME_H */
> diff --git a/include/linux/user_unwind.h b/include/linux/user_unwind.h
> index 0a19ac6c92b2..8003f9d35405 100644
> --- a/include/linux/user_unwind.h
> +++ b/include/linux/user_unwind.h
> @@ -7,6 +7,7 @@
>   enum user_unwind_type {
>   	USER_UNWIND_TYPE_AUTO,
>   	USER_UNWIND_TYPE_FP,
> +	USER_UNWIND_TYPE_SFRAME,
>   };
>   
>   struct user_unwind_frame {
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b54b313bcf07..b2aca31e1a49 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -39,6 +39,7 @@ typedef __s64	Elf64_Sxword;
>   #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
>   #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
>   #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
> +#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
>   
>   
>   /* ARM MTE memory tag segment type */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 35791791a879..69511077c910 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -328,4 +328,7 @@ struct prctl_mm_map {
>   # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
>   # define PR_PPC_DEXCR_CTRL_MASK		0x1f
>   
> +#define PR_ADD_SFRAME			74
> +#define PR_REMOVE_SFRAME		75
> +
>   #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..a216f091edfb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -104,6 +104,7 @@
>   #include <linux/rseq.h>
>   #include <uapi/linux/pidfd.h>
>   #include <linux/pidfs.h>
> +#include <linux/sframe.h>
>   
>   #include <asm/pgalloc.h>
>   #include <linux/uaccess.h>
> @@ -923,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
>   	mm_pasid_drop(mm);
>   	mm_destroy_cid(mm);
>   	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
> +	sframe_free_mm(mm);
>   
>   	free_mm(mm);
>   }
> @@ -1249,6 +1251,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>   #endif
>   }
>   
> +static void mm_init_sframe(struct mm_struct *mm)
> +{
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +	mt_init(&mm->sframe_mt);
> +#endif
> +}
> +
>   static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	struct user_namespace *user_ns)
>   {
> @@ -1280,6 +1289,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>   	mm->pmd_huge_pte = NULL;
>   #endif
>   	mm_init_uprobes_state(mm);
> +	mm_init_sframe(mm);
>   	hugetlb_count_init(mm);
>   
>   	if (current->mm) {
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3a2df1bd9f64..e4d2b64f4ae4 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
>   #include <linux/rcupdate.h>
>   #include <linux/uidgid.h>
>   #include <linux/cred.h>
> +#include <linux/sframe.h>
>   
>   #include <linux/nospec.h>
>   
> @@ -2782,6 +2783,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>   	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>   		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>   		break;
> +	case PR_ADD_SFRAME:
> +		if (arg5)
> +			return -EINVAL;
> +		error = sframe_add_section(arg2, arg3, arg4);
> +		break;
> +	case PR_REMOVE_SFRAME:
> +		if (arg3 || arg4 || arg5)
> +			return -EINVAL;
> +		error = sframe_remove_section(arg2);
> +		break;
>   	default:
>   		error = -EINVAL;
>   		break;
> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> index eb466d6a3295..6f202c5840cf 100644
> --- a/kernel/unwind/Makefile
> +++ b/kernel/unwind/Makefile
> @@ -1 +1,2 @@
>   obj-$(CONFIG_HAVE_USER_UNWIND) += user.o
> +obj-$(CONFIG_HAVE_USER_UNWIND_SFRAME) += sframe.o
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> new file mode 100644
> index 000000000000..3e4d29e737a1
> --- /dev/null
> +++ b/kernel/unwind/sframe.c
> @@ -0,0 +1,420 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/srcu.h>
> +#include <linux/uaccess.h>
> +#include <linux/mm.h>
> +#include <linux/sframe.h>
> +#include <linux/user_unwind.h>
> +
> +#include "sframe.h"
> +
> +#define SFRAME_FILENAME_LEN 32
> +
> +struct sframe_section {
> +	struct rcu_head rcu;
> +
> +	unsigned long sframe_addr;
> +	unsigned long text_addr;
> +
> +	unsigned long fdes_addr;
> +	unsigned long fres_addr;
> +	unsigned int  fdes_nr;
> +	signed char ra_off, fp_off;
> +};
> +
> +DEFINE_STATIC_SRCU(sframe_srcu);
> +
> +#define __SFRAME_GET_USER(out, user_ptr, type)				\
> +({									\
> +	type __tmp;							\
> +	if (get_user(__tmp, (type *)user_ptr))				\
> +		return -EFAULT;						\
> +	user_ptr += sizeof(__tmp);					\
> +	out = __tmp;							\
> +})
> +
> +#define SFRAME_GET_USER_SIGNED(out, user_ptr, size)			\
> +({									\
> +	switch (size) {							\
> +	case 1:								\
> +		__SFRAME_GET_USER(out, user_ptr, s8);			\
> +		break;							\
> +	case 2:								\
> +		__SFRAME_GET_USER(out, user_ptr, s16);			\
> +		break;							\
> +	case 4:								\
> +		__SFRAME_GET_USER(out, user_ptr, s32);			\
> +		break;							\
> +	default:							\
> +		return -EINVAL;						\
> +	}								\
> +})
> +
> +#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size)			\
> +({									\
> +	switch (size) {							\
> +	case 1:								\
> +		__SFRAME_GET_USER(out, user_ptr, u8);			\
> +		break;							\
> +	case 2:								\
> +		__SFRAME_GET_USER(out, user_ptr, u16);			\
> +		break;							\
> +	case 4:								\
> +		__SFRAME_GET_USER(out, user_ptr, u32);			\
> +		break;							\
> +	default:							\
> +		return -EINVAL;						\
> +	}								\
> +})
> +
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> +	if (fre_type > 2)
> +		return 0;
> +	return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> +	if (off_size > 2)
> +		return 0;
> +	return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> +		    struct sframe_fde *fde)
> +{
> +	s32 func_off, ip_off;
> +	struct sframe_fde __user *first, *last, *mid, *found;
> +
> +	ip_off = ip - sec->sframe_addr;
> +
> +	first = (void *)sec->fdes_addr;
> +	last = first + sec->fdes_nr;
> +	while (first <= last) {
> +		mid = first + ((last - first) / 2);
> +		if (get_user(func_off, (s32 *)mid))
> +			return -EFAULT;
> +		if (ip_off >= func_off) {
> +			found = mid;
> +			first = mid + 1;
> +		} else
> +			last = mid - 1;
> +	}
> +
> +	if (!found)
> +		return -EINVAL;
> +
> +	if (copy_from_user(fde, found, sizeof(*fde)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> +		    unsigned long ip, struct user_unwind_frame *frame)
> +{
> +	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> +	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> +	s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;

Doesn't fre_ip_off need to be u32 (see also below)? The SFrame format 
specification states the FRE sfre_start_address is either u8, u16, or u32:
https://sourceware.org/binutils/docs/sframe-spec.html#The-SFrame-FRE-Types

> +	unsigned char offset_count, offset_size;
> +	unsigned char addr_size;
> +	void __user *f, *last_f;
> +	u8 fre_info;
> +	int i;
> +
> +	addr_size = fre_type_to_size(fre_type);
> +	if (!addr_size)
> +		return -EINVAL;
> +
> +	ip_off = ip - sec->sframe_addr - fde->start_addr;
> +
> +	f = (void *)sec->fres_addr + fde->fres_off;
> +
> +	for (i = 0; i < fde->fres_num; i++) {
> +
> +		SFRAME_GET_USER_UNSIGNED(fre_ip_off, f, addr_size);

You already use SFRAME_GET_USER_UNSIGNED() to read it.

> +
> +		if (fde_type == SFRAME_FDE_TYPE_PCINC) {
> +			if (fre_ip_off > ip_off)
> +				break;
> +		} else {
> +			/* SFRAME_FDE_TYPE_PCMASK */
> +			if (ip_off % fde->rep_size < fre_ip_off)
> +				break;
> +		}
> +
> +		SFRAME_GET_USER_UNSIGNED(fre_info, f, 1);
> +
> +		offset_count = SFRAME_FRE_OFFSET_COUNT(fre_info);
> +		offset_size  = offset_size_enum_to_size(SFRAME_FRE_OFFSET_SIZE(fre_info));
> +
> +		if (!offset_count || !offset_size)
> +			return -EINVAL;
> +
> +		last_f = f;
> +		f += offset_count * offset_size;
> +	}
> +
> +	if (!last_f)
> +		return -EINVAL;
> +
> +	f = last_f;
> +
> +	SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);

As far as I know the CFA offset from CFA base register is signed in the 
SFrame file format. See Binutils include/sframe-api.h, 
sframe_fre_get_cfa_offset(). Therefore use SFRAME_GET_USER_SIGNED(). 
Both cfa_off and struct user_unwind_frame cfa_off are already defined as 
s32.

> +	offset_count--;
> +
> +	ra_off = sec->ra_off;
> +	if (!ra_off) {

On s390 there is no fixed RA offset from CFA ...

> +		if (!offset_count--)
> +			return -EINVAL;

... and the RA must not neccessarily be saved (e.g. at function entry). 
But we can address this when sending patches for s390 support.

> +		SFRAME_GET_USER_SIGNED(ra_off, f, offset_size);
> +	}
> +
> +	fp_off = sec->fp_off;
> +	if (!fp_off && offset_count) {
> +		offset_count--;
> +		SFRAME_GET_USER_SIGNED(fp_off, f, offset_size);
> +	}
> +
> +	if (offset_count)
> +		return -EINVAL;
> +
> +	frame->cfa_off = cfa_off;
> +	frame->ra_off = ra_off;
> +	frame->fp_off = fp_off;
> +	frame->use_fp = SFRAME_FRE_CFA_BASE_REG_ID(fre_info) == SFRAME_BASE_REG_FP;
> +
> +	return 0;
> +}
> +
> +int sframe_find(unsigned long ip, struct user_unwind_frame *frame)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct sframe_section *sec;
> +	struct sframe_fde fde;
> +	int srcu_idx;
> +	int ret = -EINVAL;
> +
> +	srcu_idx = srcu_read_lock(&sframe_srcu);
> +
> +	sec = mtree_load(&mm->sframe_mt, ip);
> +	if (!sec) {
> +		srcu_read_unlock(&sframe_srcu, srcu_idx);
> +		return -EINVAL;
> +	}
> +
> +
> +	ret = find_fde(sec, ip, &fde);
> +	if (ret)
> +		goto err_unlock;
> +
> +	ret = find_fre(sec, &fde, ip, frame);
> +	if (ret)
> +		goto err_unlock;
> +
> +	srcu_read_unlock(&sframe_srcu, srcu_idx);
> +	return 0;
> +
> +err_unlock:
> +	srcu_read_unlock(&sframe_srcu, srcu_idx);
> +	return ret;
> +}
> +
> +static int get_sframe_file(unsigned long sframe_addr, struct sframe_file *file)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *sframe_vma, *text_vma, *vma;
> +	VMA_ITERATOR(vmi, mm, 0);
> +
> +	mmap_read_lock(mm);
> +
> +	sframe_vma = vma_lookup(mm, sframe_addr);
> +	if (!sframe_vma || !sframe_vma->vm_file)
> +		goto err_unlock;
> +
> +	text_vma = NULL;
> +
> +	for_each_vma(vmi, vma) {
> +		if (vma->vm_file != sframe_vma->vm_file)
> +			continue;
> +		if (vma->vm_flags & VM_EXEC) {
> +			if (text_vma) {
> +				/*
> +				 * Multiple EXEC segments in a single file
> +				 * aren't currently supported, is that a thing?
> +				 */
> +				mmap_read_unlock(mm);
> +				pr_warn_once("unsupported multiple EXEC segments in task %s[%d]\n",
> +					     current->comm, current->pid);
> +				return -EINVAL;
> +			}
> +			text_vma = vma;
> +		}
> +	}
> +
> +	file->sframe_addr	= sframe_addr;
> +	file->text_start	= text_vma->vm_start;
> +	file->text_end		= text_vma->vm_end;
> +
> +	mmap_read_unlock(mm);
> +	return 0;
> +
> +err_unlock:
> +	mmap_read_unlock(mm);
> +	return -EINVAL;
> +}
> +
> +static int validate_sframe_addrs(struct sframe_file *file)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct vm_area_struct *text_vma;
> +
> +	mmap_read_lock(mm);
> +
> +	if (!vma_lookup(mm, file->sframe_addr))
> +		goto err_unlock;
> +
> +	text_vma = vma_lookup(mm, file->text_start);
> +	if (!(text_vma->vm_flags & VM_EXEC))
> +		goto err_unlock;
> +
> +	if (vma_lookup(mm, file->text_end-1) != text_vma)
> +		goto err_unlock;
> +
> +	mmap_read_unlock(mm);
> +	return 0;
> +
> +err_unlock:
> +	mmap_read_unlock(mm);
> +	return -EINVAL;
> +}
> +
> +int __sframe_add_section(struct sframe_file *file)
> +{
> +	struct maple_tree *sframe_mt = &current->mm->sframe_mt;
> +	struct sframe_section *sec;
> +	struct sframe_header shdr;
> +	unsigned long header_end;
> +	int ret;
> +
> +	if (copy_from_user(&shdr, (void *)file->sframe_addr, sizeof(shdr)))
> +		return -EFAULT;
> +
> +	if (shdr.preamble.magic != SFRAME_MAGIC ||
> +	    shdr.preamble.version != SFRAME_VERSION_2 ||
> +	    !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> +	    shdr.auxhdr_len || !shdr.num_fdes || !shdr.num_fres ||
> +	    shdr.fdes_off > shdr.fres_off) {
> +		/*
> +		 * Either binutils < 2.41, corrupt sframe header, or
> +		 * unsupported feature.
> +		 * */
> +		pr_warn_once("bad sframe header in task %s[%d]\n",
> +			     current->comm, current->pid);
> +		return -EINVAL;
> +	}
> +
> +	header_end = file->sframe_addr + SFRAME_HDR_SIZE(shdr);
> +
> +	sec = kmalloc(sizeof(*sec), GFP_KERNEL);
> +	if (!sec)
> +		return -ENOMEM;
> +
> +	sec->sframe_addr	= file->sframe_addr;
> +	sec->text_addr		= file->text_start;
> +	sec->fdes_addr		= header_end + shdr.fdes_off;
> +	sec->fres_addr		= header_end + shdr.fres_off;
> +	sec->fdes_nr		= shdr.num_fdes;
> +	sec->ra_off		= shdr.cfa_fixed_ra_offset;
> +	sec->fp_off		= shdr.cfa_fixed_fp_offset;
> +
> +	ret = mtree_insert_range(sframe_mt, file->text_start, file->text_end,
> +				 sec, GFP_KERNEL);
> +	if (ret) {
> +		kfree(sec);
> +		return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end)
> +{
> +	struct sframe_file file;
> +	int ret;
> +
> +	if (!text_start || !text_end) {
> +		ret = get_sframe_file(sframe_addr, &file);
> +		if (ret)
> +			return ret;
> +	} else {
> +		/*
> +		 * This is mainly for generated code, for which the text isn't
> +		 * file-backed so the user has to give the text bounds.
> +		 */
> +		file.sframe_addr	= sframe_addr;
> +		file.text_start		= text_start;
> +		file.text_end		= text_end;
> +		ret = validate_sframe_addrs(&file);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return __sframe_add_section(&file);
> +}
> +
> +static void sframe_free_rcu(struct rcu_head *rcu)
> +{
> +	struct sframe_section *sec = container_of(rcu, struct sframe_section, rcu);
> +
> +	kfree(sec);
> +}
> +
> +static int __sframe_remove_section(struct mm_struct *mm,
> +				   struct sframe_section *sec)
> +{
> +	struct sframe_section *s;
> +
> +	s = mtree_erase(&mm->sframe_mt, sec->text_addr);
> +	if (!s || WARN_ON_ONCE(s != sec))
> +		return -EINVAL;
> +
> +	call_srcu(&sframe_srcu, &sec->rcu, sframe_free_rcu);
> +
> +	return 0;
> +}
> +
> +int sframe_remove_section(unsigned long sframe_addr)
> +{
> +	struct mm_struct *mm = current->mm;
> +	struct sframe_section *sec;
> +	unsigned long index = 0;
> +
> +	sec = mtree_load(&mm->sframe_mt, sframe_addr);
> +	if (!sec)
> +		return -EINVAL;
> +
> +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX) {
> +		if (sec->sframe_addr == sframe_addr)
> +			return __sframe_remove_section(mm, sec);
> +	}
> +
> +	return -EINVAL;
> +}
> +
> +void sframe_free_mm(struct mm_struct *mm)
> +{
> +	struct sframe_section *sec;
> +	unsigned long index = 0;
> +
> +	if (!mm)
> +		return;
> +
> +	mt_for_each(&mm->sframe_mt, sec, index, ULONG_MAX)
> +		kfree(sec);
> +
> +	mtree_destroy(&mm->sframe_mt);
> +}
> diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
> new file mode 100644
> index 000000000000..aa468d6f1f4a
> --- /dev/null
> +++ b/kernel/unwind/sframe.h
> @@ -0,0 +1,215 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Copyright (C) 2023, Oracle and/or its affiliates.
> + *
> + * This file contains definitions for the SFrame stack tracing format, which is
> + * documented at https://sourceware.org/binutils/docs
> + */
> +#ifndef _SFRAME_H
> +#define _SFRAME_H
> +
> +#include <linux/types.h>
> +
> +#define SFRAME_VERSION_1	1
> +#define SFRAME_VERSION_2	2
> +#define SFRAME_MAGIC		0xdee2
> +
> +/* Function Descriptor Entries are sorted on PC. */
> +#define SFRAME_F_FDE_SORTED	0x1
> +/* Frame-pointer based stack tracing. Defined, but not set. */
> +#define SFRAME_F_FRAME_POINTER	0x2
> +
> +#define SFRAME_CFA_FIXED_FP_INVALID 0
> +#define SFRAME_CFA_FIXED_RA_INVALID 0
> +
> +/* Supported ABIs/Arch. */
> +#define SFRAME_ABI_AARCH64_ENDIAN_BIG	    1 /* AARCH64 big endian. */
> +#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE    2 /* AARCH64 little endian. */
> +#define SFRAME_ABI_AMD64_ENDIAN_LITTLE	    3 /* AMD64 little endian. */
> +
> +/* SFrame FRE types. */
> +#define SFRAME_FRE_TYPE_ADDR1	0
> +#define SFRAME_FRE_TYPE_ADDR2	1
> +#define SFRAME_FRE_TYPE_ADDR4	2
> +
> +/*
> + * SFrame Function Descriptor Entry types.
> + *
> + * The SFrame format has two possible representations for functions. The
> + * choice of which type to use is made according to the instruction patterns
> + * in the relevant program stub.
> + */
> +
> +/* Unwinders perform a (PC >= FRE_START_ADDR) to look up a matching FRE. */
> +#define SFRAME_FDE_TYPE_PCINC	0
> +/*
> + * Unwinders perform a (PC & FRE_START_ADDR_AS_MASK >= FRE_START_ADDR_AS_MASK)
> + * to look up a matching FRE. Typical usecases are pltN entries, trampolines
> + * etc.
> + */
> +#define SFRAME_FDE_TYPE_PCMASK	1
> +
> +/**
> + * struct sframe_preamble - SFrame Preamble.
> + * @magic: Magic number (SFRAME_MAGIC).
> + * @version: Format version number (SFRAME_VERSION).
> + * @flags: Various flags.
> + */
> +struct sframe_preamble {
> +	u16 magic;
> +	u8  version;
> +	u8  flags;
> +} __packed;
> +
> +/**
> + * struct sframe_header - SFrame Header.
> + * @preamble: SFrame preamble.
> + * @abi_arch: Identify the arch (including endianness) and ABI.
> + * @cfa_fixed_fp_offset: Offset for the Frame Pointer (FP) from CFA may be
> + *	  fixed  for some ABIs ((e.g, in AMD64 when -fno-omit-frame-pointer is

Nit: Two consecutive spaces in "fixed  for".

> + *	  used). When fixed, this field specifies the fixed stack frame offset
> + *	  and the individual FREs do not need to track it. When not fixed, it
> + *	  is set to SFRAME_CFA_FIXED_FP_INVALID, and the individual FREs may
> + *	  provide the applicable stack frame offset, if any.
> + * @cfa_fixed_ra_offset: Offset for the Return Address from CFA is fixed for
> + *	  some ABIs. When fixed, this field specifies the fixed stack frame
> + *	  offset and the individual FREs do not need to track it. When not
> + *	  fixed, it is set to SFRAME_CFA_FIXED_FP_INVALID.
> + * @auxhdr_len: Number of bytes making up the auxiliary header, if any.
> + *	  Some ABI/arch, in the future, may use this space for extending the
> + *	  information in SFrame header. Auxiliary header is contained in bytes
> + *	  sequentially following the sframe_header.
> + * @num_fdes: Number of SFrame FDEs in this SFrame section.
> + * @num_fres: Number of SFrame Frame Row Entries.
> + * @fre_len:  Number of bytes in the SFrame Frame Row Entry section.
> + * @fdes_off: Offset of SFrame Function Descriptor Entry section.
> + * @fres_off: Offset of SFrame Frame Row Entry section.
> + */
> +struct sframe_header {
> +	struct sframe_preamble preamble;
> +	u8  abi_arch;
> +	s8  cfa_fixed_fp_offset;
> +	s8  cfa_fixed_ra_offset;
> +	u8  auxhdr_len;
> +	u32 num_fdes;
> +	u32 num_fres;
> +	u32 fre_len;
> +	u32 fdes_off;
> +	u32 fres_off;
> +} __packed;
> +
> +#define SFRAME_HDR_SIZE(sframe_hdr)	\
> +	((sizeof(struct sframe_header) + (sframe_hdr).auxhdr_len))
> +
> +/* Two possible keys for executable (instruction) pointers signing. */
> +#define SFRAME_AARCH64_PAUTH_KEY_A    0 /* Key A. */
> +#define SFRAME_AARCH64_PAUTH_KEY_B    1 /* Key B. */
> +
> +/**
> + * struct sframe_fde - SFrame Function Descriptor Entry.
> + * @start_addr: Function start address. Encoded as a signed offset,
> + *	  relative to the current FDE.
> + * @size: Size of the function in bytes.
> + * @fres_off: Offset of the first SFrame Frame Row Entry of the function,
> + *	  relative to the beginning of the SFrame Frame Row Entry sub-section.
> + * @fres_num: Number of frame row entries for the function.
> + * @info: Additional information for deciphering the stack trace
> + *	  information for the function. Contains information about SFrame FRE
> + *	  type, SFrame FDE type, PAC authorization A/B key, etc.
> + * @rep_size: Block size for SFRAME_FDE_TYPE_PCMASK
> + * @padding: Unused
> + */
> +struct sframe_fde {
> +	s32 start_addr;
> +	u32 size;
> +	u32 fres_off;
> +	u32 fres_num;
> +	u8  info;
> +	u8  rep_size;
> +	u16 padding;
> +} __packed;
> +
> +/*
> + * 'func_info' in SFrame FDE contains additional information for deciphering
> + * the stack trace information for the function. In V1, the information is
> + * organized as follows:
> + *   - 4-bits: Identify the FRE type used for the function.
> + *   - 1-bit: Identify the FDE type of the function - mask or inc.
> + *   - 1-bit: PAC authorization A/B key (aarch64).
> + *   - 2-bits: Unused.
> + * ---------------------------------------------------------------------
> + * |  Unused  |  PAC auth A/B key (aarch64) |  FDE type |   FRE type   |
> + * |          |        Unused (amd64)       |           |              |
> + * ---------------------------------------------------------------------
> + * 8          6                             5           4              0
> + */
> +
> +/* Note: Set PAC auth key to SFRAME_AARCH64_PAUTH_KEY_A by default.  */
> +#define SFRAME_FUNC_INFO(fde_type, fre_enc_type) \
> +	(((SFRAME_AARCH64_PAUTH_KEY_A & 0x1) << 5) | \
> +	 (((fde_type) & 0x1) << 4) | ((fre_enc_type) & 0xf))
> +
> +#define SFRAME_FUNC_FRE_TYPE(data)	  ((data) & 0xf)
> +#define SFRAME_FUNC_FDE_TYPE(data)	  (((data) >> 4) & 0x1)
> +#define SFRAME_FUNC_PAUTH_KEY(data)	  (((data) >> 5) & 0x1)
> +
> +/*
> + * Size of stack frame offsets in an SFrame Frame Row Entry. A single
> + * SFrame FRE has all offsets of the same size. Offset size may vary
> + * across frame row entries.
> + */
> +#define SFRAME_FRE_OFFSET_1B	  0
> +#define SFRAME_FRE_OFFSET_2B	  1
> +#define SFRAME_FRE_OFFSET_4B	  2
> +
> +/* An SFrame Frame Row Entry can be SP or FP based.  */
> +#define SFRAME_BASE_REG_FP	0
> +#define SFRAME_BASE_REG_SP	1
> +
> +/*
> + * The index at which a specific offset is presented in the variable length
> + * bytes of an FRE.
> + */
> +#define SFRAME_FRE_CFA_OFFSET_IDX   0
> +/*
> + * The RA stack offset, if present, will always be at index 1 in the variable
> + * length bytes of the FRE.
> + */
> +#define SFRAME_FRE_RA_OFFSET_IDX    1
> +/*
> + * The FP stack offset may appear at offset 1 or 2, depending on the ABI as RA
> + * may or may not be tracked.
> + */
> +#define SFRAME_FRE_FP_OFFSET_IDX    2
> +
> +/*
> + * 'fre_info' in SFrame FRE contains information about:
> + *   - 1 bit: base reg for CFA
> + *   - 4 bits: Number of offsets (N). A value of up to 3 is allowed to track
> + *   all three of CFA, FP and RA (fixed implicit order).
> + *   - 2 bits: information about size of the offsets (S) in bytes.
> + *     Valid values are SFRAME_FRE_OFFSET_1B, SFRAME_FRE_OFFSET_2B,
> + *     SFRAME_FRE_OFFSET_4B
> + *   - 1 bit: Mangled RA state bit (aarch64 only).
> + * ---------------------------------------------------------------
> + * | Mangled-RA (aarch64) |  Size of   |   Number of  | base_reg |
> + * |  Unused (amd64)      |  offsets   |    offsets   |          |
> + * ---------------------------------------------------------------
> + * 8                      7             5             1          0
> + */
> +
> +/* Note: Set mangled_ra_p to zero by default. */
> +#define SFRAME_FRE_INFO(base_reg_id, offset_num, offset_size) \
> +	(((0 & 0x1) << 7) | (((offset_size) & 0x3) << 5) | \
> +	 (((offset_num) & 0xf) << 1) | ((base_reg_id) & 0x1))
> +
> +/* Set the mangled_ra_p bit as indicated. */
> +#define SFRAME_FRE_INFO_UPDATE_MANGLED_RA_P(mangled_ra_p, fre_info) \
> +	((((mangled_ra_p) & 0x1) << 7) | ((fre_info) & 0x7f))
> +
> +#define SFRAME_FRE_CFA_BASE_REG_ID(data)	  ((data) & 0x1)
> +#define SFRAME_FRE_OFFSET_COUNT(data)		  (((data) >> 1) & 0xf)
> +#define SFRAME_FRE_OFFSET_SIZE(data)		  (((data) >> 5) & 0x3)
> +#define SFRAME_FRE_MANGLED_RA_P(data)		  (((data) >> 7) & 0x1)
> +
> +#endif /* _SFRAME_H */
> diff --git a/kernel/unwind/user.c b/kernel/unwind/user.c
> index 5d16f9604a61..3a7b14cf522b 100644
> --- a/kernel/unwind/user.c
> +++ b/kernel/unwind/user.c
> @@ -8,6 +8,7 @@
>   #include <linux/sched.h>
>   #include <linux/sched/task_stack.h>
>   #include <linux/user_unwind.h>
> +#include <linux/sframe.h>
>   #include <linux/uaccess.h>
>   #include <asm/user_unwind.h>
>   
> @@ -29,6 +30,11 @@ int user_unwind_next(struct user_unwind_state *state)
>   	case USER_UNWIND_TYPE_FP:
>   		frame = &fp_frame;
>   		break;
> +	case USER_UNWIND_TYPE_SFRAME:
> +		ret = sframe_find(state->ip, frame);
> +		if (ret)
> +			goto the_end;
> +		break;
>   	default:
>   		BUG();
>   	}
> @@ -57,6 +63,7 @@ int user_unwind_start(struct user_unwind_state *state,
>   		      enum user_unwind_type type)
>   {
>   	struct pt_regs *regs = task_pt_regs(current);
> +	bool sframe_possible = current_has_sframe();
>   
>   	memset(state, 0, sizeof(*state));
>   
> @@ -67,6 +74,13 @@ int user_unwind_start(struct user_unwind_state *state,
>   
>   	switch (type) {
>   	case USER_UNWIND_TYPE_AUTO:
> +		state->type = sframe_possible ? USER_UNWIND_TYPE_SFRAME :
> +						USER_UNWIND_TYPE_FP;
> +		break;
> +	case USER_UNWIND_TYPE_SFRAME:
> +		if (!sframe_possible)
> +			return -EINVAL;
> +		break;
>   	case USER_UNWIND_TYPE_FP:
>   		break;
>   	default:
> diff --git a/mm/init-mm.c b/mm/init-mm.c
> index 24c809379274..c4c6af046778 100644
> --- a/mm/init-mm.c
> +++ b/mm/init-mm.c
> @@ -11,6 +11,7 @@
>   #include <linux/atomic.h>
>   #include <linux/user_namespace.h>
>   #include <linux/iommu.h>
> +#include <linux/sframe.h>
>   #include <asm/mmu.h>
>   
>   #ifndef INIT_MM_CONTEXT
> @@ -44,7 +45,8 @@ struct mm_struct init_mm = {
>   #endif
>   	.user_ns	= &init_user_ns,
>   	.cpu_bitmap	= CPU_BITS_NONE,
> -	INIT_MM_CONTEXT(init_mm)
> +	INIT_MM_CONTEXT(init_mm),
> +	INIT_MM_SFRAME,

This does not compile on s390, as INIT_MM_CONTEXT() is defined with a 
trailing comma, leading to two consecutive commas. Already reported by 
the kernel test robot for other architectures.
Same if INIT_MM_SFRAME expands into nothing there would be two 
consecutive commas.

>   };
>   
>   void setup_initial_init_mm(void *start_code, void *end_code,

Regards,
Jens
-- 
Jens Remus
Linux on Z Development (D3303) and z/VSE Support
+49-7031-16-1128 Office
jremus@de.ibm.com

IBM

IBM Deutschland Research & Development GmbH; Vorsitzender des 
Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der 
Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/

Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Josh Poimboeuf 1 month ago
On Wed, Oct 23, 2024 at 03:59:53PM +0200, Jens Remus wrote:
> > +++ b/arch/Kconfig
> > @@ -428,6 +428,9 @@ config HAVE_HARDLOCKUP_DETECTOR_ARCH
> >   config HAVE_USER_UNWIND
> >   	bool
> > +config HAVE_USER_UNWIND_SFRAME
> > +	bool
> 
> I found this unwinder of userspace using SFrame to depend on your generic
> unwinder of userspace framework. Does this warrant to add:

Based on your other suggestion I now have:

config UNWIND_USER
	bool

config HAVE_UNWIND_USER_FP
	bool
	select UNWIND_USER

config HAVE_UNWIND_USER_SFRAME
	bool
	select UNWIND_USER

The arches set HAVE_*, which in turn sets UNWIND_USER.

> > +static int find_fre(struct sframe_section *sec, struct sframe_fde *fde,
> > +		    unsigned long ip, struct user_unwind_frame *frame)
> > +{
> > +	unsigned char fde_type = SFRAME_FUNC_FDE_TYPE(fde->info);
> > +	unsigned char fre_type = SFRAME_FUNC_FRE_TYPE(fde->info);
> > +	s32 fre_ip_off, cfa_off, ra_off, fp_off, ip_off;
> 
> Doesn't fre_ip_off need to be u32 (see also below)? The SFrame format
> specification states the FRE sfre_start_address is either u8, u16, or u32:
> https://sourceware.org/binutils/docs/sframe-spec.html#The-SFrame-FRE-Types

Indeed, I also noticed that.

> > +	SFRAME_GET_USER_UNSIGNED(cfa_off, f, offset_size);
> 
> As far as I know the CFA offset from CFA base register is signed in the
> SFrame file format. See Binutils include/sframe-api.h,
> sframe_fre_get_cfa_offset(). Therefore use SFRAME_GET_USER_SIGNED(). Both
> cfa_off and struct user_unwind_frame cfa_off are already defined as s32.

Good catch.  There's no need to have separate SIGNED/UNSIGNED variants
anyway, I'll simplify that into SFRAME_GET_USER().

> > +++ b/mm/init-mm.c
> > @@ -11,6 +11,7 @@
> >   #include <linux/atomic.h>
> >   #include <linux/user_namespace.h>
> >   #include <linux/iommu.h>
> > +#include <linux/sframe.h>
> >   #include <asm/mmu.h>
> >   #ifndef INIT_MM_CONTEXT
> > @@ -44,7 +45,8 @@ struct mm_struct init_mm = {
> >   #endif
> >   	.user_ns	= &init_user_ns,
> >   	.cpu_bitmap	= CPU_BITS_NONE,
> > -	INIT_MM_CONTEXT(init_mm)
> > +	INIT_MM_CONTEXT(init_mm),
> > +	INIT_MM_SFRAME,
> 
> This does not compile on s390, as INIT_MM_CONTEXT() is defined with a
> trailing comma, leading to two consecutive commas. Already reported by the
> kernel test robot for other architectures.
> Same if INIT_MM_SFRAME expands into nothing there would be two consecutive
> commas.

Yeah, I saw those build errors and I've got that fixed now.

Thanks for the review!

-- 
Josh
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Steven Rostedt 2 months, 2 weeks ago
On Sat, 14 Sep 2024 01:02:05 +0200
Josh Poimboeuf <jpoimboe@kernel.org> wrote:

> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
> new file mode 100644
> index 000000000000..3a44f76929e2
> --- /dev/null
> +++ b/include/linux/sframe.h
> @@ -0,0 +1,46 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_SFRAME_H
> +#define _LINUX_SFRAME_H
> +
> +#include <linux/mm_types.h>
> +
> +struct sframe_file {
> +	unsigned long sframe_addr, text_start, text_end;

Please make each entry a separate line:

	unsigned long		sframe_addr;
	unsigned long		text_start;
	unsigned long		text_end;

It may be fine for declaring variables like this in a function, but it
should not be done in a structure.

> +};
> +
> +struct user_unwind_frame;
> +
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +
> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0)
> +
> +extern void sframe_free_mm(struct mm_struct *mm);
> +
> +extern int __sframe_add_section(struct sframe_file *file);
> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end);
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +extern int sframe_find(unsigned long ip, struct user_unwind_frame *frame);
> +
> +static inline bool current_has_sframe(void)
> +{
> +	struct mm_struct *mm = current->mm;
> +
> +	return mm && !mtree_empty(&mm->sframe_mt);
> +}
> +
> +#else /* !CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#define INIT_MM_SFRAME
> +
> +static inline void sframe_free_mm(struct mm_struct *mm) {}
> +
> +static inline int __sframe_add_section(struct sframe_file *file) { return -EINVAL; }
> +static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
> +static inline int sframe_find(unsigned long ip, struct user_unwind_frame *frame) { return -EINVAL; }
> +
> +static inline bool current_has_sframe(void) { return false; }
> +
> +#endif /* CONFIG_HAVE_USER_UNWIND_SFRAME */
> +
> +#endif /* _LINUX_SFRAME_H */
> diff --git a/include/linux/user_unwind.h b/include/linux/user_unwind.h
> index 0a19ac6c92b2..8003f9d35405 100644
> --- a/include/linux/user_unwind.h
> +++ b/include/linux/user_unwind.h
> @@ -7,6 +7,7 @@
>  enum user_unwind_type {
>  	USER_UNWIND_TYPE_AUTO,
>  	USER_UNWIND_TYPE_FP,
> +	USER_UNWIND_TYPE_SFRAME,
>  };
>  
>  struct user_unwind_frame {
> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
> index b54b313bcf07..b2aca31e1a49 100644
> --- a/include/uapi/linux/elf.h
> +++ b/include/uapi/linux/elf.h
> @@ -39,6 +39,7 @@ typedef __s64	Elf64_Sxword;
>  #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
>  #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
>  #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
> +#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
>  
>  
>  /* ARM MTE memory tag segment type */
> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
> index 35791791a879..69511077c910 100644
> --- a/include/uapi/linux/prctl.h
> +++ b/include/uapi/linux/prctl.h
> @@ -328,4 +328,7 @@ struct prctl_mm_map {
>  # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
>  # define PR_PPC_DEXCR_CTRL_MASK		0x1f
>  
> +#define PR_ADD_SFRAME			74
> +#define PR_REMOVE_SFRAME		75
> +
>  #endif /* _LINUX_PRCTL_H */
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cc760491f201..a216f091edfb 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -104,6 +104,7 @@
>  #include <linux/rseq.h>
>  #include <uapi/linux/pidfd.h>
>  #include <linux/pidfs.h>
> +#include <linux/sframe.h>
>  
>  #include <asm/pgalloc.h>
>  #include <linux/uaccess.h>
> @@ -923,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
>  	mm_pasid_drop(mm);
>  	mm_destroy_cid(mm);
>  	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
> +	sframe_free_mm(mm);
>  
>  	free_mm(mm);
>  }
> @@ -1249,6 +1251,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>  #endif
>  }
>  
> +static void mm_init_sframe(struct mm_struct *mm)
> +{
> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
> +	mt_init(&mm->sframe_mt);
> +#endif
> +}
> +
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	struct user_namespace *user_ns)
>  {
> @@ -1280,6 +1289,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	mm->pmd_huge_pte = NULL;
>  #endif
>  	mm_init_uprobes_state(mm);
> +	mm_init_sframe(mm);
>  	hugetlb_count_init(mm);
>  
>  	if (current->mm) {
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 3a2df1bd9f64..e4d2b64f4ae4 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -64,6 +64,7 @@
>  #include <linux/rcupdate.h>
>  #include <linux/uidgid.h>
>  #include <linux/cred.h>
> +#include <linux/sframe.h>
>  
>  #include <linux/nospec.h>
>  
> @@ -2782,6 +2783,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>  	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>  		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>  		break;
> +	case PR_ADD_SFRAME:
> +		if (arg5)
> +			return -EINVAL;
> +		error = sframe_add_section(arg2, arg3, arg4);
> +		break;
> +	case PR_REMOVE_SFRAME:
> +		if (arg3 || arg4 || arg5)
> +			return -EINVAL;
> +		error = sframe_remove_section(arg2);
> +		break;
>  	default:
>  		error = -EINVAL;
>  		break;
> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
> index eb466d6a3295..6f202c5840cf 100644
> --- a/kernel/unwind/Makefile
> +++ b/kernel/unwind/Makefile
> @@ -1 +1,2 @@
>  obj-$(CONFIG_HAVE_USER_UNWIND) += user.o
> +obj-$(CONFIG_HAVE_USER_UNWIND_SFRAME) += sframe.o
> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
> new file mode 100644
> index 000000000000..3e4d29e737a1
> --- /dev/null
> +++ b/kernel/unwind/sframe.c
> @@ -0,0 +1,420 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/srcu.h>
> +#include <linux/uaccess.h>
> +#include <linux/mm.h>
> +#include <linux/sframe.h>
> +#include <linux/user_unwind.h>
> +
> +#include "sframe.h"
> +
> +#define SFRAME_FILENAME_LEN 32
> +
> +struct sframe_section {
> +	struct rcu_head rcu;
> +
> +	unsigned long sframe_addr;
> +	unsigned long text_addr;
> +
> +	unsigned long fdes_addr;
> +	unsigned long fres_addr;
> +	unsigned int  fdes_nr;
> +	signed char ra_off, fp_off;
> +};
> +
> +DEFINE_STATIC_SRCU(sframe_srcu);
> +
> +#define __SFRAME_GET_USER(out, user_ptr, type)				\
> +({									\
> +	type __tmp;							\
> +	if (get_user(__tmp, (type *)user_ptr))				\
> +		return -EFAULT;						\
> +	user_ptr += sizeof(__tmp);					\
> +	out = __tmp;							\
> +})
> +
> +#define SFRAME_GET_USER_SIGNED(out, user_ptr, size)			\
> +({									\
> +	switch (size) {							\
> +	case 1:								\
> +		__SFRAME_GET_USER(out, user_ptr, s8);			\
> +		break;							\
> +	case 2:								\
> +		__SFRAME_GET_USER(out, user_ptr, s16);			\
> +		break;							\
> +	case 4:								\
> +		__SFRAME_GET_USER(out, user_ptr, s32);			\
> +		break;							\
> +	default:							\
> +		return -EINVAL;						\
> +	}								\
> +})
> +
> +#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size)			\
> +({									\
> +	switch (size) {							\
> +	case 1:								\
> +		__SFRAME_GET_USER(out, user_ptr, u8);			\
> +		break;							\
> +	case 2:								\
> +		__SFRAME_GET_USER(out, user_ptr, u16);			\
> +		break;							\
> +	case 4:								\
> +		__SFRAME_GET_USER(out, user_ptr, u32);			\
> +		break;							\
> +	default:							\
> +		return -EINVAL;						\
> +	}								\
> +})
> +
> +static unsigned char fre_type_to_size(unsigned char fre_type)
> +{
> +	if (fre_type > 2)
> +		return 0;
> +	return 1 << fre_type;
> +}
> +
> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
> +{
> +	if (off_size > 2)
> +		return 0;
> +	return 1 << off_size;
> +}
> +
> +static int find_fde(struct sframe_section *sec, unsigned long ip,
> +		    struct sframe_fde *fde)
> +{
> +	s32 func_off, ip_off;
> +	struct sframe_fde __user *first, *last, *mid, *found;

Need to initialize found = NULL;

> +
> +	ip_off = ip - sec->sframe_addr;
> +
> +	first = (void *)sec->fdes_addr;
> +	last = first + sec->fdes_nr;
> +	while (first <= last) {

So we trust user space to have this table sorted?

> +		mid = first + ((last - first) / 2);
> +		if (get_user(func_off, (s32 *)mid))
> +			return -EFAULT;
> +		if (ip_off >= func_off) {
> +			found = mid;

If it's not sorted, this can return the wrong value.

We should have some check that has something like:

	s32 low_func_off = 0, high_func_off = 0;

			if (low_func_off && low_func_off > func_off)
				return -EINVAL;	
			low_func_off = func_off;

> +			first = mid + 1;
> +		} else
			{ /* Note, this needs a bracket anyway, because
			     rules are, if one if block has a bracket,
			     the other needs one too */

			if (high_func_off && high_func_off < func_off)
				return -EINVAL;

			high_func_off = func_off;

> +			last = mid - 1;
		}

> +	}
> +
> +	if (!found)
> +		return -EINVAL;
> +
> +	if (copy_from_user(fde, found, sizeof(*fde)))
> +		return -EFAULT;
> +
> +	return 0;
> +}

-- Steve
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Indu Bhagat 1 month, 4 weeks ago
On 9/14/24 4:23 AM, Steven Rostedt wrote:
> On Sat, 14 Sep 2024 01:02:05 +0200
> Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> 
>> diff --git a/include/linux/sframe.h b/include/linux/sframe.h
>> new file mode 100644
>> index 000000000000..3a44f76929e2
>> --- /dev/null
>> +++ b/include/linux/sframe.h
>> @@ -0,0 +1,46 @@
>> +/* SPDX-License-Identifier: GPL-2.0 */
>> +#ifndef _LINUX_SFRAME_H
>> +#define _LINUX_SFRAME_H
>> +
>> +#include <linux/mm_types.h>
>> +
>> +struct sframe_file {
>> +	unsigned long sframe_addr, text_start, text_end;
> 
> Please make each entry a separate line:
> 
> 	unsigned long		sframe_addr;
> 	unsigned long		text_start;
> 	unsigned long		text_end;
> 
> It may be fine for declaring variables like this in a function, but it
> should not be done in a structure.
> 
>> +};
>> +
>> +struct user_unwind_frame;
>> +
>> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
>> +
>> +#define INIT_MM_SFRAME .sframe_mt = MTREE_INIT(sframe_mt, 0)
>> +
>> +extern void sframe_free_mm(struct mm_struct *mm);
>> +
>> +extern int __sframe_add_section(struct sframe_file *file);
>> +extern int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end);
>> +extern int sframe_remove_section(unsigned long sframe_addr);
>> +extern int sframe_find(unsigned long ip, struct user_unwind_frame *frame);
>> +
>> +static inline bool current_has_sframe(void)
>> +{
>> +	struct mm_struct *mm = current->mm;
>> +
>> +	return mm && !mtree_empty(&mm->sframe_mt);
>> +}
>> +
>> +#else /* !CONFIG_HAVE_USER_UNWIND_SFRAME */
>> +
>> +#define INIT_MM_SFRAME
>> +
>> +static inline void sframe_free_mm(struct mm_struct *mm) {}
>> +
>> +static inline int __sframe_add_section(struct sframe_file *file) { return -EINVAL; }
>> +static inline int sframe_add_section(unsigned long sframe_addr, unsigned long text_start, unsigned long text_end) { return -EINVAL; }
>> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -EINVAL; }
>> +static inline int sframe_find(unsigned long ip, struct user_unwind_frame *frame) { return -EINVAL; }
>> +
>> +static inline bool current_has_sframe(void) { return false; }
>> +
>> +#endif /* CONFIG_HAVE_USER_UNWIND_SFRAME */
>> +
>> +#endif /* _LINUX_SFRAME_H */
>> diff --git a/include/linux/user_unwind.h b/include/linux/user_unwind.h
>> index 0a19ac6c92b2..8003f9d35405 100644
>> --- a/include/linux/user_unwind.h
>> +++ b/include/linux/user_unwind.h
>> @@ -7,6 +7,7 @@
>>   enum user_unwind_type {
>>   	USER_UNWIND_TYPE_AUTO,
>>   	USER_UNWIND_TYPE_FP,
>> +	USER_UNWIND_TYPE_SFRAME,
>>   };
>>   
>>   struct user_unwind_frame {
>> diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h
>> index b54b313bcf07..b2aca31e1a49 100644
>> --- a/include/uapi/linux/elf.h
>> +++ b/include/uapi/linux/elf.h
>> @@ -39,6 +39,7 @@ typedef __s64	Elf64_Sxword;
>>   #define PT_GNU_STACK	(PT_LOOS + 0x474e551)
>>   #define PT_GNU_RELRO	(PT_LOOS + 0x474e552)
>>   #define PT_GNU_PROPERTY	(PT_LOOS + 0x474e553)
>> +#define PT_GNU_SFRAME	(PT_LOOS + 0x474e554)
>>   
>>   
>>   /* ARM MTE memory tag segment type */
>> diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
>> index 35791791a879..69511077c910 100644
>> --- a/include/uapi/linux/prctl.h
>> +++ b/include/uapi/linux/prctl.h
>> @@ -328,4 +328,7 @@ struct prctl_mm_map {
>>   # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
>>   # define PR_PPC_DEXCR_CTRL_MASK		0x1f
>>   
>> +#define PR_ADD_SFRAME			74
>> +#define PR_REMOVE_SFRAME		75
>> +
>>   #endif /* _LINUX_PRCTL_H */
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index cc760491f201..a216f091edfb 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -104,6 +104,7 @@
>>   #include <linux/rseq.h>
>>   #include <uapi/linux/pidfd.h>
>>   #include <linux/pidfs.h>
>> +#include <linux/sframe.h>
>>   
>>   #include <asm/pgalloc.h>
>>   #include <linux/uaccess.h>
>> @@ -923,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
>>   	mm_pasid_drop(mm);
>>   	mm_destroy_cid(mm);
>>   	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
>> +	sframe_free_mm(mm);
>>   
>>   	free_mm(mm);
>>   }
>> @@ -1249,6 +1251,13 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>>   #endif
>>   }
>>   
>> +static void mm_init_sframe(struct mm_struct *mm)
>> +{
>> +#ifdef CONFIG_HAVE_USER_UNWIND_SFRAME
>> +	mt_init(&mm->sframe_mt);
>> +#endif
>> +}
>> +
>>   static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>>   	struct user_namespace *user_ns)
>>   {
>> @@ -1280,6 +1289,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>>   	mm->pmd_huge_pte = NULL;
>>   #endif
>>   	mm_init_uprobes_state(mm);
>> +	mm_init_sframe(mm);
>>   	hugetlb_count_init(mm);
>>   
>>   	if (current->mm) {
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index 3a2df1bd9f64..e4d2b64f4ae4 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -64,6 +64,7 @@
>>   #include <linux/rcupdate.h>
>>   #include <linux/uidgid.h>
>>   #include <linux/cred.h>
>> +#include <linux/sframe.h>
>>   
>>   #include <linux/nospec.h>
>>   
>> @@ -2782,6 +2783,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
>>   	case PR_RISCV_SET_ICACHE_FLUSH_CTX:
>>   		error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
>>   		break;
>> +	case PR_ADD_SFRAME:
>> +		if (arg5)
>> +			return -EINVAL;
>> +		error = sframe_add_section(arg2, arg3, arg4);
>> +		break;
>> +	case PR_REMOVE_SFRAME:
>> +		if (arg3 || arg4 || arg5)
>> +			return -EINVAL;
>> +		error = sframe_remove_section(arg2);
>> +		break;
>>   	default:
>>   		error = -EINVAL;
>>   		break;
>> diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
>> index eb466d6a3295..6f202c5840cf 100644
>> --- a/kernel/unwind/Makefile
>> +++ b/kernel/unwind/Makefile
>> @@ -1 +1,2 @@
>>   obj-$(CONFIG_HAVE_USER_UNWIND) += user.o
>> +obj-$(CONFIG_HAVE_USER_UNWIND_SFRAME) += sframe.o
>> diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
>> new file mode 100644
>> index 000000000000..3e4d29e737a1
>> --- /dev/null
>> +++ b/kernel/unwind/sframe.c
>> @@ -0,0 +1,420 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +#include <linux/sched.h>
>> +#include <linux/slab.h>
>> +#include <linux/srcu.h>
>> +#include <linux/uaccess.h>
>> +#include <linux/mm.h>
>> +#include <linux/sframe.h>
>> +#include <linux/user_unwind.h>
>> +
>> +#include "sframe.h"
>> +
>> +#define SFRAME_FILENAME_LEN 32
>> +
>> +struct sframe_section {
>> +	struct rcu_head rcu;
>> +
>> +	unsigned long sframe_addr;
>> +	unsigned long text_addr;
>> +
>> +	unsigned long fdes_addr;
>> +	unsigned long fres_addr;
>> +	unsigned int  fdes_nr;
>> +	signed char ra_off, fp_off;
>> +};
>> +
>> +DEFINE_STATIC_SRCU(sframe_srcu);
>> +
>> +#define __SFRAME_GET_USER(out, user_ptr, type)				\
>> +({									\
>> +	type __tmp;							\
>> +	if (get_user(__tmp, (type *)user_ptr))				\
>> +		return -EFAULT;						\
>> +	user_ptr += sizeof(__tmp);					\
>> +	out = __tmp;							\
>> +})
>> +
>> +#define SFRAME_GET_USER_SIGNED(out, user_ptr, size)			\
>> +({									\
>> +	switch (size) {							\
>> +	case 1:								\
>> +		__SFRAME_GET_USER(out, user_ptr, s8);			\
>> +		break;							\
>> +	case 2:								\
>> +		__SFRAME_GET_USER(out, user_ptr, s16);			\
>> +		break;							\
>> +	case 4:								\
>> +		__SFRAME_GET_USER(out, user_ptr, s32);			\
>> +		break;							\
>> +	default:							\
>> +		return -EINVAL;						\
>> +	}								\
>> +})
>> +
>> +#define SFRAME_GET_USER_UNSIGNED(out, user_ptr, size)			\
>> +({									\
>> +	switch (size) {							\
>> +	case 1:								\
>> +		__SFRAME_GET_USER(out, user_ptr, u8);			\
>> +		break;							\
>> +	case 2:								\
>> +		__SFRAME_GET_USER(out, user_ptr, u16);			\
>> +		break;							\
>> +	case 4:								\
>> +		__SFRAME_GET_USER(out, user_ptr, u32);			\
>> +		break;							\
>> +	default:							\
>> +		return -EINVAL;						\
>> +	}								\
>> +})
>> +
>> +static unsigned char fre_type_to_size(unsigned char fre_type)
>> +{
>> +	if (fre_type > 2)
>> +		return 0;
>> +	return 1 << fre_type;
>> +}
>> +
>> +static unsigned char offset_size_enum_to_size(unsigned char off_size)
>> +{
>> +	if (off_size > 2)
>> +		return 0;
>> +	return 1 << off_size;
>> +}
>> +
>> +static int find_fde(struct sframe_section *sec, unsigned long ip,
>> +		    struct sframe_fde *fde)
>> +{
>> +	s32 func_off, ip_off;
>> +	struct sframe_fde __user *first, *last, *mid, *found;
> 
> Need to initialize found = NULL;
> 
>> +
>> +	ip_off = ip - sec->sframe_addr;
>> +
>> +	first = (void *)sec->fdes_addr;
>> +	last = first + sec->fdes_nr;
>> +	while (first <= last) {
> 
> So we trust user space to have this table sorted?
> 

GNU ld will create this table sorted when linking .sframe sections and 
will set SFRAME_F_FDE_SORTED in flags in the output .sframe section.  In 
the current patch, I see the __sframe_add_section () includes a check 
for SFRAME_F_FDE_SORTED for admitting SFrame sections.

So proceeding here with the assumption that the SFrame FDE list is 
sorted should work fine.

>> +		mid = first + ((last - first) / 2);
>> +		if (get_user(func_off, (s32 *)mid))
>> +			return -EFAULT;
>> +		if (ip_off >= func_off) {
>> +			found = mid;
> 
> If it's not sorted, this can return the wrong value.
> 
> We should have some check that has something like:
> 
> 	s32 low_func_off = 0, high_func_off = 0;
> 
> 			if (low_func_off && low_func_off > func_off)
> 				return -EINVAL;	
> 			low_func_off = func_off;
> 
>> +			first = mid + 1;
>> +		} else
> 			{ /* Note, this needs a bracket anyway, because
> 			     rules are, if one if block has a bracket,
> 			     the other needs one too */
> 
> 			if (high_func_off && high_func_off < func_off)
> 				return -EINVAL;
> 
> 			high_func_off = func_off;
> 
>> +			last = mid - 1;
> 		}
> 
>> +	}
>> +
>> +	if (!found)
>> +		return -EINVAL;
>> +
>> +	if (copy_from_user(fde, found, sizeof(*fde)))
>> +		return -EFAULT;
>> +
>> +	return 0;
>> +}
> 
> -- Steve
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Steven Rostedt 1 month, 4 weeks ago
On Tue, 1 Oct 2024 11:20:35 -0700
Indu Bhagat <indu.bhagat@oracle.com> wrote:

> > So we trust user space to have this table sorted?
> >   
> 
> GNU ld will create this table sorted when linking .sframe sections and 
> will set SFRAME_F_FDE_SORTED in flags in the output .sframe section.  In 
> the current patch, I see the __sframe_add_section () includes a check 
> for SFRAME_F_FDE_SORTED for admitting SFrame sections.
> 
> So proceeding here with the assumption that the SFrame FDE list is 
> sorted should work fine.

No not at all! We *cannot trust* user space. This could lead to a security
hole if we assume it's sorted. The kernel must not trust anything it
receives from user space. Because an attacker will be looking for ways to
confuse the kernel to exploit it.

When I look at code that reads user space, I do not look at it as if it
were made by the compiler. I look at it as if it were made by someone
that's trying to find ways to crack the system. Every read from user space
*must* be validated *every* time it's read. It can not even validate it
once and then think its immutable (unless the kernel actually made it
immutable).

-- Steve
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Florian Weimer 1 month, 4 weeks ago
* Steven Rostedt:

> On Tue, 1 Oct 2024 11:20:35 -0700
> Indu Bhagat <indu.bhagat@oracle.com> wrote:
>
>> > So we trust user space to have this table sorted?
>> >   
>> 
>> GNU ld will create this table sorted when linking .sframe sections and 
>> will set SFRAME_F_FDE_SORTED in flags in the output .sframe section.  In 
>> the current patch, I see the __sframe_add_section () includes a check 
>> for SFRAME_F_FDE_SORTED for admitting SFrame sections.
>> 
>> So proceeding here with the assumption that the SFrame FDE list is 
>> sorted should work fine.
>
> No not at all! We *cannot trust* user space. This could lead to a security
> hole if we assume it's sorted. The kernel must not trust anything it
> receives from user space. Because an attacker will be looking for ways to
> confuse the kernel to exploit it.

I don't quite understand, sorry.

Doing a binary search on an unordered table fails to find some entries
that could be discovered by a linear scan.  But an attacker could just
as well use an incomplete table from the start.  So assuming an ordered
table seems rather unlikely to introduce additional problems.  (Given
the lack of a formal threat model, it's impossible to make more precise
claims in either direction.)

Thanks,
Florian
Re: [PATCH v2 03/11] unwind: Introduce SFrame user space unwinding
Posted by Steven Rostedt 1 month, 4 weeks ago
On Wed, 02 Oct 2024 10:18:21 +0200
Florian Weimer <fweimer@redhat.com> wrote:
> 
> I don't quite understand, sorry.
> 
> Doing a binary search on an unordered table fails to find some entries
> that could be discovered by a linear scan.  But an attacker could just
> as well use an incomplete table from the start.  So assuming an ordered
> table seems rather unlikely to introduce additional problems.  (Given
> the lack of a formal threat model, it's impossible to make more precise
> claims in either direction.)

Basically, the idea is if anything is out of place, scrap the entire
process. An unordered table can give unpredictable results, that could be
used latter as a gadget. If the kernel expects a sorted table and it ends
up not being sorted, it should automatically flag it as corrupt and stop
all processing.

The kernel doesn't need to scan the entire table each time to see if it is
sorted, that would kill the point of it being sorted in the first place.
But it can check that the values merge towards a correct answer. All it
needs to do is keep track of the current highest and lowest values, and if
it finds something outside that range, it should abort immediately.

The effort needed to validate is very low, so it should be done.

-- Steve