In preparation for unwinding user space stacks with sframe, add basic
sframe compile infrastructure and support for reading the .sframe
section header.
sframe_add_section() reads the header and unconditionally returns an
error, so it's not very useful yet. A subsequent patch will improve
that.
Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
---
arch/Kconfig | 3 +
include/linux/sframe.h | 36 +++++++++++
kernel/unwind/Makefile | 3 +-
kernel/unwind/sframe.c | 136 +++++++++++++++++++++++++++++++++++++++++
kernel/unwind/sframe.h | 71 +++++++++++++++++++++
5 files changed, 248 insertions(+), 1 deletion(-)
create mode 100644 include/linux/sframe.h
create mode 100644 kernel/unwind/sframe.c
create mode 100644 kernel/unwind/sframe.h
diff --git a/arch/Kconfig b/arch/Kconfig
index f1f7a3857c97..23edd0e4e16a 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -446,6 +446,9 @@ config HAVE_UNWIND_USER_COMPAT_FP
bool
depends on HAVE_UNWIND_USER_FP
+config HAVE_UNWIND_USER_SFRAME
+ bool
+
config AS_SFRAME
def_bool $(as-instr,.cfi_sections .sframe\n.cfi_startproc\n.cfi_endproc)
diff --git a/include/linux/sframe.h b/include/linux/sframe.h
new file mode 100644
index 000000000000..3bfaf21869c2
--- /dev/null
+++ b/include/linux/sframe.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_SFRAME_H
+#define _LINUX_SFRAME_H
+
+#include <linux/mm_types.h>
+#include <linux/unwind_user_types.h>
+
+#ifdef CONFIG_HAVE_UNWIND_USER_SFRAME
+
+struct sframe_section {
+ unsigned long sframe_start;
+ unsigned long sframe_end;
+ unsigned long text_start;
+ unsigned long text_end;
+
+ unsigned long fdes_start;
+ unsigned long fres_start;
+ unsigned long fres_end;
+ unsigned int num_fdes;
+
+ signed char ra_off;
+ signed char fp_off;
+};
+
+extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
+ unsigned long text_start, unsigned long text_end);
+extern int sframe_remove_section(unsigned long sframe_addr);
+
+#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
+static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
+
+#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
+
+#endif /* _LINUX_SFRAME_H */
diff --git a/kernel/unwind/Makefile b/kernel/unwind/Makefile
index 349ce3677526..f70380d7a6a6 100644
--- a/kernel/unwind/Makefile
+++ b/kernel/unwind/Makefile
@@ -1 +1,2 @@
- obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_UNWIND_USER) += user.o
+ obj-$(CONFIG_HAVE_UNWIND_USER_SFRAME) += sframe.o
diff --git a/kernel/unwind/sframe.c b/kernel/unwind/sframe.c
new file mode 100644
index 000000000000..20287f795b36
--- /dev/null
+++ b/kernel/unwind/sframe.c
@@ -0,0 +1,136 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Userspace sframe access functions
+ */
+
+#define pr_fmt(fmt) "sframe: " fmt
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/uaccess.h>
+#include <linux/mm.h>
+#include <linux/string_helpers.h>
+#include <linux/sframe.h>
+#include <linux/unwind_user_types.h>
+
+#include "sframe.h"
+
+#define dbg(fmt, ...) \
+ pr_debug("%s (%d): " fmt, current->comm, current->pid, ##__VA_ARGS__)
+
+static void free_section(struct sframe_section *sec)
+{
+ kfree(sec);
+}
+
+static int sframe_read_header(struct sframe_section *sec)
+{
+ unsigned long header_end, fdes_start, fdes_end, fres_start, fres_end;
+ struct sframe_header shdr;
+ unsigned int num_fdes;
+
+ if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
+ dbg("header usercopy failed\n");
+ return -EFAULT;
+ }
+
+ if (shdr.preamble.magic != SFRAME_MAGIC ||
+ shdr.preamble.version != SFRAME_VERSION_2 ||
+ !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
+ shdr.auxhdr_len) {
+ dbg("bad/unsupported sframe header\n");
+ return -EINVAL;
+ }
+
+ if (!shdr.num_fdes || !shdr.num_fres) {
+ dbg("no fde/fre entries\n");
+ return -EINVAL;
+ }
+
+ header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
+ if (header_end >= sec->sframe_end) {
+ dbg("header doesn't fit in section\n");
+ return -EINVAL;
+ }
+
+ num_fdes = shdr.num_fdes;
+ fdes_start = header_end + shdr.fdes_off;
+ fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde));
+
+ fres_start = header_end + shdr.fres_off;
+ fres_end = fres_start + shdr.fre_len;
+
+ if (fres_start < fdes_end || fres_end > sec->sframe_end) {
+ dbg("inconsistent fde/fre offsets\n");
+ return -EINVAL;
+ }
+
+ sec->num_fdes = num_fdes;
+ sec->fdes_start = fdes_start;
+ sec->fres_start = fres_start;
+ sec->fres_end = fres_end;
+
+ sec->ra_off = shdr.cfa_fixed_ra_offset;
+ sec->fp_off = shdr.cfa_fixed_fp_offset;
+
+ return 0;
+}
+
+int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
+ unsigned long text_start, unsigned long text_end)
+{
+ struct maple_tree *sframe_mt = ¤t->mm->sframe_mt;
+ struct vm_area_struct *sframe_vma, *text_vma;
+ struct mm_struct *mm = current->mm;
+ struct sframe_section *sec;
+ int ret;
+
+ if (!sframe_start || !sframe_end || !text_start || !text_end) {
+ dbg("zero-length sframe/text address\n");
+ return -EINVAL;
+ }
+
+ scoped_guard(mmap_read_lock, mm) {
+ sframe_vma = vma_lookup(mm, sframe_start);
+ if (!sframe_vma || sframe_end > sframe_vma->vm_end) {
+ dbg("bad sframe address (0x%lx - 0x%lx)\n",
+ sframe_start, sframe_end);
+ return -EINVAL;
+ }
+
+ text_vma = vma_lookup(mm, text_start);
+ if (!text_vma ||
+ !(text_vma->vm_flags & VM_EXEC) ||
+ text_end > text_vma->vm_end) {
+ dbg("bad text address (0x%lx - 0x%lx)\n",
+ text_start, text_end);
+ return -EINVAL;
+ }
+ }
+
+ sec = kzalloc(sizeof(*sec), GFP_KERNEL);
+ if (!sec)
+ return -ENOMEM;
+
+ sec->sframe_start = sframe_start;
+ sec->sframe_end = sframe_end;
+ sec->text_start = text_start;
+ sec->text_end = text_end;
+
+ ret = sframe_read_header(sec);
+ if (ret)
+ goto err_free;
+
+ /* TODO nowhere to store it yet - just free it and return an error */
+ ret = -ENOSYS;
+
+err_free:
+ free_section(sec);
+ return ret;
+}
+
+int sframe_remove_section(unsigned long sframe_start)
+{
+ return -ENOSYS;
+}
diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
new file mode 100644
index 000000000000..e9bfccfaf5b4
--- /dev/null
+++ b/kernel/unwind/sframe.h
@@ -0,0 +1,71 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * From https://www.sourceware.org/binutils/docs/sframe-spec.html
+ */
+#ifndef _SFRAME_H
+#define _SFRAME_H
+
+#include <linux/types.h>
+
+#define SFRAME_VERSION_1 1
+#define SFRAME_VERSION_2 2
+#define SFRAME_MAGIC 0xdee2
+
+#define SFRAME_F_FDE_SORTED 0x1
+#define SFRAME_F_FRAME_POINTER 0x2
+
+#define SFRAME_ABI_AARCH64_ENDIAN_BIG 1
+#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE 2
+#define SFRAME_ABI_AMD64_ENDIAN_LITTLE 3
+
+#define SFRAME_FDE_TYPE_PCINC 0
+#define SFRAME_FDE_TYPE_PCMASK 1
+
+struct sframe_preamble {
+ u16 magic;
+ u8 version;
+ u8 flags;
+} __packed;
+
+struct sframe_header {
+ struct sframe_preamble preamble;
+ u8 abi_arch;
+ s8 cfa_fixed_fp_offset;
+ s8 cfa_fixed_ra_offset;
+ u8 auxhdr_len;
+ u32 num_fdes;
+ u32 num_fres;
+ u32 fre_len;
+ u32 fdes_off;
+ u32 fres_off;
+} __packed;
+
+#define SFRAME_HEADER_SIZE(header) \
+ ((sizeof(struct sframe_header) + header.auxhdr_len))
+
+#define SFRAME_AARCH64_PAUTH_KEY_A 0
+#define SFRAME_AARCH64_PAUTH_KEY_B 1
+
+struct sframe_fde {
+ s32 start_addr;
+ u32 func_size;
+ u32 fres_off;
+ u32 fres_num;
+ u8 info;
+ u8 rep_size;
+ u16 padding;
+} __packed;
+
+#define SFRAME_FUNC_FRE_TYPE(data) (data & 0xf)
+#define SFRAME_FUNC_FDE_TYPE(data) ((data >> 4) & 0x1)
+#define SFRAME_FUNC_PAUTH_KEY(data) ((data >> 5) & 0x1)
+
+#define SFRAME_BASE_REG_FP 0
+#define SFRAME_BASE_REG_SP 1
+
+#define SFRAME_FRE_CFA_BASE_REG_ID(data) (data & 0x1)
+#define SFRAME_FRE_OFFSET_COUNT(data) ((data >> 1) & 0xf)
+#define SFRAME_FRE_OFFSET_SIZE(data) ((data >> 5) & 0x3)
+#define SFRAME_FRE_MANGLED_RA_P(data) ((data >> 7) & 0x1)
+
+#endif /* _SFRAME_H */
--
2.48.1
On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> In preparation for unwinding user space stacks with sframe, add basic
> sframe compile infrastructure and support for reading the .sframe
> section header.
>
> sframe_add_section() reads the header and unconditionally returns an
> error, so it's not very useful yet. A subsequent patch will improve
> that.
>
> Signed-off-by: Josh Poimboeuf <jpoimboe@kernel.org>
> ---
> arch/Kconfig | 3 +
> include/linux/sframe.h | 36 +++++++++++
> kernel/unwind/Makefile | 3 +-
> kernel/unwind/sframe.c | 136 +++++++++++++++++++++++++++++++++++++++++
> kernel/unwind/sframe.h | 71 +++++++++++++++++++++
> 5 files changed, 248 insertions(+), 1 deletion(-)
> create mode 100644 include/linux/sframe.h
> create mode 100644 kernel/unwind/sframe.c
> create mode 100644 kernel/unwind/sframe.h
>
[...]
> +
> +extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
> + unsigned long text_start, unsigned long text_end);
> +extern int sframe_remove_section(unsigned long sframe_addr);
> +
> +#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
> +
> +static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
nit: very-very long, wrap it?
> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
> +
> +#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
> +
> +#endif /* _LINUX_SFRAME_H */
[...]
> +static int sframe_read_header(struct sframe_section *sec)
> +{
> + unsigned long header_end, fdes_start, fdes_end, fres_start, fres_end;
> + struct sframe_header shdr;
> + unsigned int num_fdes;
> +
> + if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
> + dbg("header usercopy failed\n");
> + return -EFAULT;
> + }
> +
> + if (shdr.preamble.magic != SFRAME_MAGIC ||
> + shdr.preamble.version != SFRAME_VERSION_2 ||
> + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
probably more a question to Indu, but why is this sorting not
mandatory and part of SFrame "standard"? How realistically non-sorted
FDEs would work in practice? Ain't nobody got time to sort them just
to unwind the stack...
> + shdr.auxhdr_len) {
> + dbg("bad/unsupported sframe header\n");
> + return -EINVAL;
> + }
> +
> + if (!shdr.num_fdes || !shdr.num_fres) {
given SFRAME_F_FRAME_POINTER in the header, is it really that
nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
that?
> + dbg("no fde/fre entries\n");
> + return -EINVAL;
> + }
> +
> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> + if (header_end >= sec->sframe_end) {
if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
> + dbg("header doesn't fit in section\n");
> + return -EINVAL;
> + }
> +
> + num_fdes = shdr.num_fdes;
> + fdes_start = header_end + shdr.fdes_off;
> + fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde));
> +
> + fres_start = header_end + shdr.fres_off;
> + fres_end = fres_start + shdr.fre_len;
> +
maybe use check_add_overflow() in all the above calculation, at least
on 32-bit arches this all can overflow and it's not clear if below
sanity check detects all possible overflows
> + if (fres_start < fdes_end || fres_end > sec->sframe_end) {
> + dbg("inconsistent fde/fre offsets\n");
> + return -EINVAL;
> + }
> +
> + sec->num_fdes = num_fdes;
> + sec->fdes_start = fdes_start;
> + sec->fres_start = fres_start;
> + sec->fres_end = fres_end;
> +
> + sec->ra_off = shdr.cfa_fixed_ra_offset;
> + sec->fp_off = shdr.cfa_fixed_fp_offset;
> +
> + return 0;
> +}
> +
[...]
> diff --git a/kernel/unwind/sframe.h b/kernel/unwind/sframe.h
> new file mode 100644
> index 000000000000..e9bfccfaf5b4
> --- /dev/null
> +++ b/kernel/unwind/sframe.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * From https://www.sourceware.org/binutils/docs/sframe-spec.html
> + */
> +#ifndef _SFRAME_H
> +#define _SFRAME_H
> +
> +#include <linux/types.h>
> +
> +#define SFRAME_VERSION_1 1
> +#define SFRAME_VERSION_2 2
> +#define SFRAME_MAGIC 0xdee2
> +
> +#define SFRAME_F_FDE_SORTED 0x1
> +#define SFRAME_F_FRAME_POINTER 0x2
> +
> +#define SFRAME_ABI_AARCH64_ENDIAN_BIG 1
> +#define SFRAME_ABI_AARCH64_ENDIAN_LITTLE 2
> +#define SFRAME_ABI_AMD64_ENDIAN_LITTLE 3
> +
> +#define SFRAME_FDE_TYPE_PCINC 0
> +#define SFRAME_FDE_TYPE_PCMASK 1
> +
> +struct sframe_preamble {
> + u16 magic;
> + u8 version;
> + u8 flags;
> +} __packed;
> +
> +struct sframe_header {
> + struct sframe_preamble preamble;
> + u8 abi_arch;
> + s8 cfa_fixed_fp_offset;
> + s8 cfa_fixed_ra_offset;
> + u8 auxhdr_len;
> + u32 num_fdes;
> + u32 num_fres;
> + u32 fre_len;
> + u32 fdes_off;
> + u32 fres_off;
> +} __packed;
> +
> +#define SFRAME_HEADER_SIZE(header) \
> + ((sizeof(struct sframe_header) + header.auxhdr_len))
> +
> +#define SFRAME_AARCH64_PAUTH_KEY_A 0
> +#define SFRAME_AARCH64_PAUTH_KEY_B 1
> +
> +struct sframe_fde {
> + s32 start_addr;
> + u32 func_size;
> + u32 fres_off;
> + u32 fres_num;
> + u8 info;
> + u8 rep_size;
> + u16 padding;
> +} __packed;
I couldn't understand from SFrame itself, but why do sframe_header,
sframe_preamble, and sframe_fde have to be marked __packed, if it's
all naturally aligned (intentionally and by design)?..
> +
> +#define SFRAME_FUNC_FRE_TYPE(data) (data & 0xf)
> +#define SFRAME_FUNC_FDE_TYPE(data) ((data >> 4) & 0x1)
> +#define SFRAME_FUNC_PAUTH_KEY(data) ((data >> 5) & 0x1)
> +
> +#define SFRAME_BASE_REG_FP 0
> +#define SFRAME_BASE_REG_SP 1
> +
> +#define SFRAME_FRE_CFA_BASE_REG_ID(data) (data & 0x1)
> +#define SFRAME_FRE_OFFSET_COUNT(data) ((data >> 1) & 0xf)
> +#define SFRAME_FRE_OFFSET_SIZE(data) ((data >> 5) & 0x3)
> +#define SFRAME_FRE_MANGLED_RA_P(data) ((data >> 7) & 0x1)
> +
> +#endif /* _SFRAME_H */
> --
> 2.48.1
>
On 1/24/25 10:00 AM, Andrii Nakryiko wrote:
> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf<jpoimboe@kernel.org> wrote:
>> In preparation for unwinding user space stacks with sframe, add basic
>> sframe compile infrastructure and support for reading the .sframe
>> section header.
>>
>> sframe_add_section() reads the header and unconditionally returns an
>> error, so it's not very useful yet. A subsequent patch will improve
>> that.
>>
>> Signed-off-by: Josh Poimboeuf<jpoimboe@kernel.org>
>> ---
>> arch/Kconfig | 3 +
>> include/linux/sframe.h | 36 +++++++++++
>> kernel/unwind/Makefile | 3 +-
>> kernel/unwind/sframe.c | 136 +++++++++++++++++++++++++++++++++++++++++
>> kernel/unwind/sframe.h | 71 +++++++++++++++++++++
>> 5 files changed, 248 insertions(+), 1 deletion(-)
>> create mode 100644 include/linux/sframe.h
>> create mode 100644 kernel/unwind/sframe.c
>> create mode 100644 kernel/unwind/sframe.h
>>
> [...]
>
>> +
>> +extern int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end,
>> + unsigned long text_start, unsigned long text_end);
>> +extern int sframe_remove_section(unsigned long sframe_addr);
>> +
>> +#else /* !CONFIG_HAVE_UNWIND_USER_SFRAME */
>> +
>> +static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
> nit: very-very long, wrap it?
>
>> +static inline int sframe_remove_section(unsigned long sframe_addr) { return -ENOSYS; }
>> +
>> +#endif /* CONFIG_HAVE_UNWIND_USER_SFRAME */
>> +
>> +#endif /* _LINUX_SFRAME_H */
> [...]
>
>> +static int sframe_read_header(struct sframe_section *sec)
>> +{
>> + unsigned long header_end, fdes_start, fdes_end, fres_start, fres_end;
>> + struct sframe_header shdr;
>> + unsigned int num_fdes;
>> +
>> + if (copy_from_user(&shdr, (void __user *)sec->sframe_start, sizeof(shdr))) {
>> + dbg("header usercopy failed\n");
>> + return -EFAULT;
>> + }
>> +
>> + if (shdr.preamble.magic != SFRAME_MAGIC ||
>> + shdr.preamble.version != SFRAME_VERSION_2 ||
>> + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> probably more a question to Indu, but why is this sorting not
> mandatory and part of SFrame "standard"? How realistically non-sorted
> FDEs would work in practice? Ain't nobody got time to sort them just
> to unwind the stack...
It is not worthwhile for the assembler (even wasteful as it adds to
build time for nothing) to generate an .sframe section with FDEs in
sorted order of start PC of function. This is because the final order
is decided by the linker as it merges all input sections.
Thats one reason why it is already necessary that the specification
allows SFRAME_F_FDE_SORTED not set in the section. I can also see how
not making the sorting mandatory may also be necessary for JIT use-case..
FWIW, for non-JIT environments, non-sorted FDEs are not expected in
linked binaries; such a thing does not seem to be useful in practice.
Hope that helps
Indu
On Fri, Jan 24, 2025 at 10:00:52AM -0800, Andrii Nakryiko wrote:
> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > +static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
>
> nit: very-very long, wrap it?
That was intentional as it's just an empty stub, but yeah, maybe 160
chars is a bit much.
> > + if (shdr.preamble.magic != SFRAME_MAGIC ||
> > + shdr.preamble.version != SFRAME_VERSION_2 ||
> > + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
>
> probably more a question to Indu, but why is this sorting not
> mandatory and part of SFrame "standard"? How realistically non-sorted
> FDEs would work in practice? Ain't nobody got time to sort them just
> to unwind the stack...
No idea...
> > + if (!shdr.num_fdes || !shdr.num_fres) {
>
> given SFRAME_F_FRAME_POINTER in the header, is it really that
> nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
> that?
It would seem a bit silly to create an empty .sframe section just to set
that SFRAME_F_FRAME_POINTER bit. Regardless, there's nothing the kernel
can do with that.
> > + dbg("no fde/fre entries\n");
> > + return -EINVAL;
> > + }
> > +
> > + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> > + if (header_end >= sec->sframe_end) {
>
> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
I suppose so, but again I'm not seeing any reason to support that.
> > + dbg("header doesn't fit in section\n");
> > + return -EINVAL;
> > + }
> > +
> > + num_fdes = shdr.num_fdes;
> > + fdes_start = header_end + shdr.fdes_off;
> > + fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde));
> > +
> > + fres_start = header_end + shdr.fres_off;
> > + fres_end = fres_start + shdr.fre_len;
> > +
>
> maybe use check_add_overflow() in all the above calculation, at least
> on 32-bit arches this all can overflow and it's not clear if below
> sanity check detects all possible overflows
Ok, I'll look into it.
> > +struct sframe_preamble {
> > + u16 magic;
> > + u8 version;
> > + u8 flags;
> > +} __packed;
> > +
> > +struct sframe_header {
> > + struct sframe_preamble preamble;
> > + u8 abi_arch;
> > + s8 cfa_fixed_fp_offset;
> > + s8 cfa_fixed_ra_offset;
> > + u8 auxhdr_len;
> > + u32 num_fdes;
> > + u32 num_fres;
> > + u32 fre_len;
> > + u32 fdes_off;
> > + u32 fres_off;
> > +} __packed;
> > +
> > +struct sframe_fde {
> > + s32 start_addr;
> > + u32 func_size;
> > + u32 fres_off;
> > + u32 fres_num;
> > + u8 info;
> > + u8 rep_size;
> > + u16 padding;
> > +} __packed;
>
> I couldn't understand from SFrame itself, but why do sframe_header,
> sframe_preamble, and sframe_fde have to be marked __packed, if it's
> all naturally aligned (intentionally and by design)?..
Right, but the spec says they're all packed. Maybe the point is that
some future sframe version is free to introduce unaligned fields.
--
Josh
On 1/24/25 11:21 AM, Josh Poimboeuf wrote:
> On Fri, Jan 24, 2025 at 10:00:52AM -0800, Andrii Nakryiko wrote:
>> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>>> +static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
>>
>> nit: very-very long, wrap it?
>
> That was intentional as it's just an empty stub, but yeah, maybe 160
> chars is a bit much.
>
>>> + if (shdr.preamble.magic != SFRAME_MAGIC ||
>>> + shdr.preamble.version != SFRAME_VERSION_2 ||
>>> + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
>>
>> probably more a question to Indu, but why is this sorting not
>> mandatory and part of SFrame "standard"? How realistically non-sorted
>> FDEs would work in practice? Ain't nobody got time to sort them just
>> to unwind the stack...
>
> No idea...
>
>>> + if (!shdr.num_fdes || !shdr.num_fres) {
>>
>> given SFRAME_F_FRAME_POINTER in the header, is it really that
>> nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
>> that?
>
> It would seem a bit silly to create an empty .sframe section just to set
> that SFRAME_F_FRAME_POINTER bit. Regardless, there's nothing the kernel
> can do with that.
>
Yes, in theory, it is allowed (as per the specification) to have an
SFrame section with zero number of FDEs/FREs. But since such a section
will not be useful, I share the opinion that it makes sense to disallow
it in the current unwinding contexts, for now (JIT usecase may change
things later).
SFRAME_F_FRAME_POINTER flag is not being set currently by GAS/GNU ld at all.
>>> + dbg("no fde/fre entries\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
>>> + if (header_end >= sec->sframe_end) {
>>
>> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
>
> I suppose so, but again I'm not seeing any reason to support that.
>
>>> + dbg("header doesn't fit in section\n");
>>> + return -EINVAL;
>>> + }
>>> +
>>> + num_fdes = shdr.num_fdes;
>>> + fdes_start = header_end + shdr.fdes_off;
>>> + fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde));
>>> +
>>> + fres_start = header_end + shdr.fres_off;
>>> + fres_end = fres_start + shdr.fre_len;
>>> +
>>
>> maybe use check_add_overflow() in all the above calculation, at least
>> on 32-bit arches this all can overflow and it's not clear if below
>> sanity check detects all possible overflows
>
> Ok, I'll look into it.
>
>>> +struct sframe_preamble {
>>> + u16 magic;
>>> + u8 version;
>>> + u8 flags;
>>> +} __packed;
>>> +
>>> +struct sframe_header {
>>> + struct sframe_preamble preamble;
>>> + u8 abi_arch;
>>> + s8 cfa_fixed_fp_offset;
>>> + s8 cfa_fixed_ra_offset;
>>> + u8 auxhdr_len;
>>> + u32 num_fdes;
>>> + u32 num_fres;
>>> + u32 fre_len;
>>> + u32 fdes_off;
>>> + u32 fres_off;
>>> +} __packed;
>>> +
>>> +struct sframe_fde {
>>> + s32 start_addr;
>>> + u32 func_size;
>>> + u32 fres_off;
>>> + u32 fres_num;
>>> + u8 info;
>>> + u8 rep_size;
>>> + u16 padding;
>>> +} __packed;
>>
>> I couldn't understand from SFrame itself, but why do sframe_header,
>> sframe_preamble, and sframe_fde have to be marked __packed, if it's
>> all naturally aligned (intentionally and by design)?..
>
> Right, but the spec says they're all packed. Maybe the point is that
> some future sframe version is free to introduce unaligned fields.
>
SFrame specification aims to keep SFrame header and SFrame FDE members
at aligned boundaries in future versions.
Only SFrame FRE related accesses may have unaligned accesses.
On Fri, Jan 24, 2025 at 2:14 PM Indu Bhagat <indu.bhagat@oracle.com> wrote:
>
> On 1/24/25 11:21 AM, Josh Poimboeuf wrote:
> > On Fri, Jan 24, 2025 at 10:00:52AM -0800, Andrii Nakryiko wrote:
> >> On Tue, Jan 21, 2025 at 6:32 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> >>> +static inline int sframe_add_section(unsigned long sframe_start, unsigned long sframe_end, unsigned long text_start, unsigned long text_end) { return -ENOSYS; }
> >>
> >> nit: very-very long, wrap it?
> >
> > That was intentional as it's just an empty stub, but yeah, maybe 160
> > chars is a bit much.
> >
> >>> + if (shdr.preamble.magic != SFRAME_MAGIC ||
> >>> + shdr.preamble.version != SFRAME_VERSION_2 ||
> >>> + !(shdr.preamble.flags & SFRAME_F_FDE_SORTED) ||
> >>
> >> probably more a question to Indu, but why is this sorting not
> >> mandatory and part of SFrame "standard"? How realistically non-sorted
> >> FDEs would work in practice? Ain't nobody got time to sort them just
> >> to unwind the stack...
> >
> > No idea...
> >
> >>> + if (!shdr.num_fdes || !shdr.num_fres) {
> >>
> >> given SFRAME_F_FRAME_POINTER in the header, is it really that
> >> nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
> >> that?
> >
> > It would seem a bit silly to create an empty .sframe section just to set
> > that SFRAME_F_FRAME_POINTER bit. Regardless, there's nothing the kernel
> > can do with that.
> >
>
> Yes, in theory, it is allowed (as per the specification) to have an
> SFrame section with zero number of FDEs/FREs. But since such a section
> will not be useful, I share the opinion that it makes sense to disallow
> it in the current unwinding contexts, for now (JIT usecase may change
> things later).
>
I disagree, actually. If it's a legal thing, it shouldn't be randomly
rejected. If we later make use of that, we'd have to worry not to
accidentally cause problems on older kernels that arbitrarily rejected
empty FDE just because it didn't make sense at some point (without
causing any issues).
> SFRAME_F_FRAME_POINTER flag is not being set currently by GAS/GNU ld at all.
>
> >>> + dbg("no fde/fre entries\n");
> >>> + return -EINVAL;
> >>> + }
> >>> +
> >>> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> >>> + if (header_end >= sec->sframe_end) {
> >>
> >> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
> >
> > I suppose so, but again I'm not seeing any reason to support that.
Let's invert this. Is there any reason why it shouldn't be supported? ;)
> >
> >>> + dbg("header doesn't fit in section\n");
> >>> + return -EINVAL;
> >>> + }
> >>> +
> >>> + num_fdes = shdr.num_fdes;
> >>> + fdes_start = header_end + shdr.fdes_off;
> >>> + fdes_end = fdes_start + (num_fdes * sizeof(struct sframe_fde));
> >>> +
> >>> + fres_start = header_end + shdr.fres_off;
> >>> + fres_end = fres_start + shdr.fre_len;
> >>> +
> >>
> >> maybe use check_add_overflow() in all the above calculation, at least
> >> on 32-bit arches this all can overflow and it's not clear if below
> >> sanity check detects all possible overflows
> >
> > Ok, I'll look into it.
> >
> >>> +struct sframe_preamble {
> >>> + u16 magic;
> >>> + u8 version;
> >>> + u8 flags;
> >>> +} __packed;
> >>> +
> >>> +struct sframe_header {
> >>> + struct sframe_preamble preamble;
> >>> + u8 abi_arch;
> >>> + s8 cfa_fixed_fp_offset;
> >>> + s8 cfa_fixed_ra_offset;
> >>> + u8 auxhdr_len;
> >>> + u32 num_fdes;
> >>> + u32 num_fres;
> >>> + u32 fre_len;
> >>> + u32 fdes_off;
> >>> + u32 fres_off;
> >>> +} __packed;
> >>> +
> >>> +struct sframe_fde {
> >>> + s32 start_addr;
> >>> + u32 func_size;
> >>> + u32 fres_off;
> >>> + u32 fres_num;
> >>> + u8 info;
> >>> + u8 rep_size;
> >>> + u16 padding;
> >>> +} __packed;
> >>
> >> I couldn't understand from SFrame itself, but why do sframe_header,
> >> sframe_preamble, and sframe_fde have to be marked __packed, if it's
> >> all naturally aligned (intentionally and by design)?..
> >
> > Right, but the spec says they're all packed. Maybe the point is that
> > some future sframe version is free to introduce unaligned fields.
> >
>
> SFrame specification aims to keep SFrame header and SFrame FDE members
> at aligned boundaries in future versions.
>
> Only SFrame FRE related accesses may have unaligned accesses.
Yeah, and it's actually bothering me quite a lot :) I have a tentative
proposal, maybe we can discuss this for SFrame v3? Let me briefly
outline the idea.
So, currently in v2, FREs within FDEs use an array-of-structs layout.
If we use preudo-C type definitions, it would be something like this
for FDE + its FREs:
struct FDE_and_FREs {
struct sframe_func_desc_entry fde_metadata;
union FRE {
struct FRE8 {
u8 sfre_start_address;
u8 sfre_info;
u8|u16|u32 offsets[M];
}
struct FRE16 {
u16 sfre_start_address;
u16 sfre_info;
u8|u16|u32 offsets[M];
}
struct FRE32 {
u32 sfre_start_address;
u32 sfre_info;
u8|u16|u32 offsets[M];
}
} fres[N] __packed;
};
where all fres[i]s are one of those FRE8/FRE16/FRE32, so start
addresses have the same size, but each FRE has potentially different
offsets sizing, so there is no common alignment, and so everything has
to be packed and unaligned.
But what if we take a struct-of-arrays approach and represent it more like:
struct FDE_and_FREs {
struct sframe_func_desc_entry fde_metadata;
u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
u8 sfre_infos[N];
u8 offsets8[M8];
u16 offsets16[M16] __aligned(2);
u32 offsets32[M32] __aligned(4);
/* we can naturally extend to support also u64 offsets */
};
i.e., we split all FRE records into their three constituents: start
addresses, info bytes, and then each FRE can fall into either 8-, 16-,
or 32-bit offsets "bucket". We collect all the offsets, depending on
their size, into these aligned offsets{8,16,32} arrays (with natural
extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
ensure proper alignment everywhere.
Note, at this point we need to decide if we want to make FREs binary
searchable or not.
If not, we don't really need anything extra. As we process each
start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
of how many 8-, 16-, and 32-bit offsets already processed FREs
consumed, and when we find the right one, we know exactly the starting
index within offset{8,16,32}. Done.
But if we were to make FREs binary searchable, we need to basically
have an index of offset pointers to quickly find offsetsX[j] position
corresponding to FRE #i. For that, we can have an extra array right
next to start_addrs, "semantically parallel" to it:
u8|u16|u32 start_addrs[N];
u8|u16|u32 offset_idxs[N];
where start_addrs[i] corresponds to offset_idxs[i], and offset_idxs[i]
points to the first offset corresponding to FRE #i in offsetX[] array
(depending on FRE's "bitness"). This is a bit more storage for this
offset index, but for FDEs with lots of FREs this might be a
worthwhile tradeoff.
Few points:
a) we can decide this "binary searchability" per-FDE, and for FDEs
with 1-2-3 FREs not bother, while those with more FREs would be
searchable ones with index. So we can combine both fast lookups,
natural alignment of on-disk format, and compactness. The presence of
index is just another bit in FDE metadata.
b) bitness of offset_idxs[] can be coupled with bitness of
start_addrs (for simplicity), or could be completely independent and
identified by FDE's metadata (2 more bits to define this just like
start_addr bitness is defined). Independent probably would be my
preference, with linker (or whoever will be producing .sframe data)
can pick the smallest bitness that is sufficient to represent
everything.
Yes, it's a bit more complicated to draw and explain, but everything
will be nicely aligned, extensible to 64 bits, and (optionally at
least) binary searchable. Implementation-wise on the kernel side it
shouldn't be significantly more involved. Maybe the compiler would
need to be a bit smarter when producing FDE data, but it's no rocket
science.
Thoughts?
On 28.01.2025 02:10, Andrii Nakryiko wrote:
> So, currently in v2, FREs within FDEs use an array-of-structs layout.
> If we use preudo-C type definitions, it would be something like this
> for FDE + its FREs:
>
> struct FDE_and_FREs {
> struct sframe_func_desc_entry fde_metadata;
>
> union FRE {
> struct FRE8 {
> u8 sfre_start_address;
> u8 sfre_info;
> u8|u16|u32 offsets[M];
> }
> struct FRE16 {
> u16 sfre_start_address;
> u16 sfre_info;
> u8|u16|u32 offsets[M];
> }
> struct FRE32 {
> u32 sfre_start_address;
> u32 sfre_info;
> u8|u16|u32 offsets[M];
> }
> } fres[N] __packed;
> };
>
> where all fres[i]s are one of those FRE8/FRE16/FRE32, so start
> addresses have the same size, but each FRE has potentially different
> offsets sizing, so there is no common alignment, and so everything has
> to be packed and unaligned.
Just for clarification of the SFrame V2 format, as there may be some
misconception. Using pseudo-C type definition:
struct sframe_fre8 {
u8 sfre_start_address;
u8 sfre_info;
s8|s16|s32 offsets[M];
};
struct sframe_fre16 {
u16 sfre_start_address;
u8 sfre_info;
s8|s16|s32 offsets[M];
};
struct sframe_fre32 {
u32 sfre_start_address;
u8 sfre_info;
s8|s16|s32 offsets[M];
};
struct sframe_section {
/* Headers. */
struct sframe_preamble preamble;
struct sframe_header header;
struct sframe_auxiliary_header auxhdr;
/* FDEs. */
struct sframe_fde fdes[N_FDE];
/* FRE8s / FRE16s / FRE32s per FDE. */
struct sframe_fre{8|16|32} fres_fde1[N_FRE_FDE1] __packed;
...
struct sframe_fre{8|16|32} fres_fdeN[N_FRE_FDEN] __packed;
};
Where:
- fdes[] can be binary searched: All fdes[i] are of equal size and
sorted on start address.
- Each fdes[i] points at its fres_fdei[].
- fres_fdei[] cannot be binary searched: For each fdes[i] they are
one of those FRE8/FRE16/FRE32, so start addresses have the same
size, but each FRE has potentially different offsets sizing, so
there is no common alignment, and so everything has to be packed
and unaligned.
Regards,
Jens
--
Jens Remus
Linux on Z Development (D3303)
+49-7031-16-1128 Office
jremus@de.ibm.com
IBM
IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
IBM Data Privacy Statement: https://www.ibm.com/privacy/
On Wed, Feb 5, 2025 at 3:02 AM Jens Remus <jremus@linux.ibm.com> wrote:
>
> On 28.01.2025 02:10, Andrii Nakryiko wrote:
>
> > So, currently in v2, FREs within FDEs use an array-of-structs layout.
> > If we use preudo-C type definitions, it would be something like this
> > for FDE + its FREs:
> >
> > struct FDE_and_FREs {
> > struct sframe_func_desc_entry fde_metadata;
> >
> > union FRE {
> > struct FRE8 {
> > u8 sfre_start_address;
> > u8 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > struct FRE16 {
> > u16 sfre_start_address;
> > u16 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > struct FRE32 {
> > u32 sfre_start_address;
> > u32 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > } fres[N] __packed;
> > };
> >
> > where all fres[i]s are one of those FRE8/FRE16/FRE32, so start
> > addresses have the same size, but each FRE has potentially different
> > offsets sizing, so there is no common alignment, and so everything has
> > to be packed and unaligned.
>
> Just for clarification of the SFrame V2 format, as there may be some
> misconception. Using pseudo-C type definition:
>
> struct sframe_fre8 {
> u8 sfre_start_address;
> u8 sfre_info;
> s8|s16|s32 offsets[M];
> };
> struct sframe_fre16 {
> u16 sfre_start_address;
> u8 sfre_info;
> s8|s16|s32 offsets[M];
> };
> struct sframe_fre32 {
> u32 sfre_start_address;
> u8 sfre_info;
> s8|s16|s32 offsets[M];
> };
>
yeah, sorry, copy/paste error for those sfre_info, it's always u8
> struct sframe_section {
> /* Headers. */
> struct sframe_preamble preamble;
> struct sframe_header header;
> struct sframe_auxiliary_header auxhdr;
>
> /* FDEs. */
> struct sframe_fde fdes[N_FDE];
>
> /* FRE8s / FRE16s / FRE32s per FDE. */
> struct sframe_fre{8|16|32} fres_fde1[N_FRE_FDE1] __packed;
> ...
> struct sframe_fre{8|16|32} fres_fdeN[N_FRE_FDEN] __packed;
> };
>
> Where:
> - fdes[] can be binary searched: All fdes[i] are of equal size and
> sorted on start address.
> - Each fdes[i] points at its fres_fdei[].
> - fres_fdei[] cannot be binary searched: For each fdes[i] they are
> one of those FRE8/FRE16/FRE32, so start addresses have the same
> size, but each FRE has potentially different offsets sizing, so
> there is no common alignment, and so everything has to be packed
> and unaligned.
>
yep
> Regards,
> Jens
> --
> Jens Remus
> Linux on Z Development (D3303)
> +49-7031-16-1128 Office
> jremus@de.ibm.com
>
> IBM
>
> IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294
> IBM Data Privacy Statement: https://www.ibm.com/privacy/
>
On 1/27/25 5:10 PM, Andrii Nakryiko wrote:
>>>>> +struct sframe_preamble {
>>>>> + u16 magic;
>>>>> + u8 version;
>>>>> + u8 flags;
>>>>> +} __packed;
>>>>> +
>>>>> +struct sframe_header {
>>>>> + struct sframe_preamble preamble;
>>>>> + u8 abi_arch;
>>>>> + s8 cfa_fixed_fp_offset;
>>>>> + s8 cfa_fixed_ra_offset;
>>>>> + u8 auxhdr_len;
>>>>> + u32 num_fdes;
>>>>> + u32 num_fres;
>>>>> + u32 fre_len;
>>>>> + u32 fdes_off;
>>>>> + u32 fres_off;
>>>>> +} __packed;
>>>>> +
>>>>> +struct sframe_fde {
>>>>> + s32 start_addr;
>>>>> + u32 func_size;
>>>>> + u32 fres_off;
>>>>> + u32 fres_num;
>>>>> + u8 info;
>>>>> + u8 rep_size;
>>>>> + u16 padding;
>>>>> +} __packed;
>>>> I couldn't understand from SFrame itself, but why do sframe_header,
>>>> sframe_preamble, and sframe_fde have to be marked __packed, if it's
>>>> all naturally aligned (intentionally and by design)?..
>>> Right, but the spec says they're all packed. Maybe the point is that
>>> some future sframe version is free to introduce unaligned fields.
>>>
>> SFrame specification aims to keep SFrame header and SFrame FDE members
>> at aligned boundaries in future versions.
>>
>> Only SFrame FRE related accesses may have unaligned accesses.
> Yeah, and it's actually bothering me quite a lot 🙂 I have a tentative
> proposal, maybe we can discuss this for SFrame v3? Let me briefly
> outline the idea.
>
I looked at the idea below. It could work wrt unaligned accesses.
Speaking of unaligned accesses, I will ask away: Is the reason to avoid
unaligned accesses performance hit or are there other practical reasons
to it ?
> So, currently in v2, FREs within FDEs use an array-of-structs layout.
> If we use preudo-C type definitions, it would be something like this
> for FDE + its FREs:
>
> struct FDE_and_FREs {
> struct sframe_func_desc_entry fde_metadata;
>
> union FRE {
> struct FRE8 {
> u8 sfre_start_address;
> u8 sfre_info;
> u8|u16|u32 offsets[M];
> }
> struct FRE16 {
> u16 sfre_start_address;
> u16 sfre_info;
> u8|u16|u32 offsets[M];
> }
> struct FRE32 {
> u32 sfre_start_address;
> u32 sfre_info;
> u8|u16|u32 offsets[M];
> }
> } fres[N] __packed;
> };
>
> where all fres[i]s are one of those FRE8/FRE16/FRE32, so start
> addresses have the same size, but each FRE has potentially different
> offsets sizing, so there is no common alignment, and so everything has
> to be packed and unaligned.
>
> But what if we take a struct-of-arrays approach and represent it more like:
>
> struct FDE_and_FREs {
> struct sframe_func_desc_entry fde_metadata;
> u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
> u8 sfre_infos[N];
> u8 offsets8[M8];
> u16 offsets16[M16] __aligned(2);
> u32 offsets32[M32] __aligned(4);
> /* we can naturally extend to support also u64 offsets */
> };
>
> i.e., we split all FRE records into their three constituents: start
> addresses, info bytes, and then each FRE can fall into either 8-, 16-,
> or 32-bit offsets "bucket". We collect all the offsets, depending on
> their size, into these aligned offsets{8,16,32} arrays (with natural
> extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
> ensure proper alignment everywhere.
>
> Note, at this point we need to decide if we want to make FREs binary
> searchable or not.
>
> If not, we don't really need anything extra. As we process each
> start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
> of how many 8-, 16-, and 32-bit offsets already processed FREs
> consumed, and when we find the right one, we know exactly the starting
> index within offset{8,16,32}. Done.
>
> But if we were to make FREs binary searchable, we need to basically
> have an index of offset pointers to quickly find offsetsX[j] position
> corresponding to FRE #i. For that, we can have an extra array right
> next to start_addrs, "semantically parallel" to it:
>
> u8|u16|u32 start_addrs[N];
> u8|u16|u32 offset_idxs[N];
>
> where start_addrs[i] corresponds to offset_idxs[i], and offset_idxs[i]
> points to the first offset corresponding to FRE #i in offsetX[] array
> (depending on FRE's "bitness"). This is a bit more storage for this
> offset index, but for FDEs with lots of FREs this might be a
> worthwhile tradeoff.
>
> Few points:
> a) we can decide this "binary searchability" per-FDE, and for FDEs
> with 1-2-3 FREs not bother, while those with more FREs would be
> searchable ones with index. So we can combine both fast lookups,
> natural alignment of on-disk format, and compactness. The presence of
> index is just another bit in FDE metadata.
I have been going back and forth on this one. So there seem to be the
following options here:
#1. Make "binary searchability" a per-FDE decision.
#2. Make "binary searchability" a per-section decision (I expect
aarch64 to have very low number of FREs per FDE).
#3. Bake "binary searchability" into the SFrame FRE specification.
So its always ON for all FDEs. The advantage is that it makes stack
tracers simpler to implement with less code.
I do think #2, #3 appear simpler in concept.
> b) bitness of offset_idxs[] can be coupled with bitness of
> start_addrs (for simplicity), or could be completely independent and
> identified by FDE's metadata (2 more bits to define this just like
> start_addr bitness is defined). Independent probably would be my
> preference, with linker (or whoever will be producing .sframe data)
> can pick the smallest bitness that is sufficient to represent
> everything.
>
ATM, GAS does apply special logic to decide the bitness of start_addrs
per function, and ld just uses that info. Coupling the bitness of
offset_idx with bitness of start_addrs will be easy (or _easier_ I
think), but for now, I leave it as "should be doable" :)
> Yes, it's a bit more complicated to draw and explain, but everything
> will be nicely aligned, extensible to 64 bits, and (optionally at
> least) binary searchable. Implementation-wise on the kernel side it
> shouldn't be significantly more involved. Maybe the compiler would
> need to be a bit smarter when producing FDE data, but it's no rocket
> science.
>
> Thoughts?
Combining the requirements from your email and Josh's follow up:
- No unaligned accesses
- Sorted FREs
I would put compaction as a "good to have" requirement. It appears to
me that any compaction will mean a sort of post-processing which will
interfere with JIT usecase.
On Thu, Jan 30, 2025 at 1:21 PM Indu Bhagat <indu.bhagat@oracle.com> wrote:
>
> On 1/27/25 5:10 PM, Andrii Nakryiko wrote:
> >>>>> +struct sframe_preamble {
> >>>>> + u16 magic;
> >>>>> + u8 version;
> >>>>> + u8 flags;
> >>>>> +} __packed;
> >>>>> +
> >>>>> +struct sframe_header {
> >>>>> + struct sframe_preamble preamble;
> >>>>> + u8 abi_arch;
> >>>>> + s8 cfa_fixed_fp_offset;
> >>>>> + s8 cfa_fixed_ra_offset;
> >>>>> + u8 auxhdr_len;
> >>>>> + u32 num_fdes;
> >>>>> + u32 num_fres;
> >>>>> + u32 fre_len;
> >>>>> + u32 fdes_off;
> >>>>> + u32 fres_off;
> >>>>> +} __packed;
> >>>>> +
> >>>>> +struct sframe_fde {
> >>>>> + s32 start_addr;
> >>>>> + u32 func_size;
> >>>>> + u32 fres_off;
> >>>>> + u32 fres_num;
> >>>>> + u8 info;
> >>>>> + u8 rep_size;
> >>>>> + u16 padding;
> >>>>> +} __packed;
> >>>> I couldn't understand from SFrame itself, but why do sframe_header,
> >>>> sframe_preamble, and sframe_fde have to be marked __packed, if it's
> >>>> all naturally aligned (intentionally and by design)?..
> >>> Right, but the spec says they're all packed. Maybe the point is that
> >>> some future sframe version is free to introduce unaligned fields.
> >>>
> >> SFrame specification aims to keep SFrame header and SFrame FDE members
> >> at aligned boundaries in future versions.
> >>
> >> Only SFrame FRE related accesses may have unaligned accesses.
> > Yeah, and it's actually bothering me quite a lot 🙂 I have a tentative
> > proposal, maybe we can discuss this for SFrame v3? Let me briefly
> > outline the idea.
> >
>
> I looked at the idea below. It could work wrt unaligned accesses.
>
> Speaking of unaligned accesses, I will ask away: Is the reason to avoid
> unaligned accesses performance hit or are there other practical reasons
> to it ?
Performance hit on architectures like x86-64 that do support
unaligned, but it's actually a CPU error for some other architectures,
so you'd need to code with that in mind, making local aligned copies,
etc. In general, I'd say it's a bit of a red flag that a format that
is meant to be memory-mapped (effectively) and used without
pre-processing requires dealing with unaligned accesses. So if we can
fix that, that would be a win.
>
> > So, currently in v2, FREs within FDEs use an array-of-structs layout.
> > If we use preudo-C type definitions, it would be something like this
> > for FDE + its FREs:
> >
> > struct FDE_and_FREs {
> > struct sframe_func_desc_entry fde_metadata;
> >
> > union FRE {
> > struct FRE8 {
> > u8 sfre_start_address;
> > u8 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > struct FRE16 {
> > u16 sfre_start_address;
> > u16 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > struct FRE32 {
> > u32 sfre_start_address;
> > u32 sfre_info;
> > u8|u16|u32 offsets[M];
> > }
> > } fres[N] __packed;
> > };
> >
> > where all fres[i]s are one of those FRE8/FRE16/FRE32, so start
> > addresses have the same size, but each FRE has potentially different
> > offsets sizing, so there is no common alignment, and so everything has
> > to be packed and unaligned.
> >
> > But what if we take a struct-of-arrays approach and represent it more like:
> >
> > struct FDE_and_FREs {
> > struct sframe_func_desc_entry fde_metadata;
> > u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
> > u8 sfre_infos[N];
> > u8 offsets8[M8];
> > u16 offsets16[M16] __aligned(2);
> > u32 offsets32[M32] __aligned(4);
> > /* we can naturally extend to support also u64 offsets */
> > };
> >
> > i.e., we split all FRE records into their three constituents: start
> > addresses, info bytes, and then each FRE can fall into either 8-, 16-,
> > or 32-bit offsets "bucket". We collect all the offsets, depending on
> > their size, into these aligned offsets{8,16,32} arrays (with natural
> > extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
> > ensure proper alignment everywhere.
> >
> > Note, at this point we need to decide if we want to make FREs binary
> > searchable or not.
> >
> > If not, we don't really need anything extra. As we process each
> > start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
> > of how many 8-, 16-, and 32-bit offsets already processed FREs
> > consumed, and when we find the right one, we know exactly the starting
> > index within offset{8,16,32}. Done.
> >
> > But if we were to make FREs binary searchable, we need to basically
> > have an index of offset pointers to quickly find offsetsX[j] position
> > corresponding to FRE #i. For that, we can have an extra array right
> > next to start_addrs, "semantically parallel" to it:
> >
> > u8|u16|u32 start_addrs[N];
> > u8|u16|u32 offset_idxs[N];
> >
> > where start_addrs[i] corresponds to offset_idxs[i], and offset_idxs[i]
> > points to the first offset corresponding to FRE #i in offsetX[] array
> > (depending on FRE's "bitness"). This is a bit more storage for this
> > offset index, but for FDEs with lots of FREs this might be a
> > worthwhile tradeoff.
> >
> > Few points:
> > a) we can decide this "binary searchability" per-FDE, and for FDEs
> > with 1-2-3 FREs not bother, while those with more FREs would be
> > searchable ones with index. So we can combine both fast lookups,
> > natural alignment of on-disk format, and compactness. The presence of
> > index is just another bit in FDE metadata.
>
> I have been going back and forth on this one. So there seem to be the
> following options here:
> #1. Make "binary searchability" a per-FDE decision.
> #2. Make "binary searchability" a per-section decision (I expect
> aarch64 to have very low number of FREs per FDE).
> #3. Bake "binary searchability" into the SFrame FRE specification.
> So its always ON for all FDEs. The advantage is that it makes stack
> tracers simpler to implement with less code.
>
> I do think #2, #3 appear simpler in concept.
Whichever makes it easier across the entire stack (compiler, linker,
kernel/unwinder). As long as binary searchability is possible,
especially for FDEs with lots of FREs. Making it per-FDE just allows
to pick most compact (but still with good performance) representation.
>
> > b) bitness of offset_idxs[] can be coupled with bitness of
> > start_addrs (for simplicity), or could be completely independent and
> > identified by FDE's metadata (2 more bits to define this just like
> > start_addr bitness is defined). Independent probably would be my
> > preference, with linker (or whoever will be producing .sframe data)
> > can pick the smallest bitness that is sufficient to represent
> > everything.
> >
>
> ATM, GAS does apply special logic to decide the bitness of start_addrs
> per function, and ld just uses that info. Coupling the bitness of
> offset_idx with bitness of start_addrs will be easy (or _easier_ I
> think), but for now, I leave it as "should be doable" :)
Those offsets are relative to fde's start_addr, right? So, generally
speaking, should be usually small? I my understanding is correct, then
yeah, coupling is probably ok.
>
> > Yes, it's a bit more complicated to draw and explain, but everything
> > will be nicely aligned, extensible to 64 bits, and (optionally at
> > least) binary searchable. Implementation-wise on the kernel side it
> > shouldn't be significantly more involved. Maybe the compiler would
> > need to be a bit smarter when producing FDE data, but it's no rocket
> > science.
> >
> > Thoughts?
>
> Combining the requirements from your email and Josh's follow up:
> - No unaligned accesses
> - Sorted FREs
>
> I would put compaction as a "good to have" requirement. It appears to
> me that any compaction will mean a sort of post-processing which will
> interfere with JIT usecase.
>
sgtm
On Thu, Jan 30, 2025 at 01:21:21PM -0800, Indu Bhagat wrote: > > Yeah, and it's actually bothering me quite a lot 🙂 I have a tentative > > proposal, maybe we can discuss this for SFrame v3? Let me briefly > > outline the idea. > > > > I looked at the idea below. It could work wrt unaligned accesses. > > Speaking of unaligned accesses, I will ask away: Is the reason to avoid > unaligned accesses performance hit or are there other practical reasons to > it ? I think performance is the main concern, though there are still some CPU arches out there which don't support unaligned accesses. > Combining the requirements from your email and Josh's follow up: > - No unaligned accesses > - Sorted FREs > > I would put compaction as a "good to have" requirement. It appears to me > that any compaction will mean a sort of post-processing which will interfere > with JIT usecase. I think we should still consider the fast lookup table. We might want to prototype something just to see what the speedup looks like. Similar to compaction it could just be an optional feature implemented by the linker which JIT doesn't have to use. -- Josh
On Mon, Jan 27, 2025 at 05:10:27PM -0800, Andrii Nakryiko wrote:
> > Yes, in theory, it is allowed (as per the specification) to have an
> > SFrame section with zero number of FDEs/FREs. But since such a section
> > will not be useful, I share the opinion that it makes sense to disallow
> > it in the current unwinding contexts, for now (JIT usecase may change
> > things later).
> >
>
> I disagree, actually. If it's a legal thing, it shouldn't be randomly
> rejected. If we later make use of that, we'd have to worry not to
> accidentally cause problems on older kernels that arbitrarily rejected
> empty FDE just because it didn't make sense at some point (without
> causing any issues).
If such older kernels don't do anything with the section anyway, what's
the point of pretending they do?
Returning an error would actually make more sense as it communicates
that the kernel doesn't support whatever hypothetical thing you're
trying to do with 0 FDEs.
> > SFRAME_F_FRAME_POINTER flag is not being set currently by GAS/GNU ld at all.
> >
> > >>> + dbg("no fde/fre entries\n");
> > >>> + return -EINVAL;
> > >>> + }
> > >>> +
> > >>> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> > >>> + if (header_end >= sec->sframe_end) {
> > >>
> > >> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
> > >
> > > I suppose so, but again I'm not seeing any reason to support that.
>
> Let's invert this. Is there any reason why it shouldn't be supported? ;)
It's simple, we don't add code to "support" some vague hypothetical.
For whatever definition of "support", since there's literally nothing
the kernel can do with that.
> But what if we take a struct-of-arrays approach and represent it more like:
>
> struct FDE_and_FREs {
> struct sframe_func_desc_entry fde_metadata;
> u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
> u8 sfre_infos[N];
> u8 offsets8[M8];
> u16 offsets16[M16] __aligned(2);
> u32 offsets32[M32] __aligned(4);
> /* we can naturally extend to support also u64 offsets */
> };
>
> i.e., we split all FRE records into their three constituents: start
> addresses, info bytes, and then each FRE can fall into either 8-, 16-,
> or 32-bit offsets "bucket". We collect all the offsets, depending on
> their size, into these aligned offsets{8,16,32} arrays (with natural
> extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
> ensure proper alignment everywhere.
Makes sense. Though I also have another idea below.
> Note, at this point we need to decide if we want to make FREs binary
> searchable or not.
>
> If not, we don't really need anything extra. As we process each
> start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
> of how many 8-, 16-, and 32-bit offsets already processed FREs
> consumed, and when we find the right one, we know exactly the starting
> index within offset{8,16,32}. Done.
>
> But if we were to make FREs binary searchable, we need to basically
> have an index of offset pointers to quickly find offsetsX[j] position
> corresponding to FRE #i. For that, we can have an extra array right
> next to start_addrs, "semantically parallel" to it:
>
> u8|u16|u32 start_addrs[N];
> u8|u16|u32 offset_idxs[N];
Binary search would definitely help. I did a crude histogram for "FREs
per FDE" for a few binaries on my test system:
gdb (the biggest binary on my test system):
10th Percentile: 1
20th Percentile: 1
30th Percentile: 1
40th Percentile: 3
50th Percentile: 5
60th Percentile: 8
70th Percentile: 11
80th Percentile: 14
90th Percentile: 16
100th Percentile: 472
bash:
10th Percentile: 1
20th Percentile: 1
30th Percentile: 3
40th Percentile: 5
50th Percentile: 7
60th Percentile: 9
70th Percentile: 12
80th Percentile: 16
90th Percentile: 17
100th Percentile: 46
glibc:
10th Percentile: 1
20th Percentile: 1
30th Percentile: 1
40th Percentile: 1
50th Percentile: 4
60th Percentile: 6
70th Percentile: 9
80th Percentile: 14
90th Percentile: 16
100th Percentile: 44
libpython:
10th Percentile: 1
20th Percentile: 3
30th Percentile: 4
40th Percentile: 6
50th Percentile: 8
60th Percentile: 11
70th Percentile: 12
80th Percentile: 16
90th Percentile: 20
100th Percentile: 112
So binary search would help in a lot of cases.
However, if we're going that route, we might want to even consider a
completely revamped data layout. For example:
One insight is that the vast majority of (cfa, fp, ra) tuples aren't
unique. They could be deduped by storing the unique tuples in a
standalone 'fre_data' array which is referenced by another
address-specific array.
struct fre_data {
s8|s16|s32 cfa, fp, ra;
u8 info;
};
struct fre_data fre_data[num_fre_data];
The storage sizes of cfa/fp/ra can be a constant specified in the global
sframe header. It's fine all being the same size as it looks like this
array wouldn't typically be more than a few hundred entries anyway.
Then there would be an array of sorted FRE entries which reference the
fre_data[] entries:
struct fre {
s32|s64 start_address;
u8|u16 fre_data_idx;
} __packed;
struct fre fres[num_fres];
(For alignment reasons that should probably be two separate arrays, even
though not ideal for cache locality)
Here again the field storage sizes would be specified in the global
sframe header.
Note FDEs aren't even needed here as the unwinder doesn't need to know
when a function begins/ends. The only info needed by the unwinder is
just the fre_data struct. So a simple binary search of fres[] is all
that's really needed.
But wait, there's more...
The binary search could be made significantly faster using a small fast
lookup array which is indexed evenly across the text address offset
space, similar to what ORC does:
u32 fre_chunks[num_chunks];
The text address space (starting at offset 0) can be split into
'num_chunks' chunks of size 'chunk_size'. The value of
fre_chunks[offset/chunk_size] is an index into the fres[] array.
Taking my gdb binary as an example:
.text is 6417058 bytes, with 146997 total sframe FREs. Assuming a chunk
size of 1024, fre_chunks[] needs 6417058/1024 = 6267 entries.
For unwinding at text offset 0x400000, the index into fre_chunks[] would
be 0x400000/1024 = 4096. If fre_chunks[4096] == 96074 and
fre_chunks[4096+1] == 96098, you need only do a binary search of the 24
entries between fres[96074] && fres[96098] rather than searching the
entire 146997 byte array.
.sframe size calculation:
374 unique fre_data entries (out of 146997 total FREs!)
= 374 * (2 * 3) = 2244 bytes
146997 fre entries
= 146997 * (4 + 2) = 881982 bytes
.text size 6417058 (chunk_size = 1024, num_chunks=6267)
= 6267 * 8 = 43869 bytes
Approximate total .sframe size would be 2244 + 881982 + 8192 = 906k,
plus negligible header size. Which is smaller than the v2 .sframe on my
gdb binary (985k).
With the chunked lookup table, the avg lookup is:
log2(146997/6267) = ~4.5 iterations
whereas a full binary search would be:
log2(146997) = 17 iterations
So assuming I got all that math right, it's over 3x faster and the
binary is smaller (or at least should be roughly comparable).
Of course the downside is it's an all new format. Presumably the linker
would need to do more work than it's currently doing, e.g., find all the
duplicates and set up the data accordingly.
--
Josh
On 29.01.2025 03:02, Josh Poimboeuf wrote: > Note FDEs aren't even needed here as the unwinder doesn't need to know > when a function begins/ends. The only info needed by the unwinder is > just the fre_data struct. So a simple binary search of fres[] is all > that's really needed. In SFrame V2 FDEs specify ranges bound by function start address and length. FREs in contrast specify open ranges bounded by start address. Their effect ends either with the next FRE becoming into effect or when their FDE range ends. This concept enables holes in the .text section which do not have any valid FDE/FRE information associated. Your proposal lacks some sort of mechanism to replicate those holes. It could be FDEs with a flag (or no offsets?) that specifies their range has no valid information. Regards, Jens -- Jens Remus Linux on Z Development (D3303) +49-7031-16-1128 Office jremus@de.ibm.com IBM IBM Deutschland Research & Development GmbH; Vorsitzender des Aufsichtsrats: Wolfgang Wendt; Geschäftsführung: David Faller; Sitz der Gesellschaft: Böblingen; Registergericht: Amtsgericht Stuttgart, HRB 243294 IBM Data Privacy Statement: https://www.ibm.com/privacy/
On Wed, Feb 05, 2025 at 02:56:36PM +0100, Jens Remus wrote: > On 29.01.2025 03:02, Josh Poimboeuf wrote: > > > Note FDEs aren't even needed here as the unwinder doesn't need to know > > when a function begins/ends. The only info needed by the unwinder is > > just the fre_data struct. So a simple binary search of fres[] is all > > that's really needed. > > In SFrame V2 FDEs specify ranges bound by function start address and > length. FREs in contrast specify open ranges bounded by start address. > Their effect ends either with the next FRE becoming into effect or when > their FDE range ends. > This concept enables holes in the .text section which do not have any > valid FDE/FRE information associated. > > Your proposal lacks some sort of mechanism to replicate those holes. > It could be FDEs with a flag (or no offsets?) that specifies their > range has no valid information. In ORC, a hole is simply specified by an ORC entry (aka "FRE") of type UNDEFINED. That could be done here as well: the linker would replace a gap with an "undefined" FRE which could be identified either by setting a bit in the FRE header, or by using a special fre_data[] entry. For example, fre_data index 0 could just be a placeholder for the undefined FRE type. -- Josh
On 1/28/25 6:02 PM, Josh Poimboeuf wrote:
> On Mon, Jan 27, 2025 at 05:10:27PM -0800, Andrii Nakryiko wrote:
>>> Yes, in theory, it is allowed (as per the specification) to have an
>>> SFrame section with zero number of FDEs/FREs. But since such a section
>>> will not be useful, I share the opinion that it makes sense to disallow
>>> it in the current unwinding contexts, for now (JIT usecase may change
>>> things later).
>>>
>>
>> I disagree, actually. If it's a legal thing, it shouldn't be randomly
>> rejected. If we later make use of that, we'd have to worry not to
>> accidentally cause problems on older kernels that arbitrarily rejected
>> empty FDE just because it didn't make sense at some point (without
>> causing any issues).
>
> If such older kernels don't do anything with the section anyway, what's
> the point of pretending they do?
>
> Returning an error would actually make more sense as it communicates
> that the kernel doesn't support whatever hypothetical thing you're
> trying to do with 0 FDEs.
>
>>> SFRAME_F_FRAME_POINTER flag is not being set currently by GAS/GNU ld at all.
>>>
>>>>>> + dbg("no fde/fre entries\n");
>>>>>> + return -EINVAL;
>>>>>> + }
>>>>>> +
>>>>>> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
>>>>>> + if (header_end >= sec->sframe_end) {
>>>>>
>>>>> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
>>>>
>>>> I suppose so, but again I'm not seeing any reason to support that.
>>
>> Let's invert this. Is there any reason why it shouldn't be supported? ;)
>
> It's simple, we don't add code to "support" some vague hypothetical.
>
> For whatever definition of "support", since there's literally nothing
> the kernel can do with that.
>
>> But what if we take a struct-of-arrays approach and represent it more like:
>>
>> struct FDE_and_FREs {
>> struct sframe_func_desc_entry fde_metadata;
>> u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
>> u8 sfre_infos[N];
>> u8 offsets8[M8];
>> u16 offsets16[M16] __aligned(2);
>> u32 offsets32[M32] __aligned(4);
>> /* we can naturally extend to support also u64 offsets */
>> };
>>
>> i.e., we split all FRE records into their three constituents: start
>> addresses, info bytes, and then each FRE can fall into either 8-, 16-,
>> or 32-bit offsets "bucket". We collect all the offsets, depending on
>> their size, into these aligned offsets{8,16,32} arrays (with natural
>> extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
>> ensure proper alignment everywhere.
>
> Makes sense. Though I also have another idea below.
>
>> Note, at this point we need to decide if we want to make FREs binary
>> searchable or not.
>>
>> If not, we don't really need anything extra. As we process each
>> start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
>> of how many 8-, 16-, and 32-bit offsets already processed FREs
>> consumed, and when we find the right one, we know exactly the starting
>> index within offset{8,16,32}. Done.
>>
>> But if we were to make FREs binary searchable, we need to basically
>> have an index of offset pointers to quickly find offsetsX[j] position
>> corresponding to FRE #i. For that, we can have an extra array right
>> next to start_addrs, "semantically parallel" to it:
>>
>> u8|u16|u32 start_addrs[N];
>> u8|u16|u32 offset_idxs[N];
>
> Binary search would definitely help. I did a crude histogram for "FREs
> per FDE" for a few binaries on my test system:
>
> gdb (the biggest binary on my test system):
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 1
> 40th Percentile: 3
> 50th Percentile: 5
> 60th Percentile: 8
> 70th Percentile: 11
> 80th Percentile: 14
> 90th Percentile: 16
> 100th Percentile: 472
>
> bash:
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 3
> 40th Percentile: 5
> 50th Percentile: 7
> 60th Percentile: 9
> 70th Percentile: 12
> 80th Percentile: 16
> 90th Percentile: 17
> 100th Percentile: 46
>
> glibc:
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 1
> 40th Percentile: 1
> 50th Percentile: 4
> 60th Percentile: 6
> 70th Percentile: 9
> 80th Percentile: 14
> 90th Percentile: 16
> 100th Percentile: 44
>
> libpython:
>
> 10th Percentile: 1
> 20th Percentile: 3
> 30th Percentile: 4
> 40th Percentile: 6
> 50th Percentile: 8
> 60th Percentile: 11
> 70th Percentile: 12
> 80th Percentile: 16
> 90th Percentile: 20
> 100th Percentile: 112
>
> So binary search would help in a lot of cases.
>
Thanks for gathering this.
I suspect on aarch64, the numbers will be very different (leaning
towards very low number of FREs per FDE).
Making SFrame FREs amenable to binary search can be targeted. Both your
and Andrii's proposal do address that...
> However, if we're going that route, we might want to even consider a
> completely revamped data layout. For example:
>
> One insight is that the vast majority of (cfa, fp, ra) tuples aren't
> unique. They could be deduped by storing the unique tuples in a
> standalone 'fre_data' array which is referenced by another
> address-specific array.
>
> struct fre_data {
> s8|s16|s32 cfa, fp, ra;
> u8 info;
> };
> struct fre_data fre_data[num_fre_data];
>
We had the same observation at the time of SFrame V1. And this method
of compaction (deduped tuples) was brain-stormed a bit. Back then, the
costs were thought to be:
- more work at build time.
- an additional data access once the FRE is found (as there is
indirection).
So it was really compaction at the costs above. We did steer towards
simplicity and the SFrame FRE is what it stands today.
The difference in the pros and cons now from then:
- pros: helps mitigate unaligned accesses
- cons: interferes slightly with the design goal of efficient
addition and removal of stack trace information per function for JIT.
Think "removal" as the set of actions necessary for addressing
fragmentation in SFrame section data in JIT usecase.
> The storage sizes of cfa/fp/ra can be a constant specified in the global
> sframe header. It's fine all being the same size as it looks like this
> array wouldn't typically be more than a few hundred entries anyway.
>
> Then there would be an array of sorted FRE entries which reference the
> fre_data[] entries:
>
> struct fre {
> s32|s64 start_address;
> u8|u16 fre_data_idx;
>
> } __packed;
> struct fre fres[num_fres];
>
> (For alignment reasons that should probably be two separate arrays, even
> though not ideal for cache locality)
>
> Here again the field storage sizes would be specified in the global
> sframe header.
>
> Note FDEs aren't even needed here as the unwinder doesn't need to know
> when a function begins/ends. The only info needed by the unwinder is
> just the fre_data struct. So a simple binary search of fres[] is all
> that's really needed.
>
Splitting out information (start_address) to an FDE (as done in V1/V2)
has the benefit that a job like relocating information is proportional
to O(NumFunctions).
In the case above, IIUC, where the proposal puts start_address in the
FRE, these costs will be (much) higher.
In addition, not being able to identify stack trace information per
function will affect the JIT usecase. We need to able to mark stack
trace information stale for functions in JIT environment.
I think the first conceptual landing point in the information layout
should be a per-function entry.
> But wait, there's more...
>
> The binary search could be made significantly faster using a small fast
> lookup array which is indexed evenly across the text address offset
> space, similar to what ORC does:
>
> u32 fre_chunks[num_chunks];
>
> The text address space (starting at offset 0) can be split into
> 'num_chunks' chunks of size 'chunk_size'. The value of
> fre_chunks[offset/chunk_size] is an index into the fres[] array.
>
> Taking my gdb binary as an example:
>
> .text is 6417058 bytes, with 146997 total sframe FREs. Assuming a chunk
> size of 1024, fre_chunks[] needs 6417058/1024 = 6267 entries.
>
> For unwinding at text offset 0x400000, the index into fre_chunks[] would
> be 0x400000/1024 = 4096. If fre_chunks[4096] == 96074 and
> fre_chunks[4096+1] == 96098, you need only do a binary search of the 24
> entries between fres[96074] && fres[96098] rather than searching the
> entire 146997 byte array.
>
> .sframe size calculation:
>
> 374 unique fre_data entries (out of 146997 total FREs!)
> = 374 * (2 * 3) = 2244 bytes
>
> 146997 fre entries
> = 146997 * (4 + 2) = 881982 bytes
>
> .text size 6417058 (chunk_size = 1024, num_chunks=6267)
> = 6267 * 8 = 43869 bytes
>
> Approximate total .sframe size would be 2244 + 881982 + 8192 = 906k,
> plus negligible header size. Which is smaller than the v2 .sframe on my
> gdb binary (985k).
>
> With the chunked lookup table, the avg lookup is:
>
> log2(146997/6267) = ~4.5 iterations
>
> whereas a full binary search would be:
>
> log2(146997) = 17 iterations
>
> So assuming I got all that math right, it's over 3x faster and the
> binary is smaller (or at least should be roughly comparable).
>
> Of course the downside is it's an all new format. Presumably the linker
> would need to do more work than it's currently doing, e.g., find all the
> duplicates and set up the data accordingly.
>
On Thu, Jan 30, 2025 at 01:39:52PM -0800, Indu Bhagat wrote:
> On 1/28/25 6:02 PM, Josh Poimboeuf wrote:
> > However, if we're going that route, we might want to even consider a
> > completely revamped data layout. For example:
> >
> > One insight is that the vast majority of (cfa, fp, ra) tuples aren't
> > unique. They could be deduped by storing the unique tuples in a
> > standalone 'fre_data' array which is referenced by another
> > address-specific array.
> >
> > struct fre_data {
> > s8|s16|s32 cfa, fp, ra;
> > u8 info;
> > };
> > struct fre_data fre_data[num_fre_data];
> >
>
> We had the same observation at the time of SFrame V1. And this method of
> compaction (deduped tuples) was brain-stormed a bit. Back then, the costs
> were thought to be:
> - more work at build time.
> - an additional data access once the FRE is found (as there is
> indirection).
>
> So it was really compaction at the costs above. We did steer towards
> simplicity and the SFrame FRE is what it stands today.
>
> The difference in the pros and cons now from then:
> - pros: helps mitigate unaligned accesses
> - cons: interferes slightly with the design goal of efficient addition and
> removal of stack trace information per function for JIT. Think "removal" as
> the set of actions necessary for addressing fragmentation in SFrame section
> data in JIT usecase.
If fre_data[] is allowed to have duplicates then the deduping could be
optional.
> > Note FDEs aren't even needed here as the unwinder doesn't need to know
> > when a function begins/ends. The only info needed by the unwinder is
> > just the fre_data struct. So a simple binary search of fres[] is all
> > that's really needed.
>
> Splitting out information (start_address) to an FDE (as done in V1/V2) has
> the benefit that a job like relocating information is proportional to
> O(NumFunctions).
>
> In the case above, IIUC, where the proposal puts start_address in the FRE,
> these costs will be (much) higher.
I'm not sure I follow, is this referring to the link-time work of
sorting things?
> In addition, not being able to identify stack trace information per function
> will affect the JIT usecase. We need to able to mark stack trace
> information stale for functions in JIT environment.
Maybe, though it's hard to really say how any of these changes would
affect JIT without knowing what those interfaces are going to look like.
--
Josh
On 2/4/25 4:57 PM, Josh Poimboeuf wrote:
> On Thu, Jan 30, 2025 at 01:39:52PM -0800, Indu Bhagat wrote:
>> On 1/28/25 6:02 PM, Josh Poimboeuf wrote:
>>> However, if we're going that route, we might want to even consider a
>>> completely revamped data layout. For example:
>>>
>>> One insight is that the vast majority of (cfa, fp, ra) tuples aren't
>>> unique. They could be deduped by storing the unique tuples in a
>>> standalone 'fre_data' array which is referenced by another
>>> address-specific array.
>>>
>>> struct fre_data {
>>> s8|s16|s32 cfa, fp, ra;
>>> u8 info;
>>> };
>>> struct fre_data fre_data[num_fre_data];
>>>
>>
>> We had the same observation at the time of SFrame V1. And this method of
>> compaction (deduped tuples) was brain-stormed a bit. Back then, the costs
>> were thought to be:
>> - more work at build time.
>> - an additional data access once the FRE is found (as there is
>> indirection).
>>
>> So it was really compaction at the costs above. We did steer towards
>> simplicity and the SFrame FRE is what it stands today.
>>
>> The difference in the pros and cons now from then:
>> - pros: helps mitigate unaligned accesses
>> - cons: interferes slightly with the design goal of efficient addition and
>> removal of stack trace information per function for JIT. Think "removal" as
>> the set of actions necessary for addressing fragmentation in SFrame section
>> data in JIT usecase.
>
> If fre_data[] is allowed to have duplicates then the deduping could be
> optional.
>
>>> Note FDEs aren't even needed here as the unwinder doesn't need to know
>>> when a function begins/ends. The only info needed by the unwinder is
>>> just the fre_data struct. So a simple binary search of fres[] is all
>>> that's really needed.
>>
>> Splitting out information (start_address) to an FDE (as done in V1/V2) has
>> the benefit that a job like relocating information is proportional to
>> O(NumFunctions).
>>
>> In the case above, IIUC, where the proposal puts start_address in the FRE,
>> these costs will be (much) higher.
>
> I'm not sure I follow, is this referring to the link-time work of
> sorting things?
>
I meant the work of tracking the start address of each function. This
could be done at link-time as is done in most cases.
But also depending on the case : e.g., kernel module loader will need to
apply these relocations in the .rela.sframe section...
If the granularity is finer than a function, more number of relocations
will need to be applied.
>> In addition, not being able to identify stack trace information per function
>> will affect the JIT usecase. We need to able to mark stack trace
>> information stale for functions in JIT environment.
>
> Maybe, though it's hard to really say how any of these changes would
> affect JIT without knowing what those interfaces are going to look like.
>
On Tue, Jan 28, 2025 at 6:02 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> On Mon, Jan 27, 2025 at 05:10:27PM -0800, Andrii Nakryiko wrote:
> > > Yes, in theory, it is allowed (as per the specification) to have an
> > > SFrame section with zero number of FDEs/FREs. But since such a section
> > > will not be useful, I share the opinion that it makes sense to disallow
> > > it in the current unwinding contexts, for now (JIT usecase may change
> > > things later).
> > >
> >
> > I disagree, actually. If it's a legal thing, it shouldn't be randomly
> > rejected. If we later make use of that, we'd have to worry not to
> > accidentally cause problems on older kernels that arbitrarily rejected
> > empty FDE just because it didn't make sense at some point (without
> > causing any issues).
>
> If such older kernels don't do anything with the section anyway, what's
> the point of pretending they do?
>
> Returning an error would actually make more sense as it communicates
> that the kernel doesn't support whatever hypothetical thing you're
> trying to do with 0 FDEs.
>
> > > SFRAME_F_FRAME_POINTER flag is not being set currently by GAS/GNU ld at all.
> > >
> > > >>> + dbg("no fde/fre entries\n");
> > > >>> + return -EINVAL;
> > > >>> + }
> > > >>> +
> > > >>> + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> > > >>> + if (header_end >= sec->sframe_end) {
> > > >>
> > > >> if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
> > > >
> > > > I suppose so, but again I'm not seeing any reason to support that.
> >
> > Let's invert this. Is there any reason why it shouldn't be supported? ;)
>
> It's simple, we don't add code to "support" some vague hypothetical.
>
> For whatever definition of "support", since there's literally nothing
> the kernel can do with that.
If under the format specification it's legal, there is no reason for
the kernel to proactively reject it, IMO. But ok, whatever.
>
> > But what if we take a struct-of-arrays approach and represent it more like:
> >
> > struct FDE_and_FREs {
> > struct sframe_func_desc_entry fde_metadata;
> > u8|u16|u32 start_addrs[N]; /* can extend to u64 as well */
> > u8 sfre_infos[N];
> > u8 offsets8[M8];
> > u16 offsets16[M16] __aligned(2);
> > u32 offsets32[M32] __aligned(4);
> > /* we can naturally extend to support also u64 offsets */
> > };
> >
> > i.e., we split all FRE records into their three constituents: start
> > addresses, info bytes, and then each FRE can fall into either 8-, 16-,
> > or 32-bit offsets "bucket". We collect all the offsets, depending on
> > their size, into these aligned offsets{8,16,32} arrays (with natural
> > extension to 64 bits, if necessary), with at most wasting 1-3 bytes to
> > ensure proper alignment everywhere.
>
> Makes sense. Though I also have another idea below.
>
> > Note, at this point we need to decide if we want to make FREs binary
> > searchable or not.
> >
> > If not, we don't really need anything extra. As we process each
> > start_addrs[i] and sfre_infos[i] to find matching FRE, we keep track
> > of how many 8-, 16-, and 32-bit offsets already processed FREs
> > consumed, and when we find the right one, we know exactly the starting
> > index within offset{8,16,32}. Done.
> >
> > But if we were to make FREs binary searchable, we need to basically
> > have an index of offset pointers to quickly find offsetsX[j] position
> > corresponding to FRE #i. For that, we can have an extra array right
> > next to start_addrs, "semantically parallel" to it:
> >
> > u8|u16|u32 start_addrs[N];
> > u8|u16|u32 offset_idxs[N];
>
> Binary search would definitely help. I did a crude histogram for "FREs
> per FDE" for a few binaries on my test system:
>
> gdb (the biggest binary on my test system):
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 1
> 40th Percentile: 3
> 50th Percentile: 5
> 60th Percentile: 8
> 70th Percentile: 11
> 80th Percentile: 14
> 90th Percentile: 16
> 100th Percentile: 472
>
> bash:
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 3
> 40th Percentile: 5
> 50th Percentile: 7
> 60th Percentile: 9
> 70th Percentile: 12
> 80th Percentile: 16
> 90th Percentile: 17
> 100th Percentile: 46
>
> glibc:
>
> 10th Percentile: 1
> 20th Percentile: 1
> 30th Percentile: 1
> 40th Percentile: 1
> 50th Percentile: 4
> 60th Percentile: 6
> 70th Percentile: 9
> 80th Percentile: 14
> 90th Percentile: 16
> 100th Percentile: 44
>
> libpython:
>
> 10th Percentile: 1
> 20th Percentile: 3
> 30th Percentile: 4
> 40th Percentile: 6
> 50th Percentile: 8
> 60th Percentile: 11
> 70th Percentile: 12
> 80th Percentile: 16
> 90th Percentile: 20
> 100th Percentile: 112
>
> So binary search would help in a lot of cases.
yep, agreed, seems like a worthwhile thing to support, given the stats
(I suspect big production C++ applications might be even worse in this
regard)
>
> However, if we're going that route, we might want to even consider a
> completely revamped data layout. For example:
>
> One insight is that the vast majority of (cfa, fp, ra) tuples aren't
> unique. They could be deduped by storing the unique tuples in a
> standalone 'fre_data' array which is referenced by another
> address-specific array.
>
> struct fre_data {
> s8|s16|s32 cfa, fp, ra;
> u8 info;
> };
> struct fre_data fre_data[num_fre_data];
>
> The storage sizes of cfa/fp/ra can be a constant specified in the global
> sframe header. It's fine all being the same size as it looks like this
> array wouldn't typically be more than a few hundred entries anyway.
>
> Then there would be an array of sorted FRE entries which reference the
> fre_data[] entries:
>
> struct fre {
> s32|s64 start_address;
> u8|u16 fre_data_idx;
even u16 seems dangerous, I'd use u32, not sure it's worth limiting
the format just to 64K unique combinations
>
> } __packed;
> struct fre fres[num_fres];
>
> (For alignment reasons that should probably be two separate arrays, even
> though not ideal for cache locality)
>
> Here again the field storage sizes would be specified in the global
> sframe header.
>
> Note FDEs aren't even needed here as the unwinder doesn't need to know
> when a function begins/ends. The only info needed by the unwinder is
> just the fre_data struct. So a simple binary search of fres[] is all
> that's really needed.
>
> But wait, there's more...
>
> The binary search could be made significantly faster using a small fast
> lookup array which is indexed evenly across the text address offset
> space, similar to what ORC does:
>
> u32 fre_chunks[num_chunks];
>
> The text address space (starting at offset 0) can be split into
> 'num_chunks' chunks of size 'chunk_size'. The value of
> fre_chunks[offset/chunk_size] is an index into the fres[] array.
>
> Taking my gdb binary as an example:
>
> .text is 6417058 bytes, with 146997 total sframe FREs. Assuming a chunk
> size of 1024, fre_chunks[] needs 6417058/1024 = 6267 entries.
>
> For unwinding at text offset 0x400000, the index into fre_chunks[] would
> be 0x400000/1024 = 4096. If fre_chunks[4096] == 96074 and
> fre_chunks[4096+1] == 96098, you need only do a binary search of the 24
> entries between fres[96074] && fres[96098] rather than searching the
> entire 146997 byte array.
>
> .sframe size calculation:
>
> 374 unique fre_data entries (out of 146997 total FREs!)
> = 374 * (2 * 3) = 2244 bytes
>
> 146997 fre entries
> = 146997 * (4 + 2) = 881982 bytes
>
> .text size 6417058 (chunk_size = 1024, num_chunks=6267)
> = 6267 * 8 = 43869 bytes
>
> Approximate total .sframe size would be 2244 + 881982 + 8192 = 906k,
> plus negligible header size. Which is smaller than the v2 .sframe on my
> gdb binary (985k).
>
> With the chunked lookup table, the avg lookup is:
>
> log2(146997/6267) = ~4.5 iterations
>
> whereas a full binary search would be:
>
> log2(146997) = 17 iterations
>
> So assuming I got all that math right, it's over 3x faster and the
> binary is smaller (or at least should be roughly comparable).
I'm not sure about this chunked lookup approach for arbitrary user
space applications. Those executable sections can be a) big and b)
discontiguous. E.g., one of the production binaries I looked at. Here
are its three main executable sections:
...
[17] .bolt.org.text PROGBITS 000000000b00e640 0ae0d640
0000000011ad621c 0000000000000000 AX 0 0 64
...
[48] .text PROGBITS 000000001e600000 1ce00000
0000000000775dd8 0000000000000000 AX 0 0 2097152
[49] .text.cold PROGBITS 000000001ed75e00 1d575e00
00000000007d3271 0000000000000000 AX 0 0 64
...
Total text size is about 300MB:
>>> 0x0000000000775dd8 + 0x00000000007d3271 + 0x0000000011ad621c
312603237
Section #17 ends at:
>>> hex(0x0000000011ad621c + 0x000000000b00e640)
'0x1cae485c'
While .text starts at 000000001e600000, so we have a gap of ~28MB:
>>> 0x000000001e600000 - 0x1cae485c
28424100
So unless we do something more clever to support multiple
discontiguous chunks, this seems like a bad fit for user space.
I think having all this just binary searchable is already a big win
anyways and should be plenty fast, no?
>
> Of course the downside is it's an all new format. Presumably the linker
> would need to do more work than it's currently doing, e.g., find all the
> duplicates and set up the data accordingly.
I do like the idea of deduplicating those records and just indexing
them, as in practice this should probably be much more compact. So it
might be worth looking into this some more. I'll defer to Indu.
>
> --
> Josh
On Wed, Jan 29, 2025 at 04:02:34PM -0800, Andrii Nakryiko wrote: > On Tue, Jan 28, 2025 at 6:02 PM Josh Poimboeuf <jpoimboe@kernel.org> wrote: > I'm not sure about this chunked lookup approach for arbitrary user > space applications. Those executable sections can be a) big and b) > discontiguous. E.g., one of the production binaries I looked at. Here > are its three main executable sections: > > ... > [17] .bolt.org.text PROGBITS 000000000b00e640 0ae0d640 > 0000000011ad621c 0000000000000000 AX 0 0 64 > ... > [48] .text PROGBITS 000000001e600000 1ce00000 > 0000000000775dd8 0000000000000000 AX 0 0 2097152 > [49] .text.cold PROGBITS 000000001ed75e00 1d575e00 > 00000000007d3271 0000000000000000 AX 0 0 64 > ... > > Total text size is about 300MB: > >>> 0x0000000000775dd8 + 0x00000000007d3271 + 0x0000000011ad621c > 312603237 > > Section #17 ends at: > > >>> hex(0x0000000011ad621c + 0x000000000b00e640) > '0x1cae485c' > > While .text starts at 000000001e600000, so we have a gap of ~28MB: > > >>> 0x000000001e600000 - 0x1cae485c > 28424100 > > So unless we do something more clever to support multiple > discontiguous chunks, this seems like a bad fit for user space. Nothing clever needed, we could just have multiple sframe sections, each one with a pointer to its text segment. That would also have the benefit of allowing the sframe data to be much more compact for the noncontiguous cases. > I think having all this just binary searchable is already a big win > anyways and should be plenty fast, no? Sframe is trying to compete with frame pointers which are MUCH faster. 3-4x faster in my testing, not including the page faults (which tend to only affect performance in the very beginning). -- Josh
On Fri, 24 Jan 2025 11:21:59 -0800
Josh Poimboeuf <jpoimboe@kernel.org> wrote:
> > given SFRAME_F_FRAME_POINTER in the header, is it really that
> > nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
> > that?
>
> It would seem a bit silly to create an empty .sframe section just to set
> that SFRAME_F_FRAME_POINTER bit. Regardless, there's nothing the kernel
> can do with that.
>
> > > + dbg("no fde/fre entries\n");
> > > + return -EINVAL;
> > > + }
> > > +
> > > + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> > > + if (header_end >= sec->sframe_end) {
> >
> > if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
>
> I suppose so, but again I'm not seeing any reason to support that.
Hmm, could that be useful for implementing a way to dynamically grow or
shrink an sframe because of jits? I'm just thinking about placeholders or
something.
-- Steve
On Fri, Jan 24, 2025 at 03:13:40PM -0500, Steven Rostedt wrote:
> On Fri, 24 Jan 2025 11:21:59 -0800
> Josh Poimboeuf <jpoimboe@kernel.org> wrote:
>
> > > given SFRAME_F_FRAME_POINTER in the header, is it really that
> > > nonsensical and illegal to have zero FDEs/FREs? Maybe we should allow
> > > that?
> >
> > It would seem a bit silly to create an empty .sframe section just to set
> > that SFRAME_F_FRAME_POINTER bit. Regardless, there's nothing the kernel
> > can do with that.
> >
> > > > + dbg("no fde/fre entries\n");
> > > > + return -EINVAL;
> > > > + }
> > > > +
> > > > + header_end = sec->sframe_start + SFRAME_HEADER_SIZE(shdr);
> > > > + if (header_end >= sec->sframe_end) {
> > >
> > > if we allow zero FDEs/FREs, header_end == sec->sframe_end is legal, right?
> >
> > I suppose so, but again I'm not seeing any reason to support that.
>
> Hmm, could that be useful for implementing a way to dynamically grow or
> shrink an sframe because of jits? I'm just thinking about placeholders or
> sohething.
Maybe?
I was thinking the kernel would have sframe_section placeholders for JIT
code sections, so when sframe_find() retrieves the struct for a given
IP, it sees the JIT flag is set along with a pointer to the in-memory
shared "sframe section", then goes to read that to get the corresponding
sframe entry (insert erratic hand waving).
It's still early days but it's quite possible the in-memory "sframe
section" formats might end up looking pretty different from the .sframe
file section spec.
--
Josh
© 2016 - 2026 Red Hat, Inc.