With CONFIG_CALL_PADDING enabled, there are 16 bytes (or more) of padding
space before every kernel function, and some kernel features already make
use of it, such as MITIGATION_CALL_DEPTH_TRACKING, CFI_CLANG, FINEIBT, etc.
In this commit, we add support for storing per-function metadata in the
function padding; previous discussion can be found in [1]. Generally
speaking, there are two ways to implement this feature:
1. create a function metadata array, prepend a 5-byte insn
"mov %eax, 0x12345678", and store that insn in the function padding,
where 0x12345678 is the index of the function metadata in the array.
This way, 5 bytes of the function padding are consumed.
2. prepend a 10-byte insn "mov %rax, 0x12345678aabbccdd" and store it
in the function padding, where 0x12345678aabbccdd is the address of the
function metadata.
Compared with way 2, way 1 consumes less space, but we need to do more
work on the global function metadata array. This commit implements way 1;
a minimal sketch of the resulting encoding is shown below.
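As an illustration of way 1, here is a minimal sketch of how the index can be
recovered from the padding. The 5-byte offset below is an assumption for
illustration only; the real offset in this patch depends on the
CFI_CLANG/CALL_THUNKS configuration (see func_meta.c further down):

#include <linux/types.h>

/*
 * Sketch only: assumes the 5-byte "mov %eax, imm32" (opcode 0xB8) is
 * written so that it ends right at the function entry.  The real offsets
 * in this patch vary with CFI_CLANG/CALL_THUNKS.
 */
#define META_INSN_OFFSET	5	/* assumed distance of the insn from entry */

static inline bool meta_insn_present(void *func)
{
	return *(u8 *)(func - META_INSN_OFFSET) == 0xB8;	/* mov %eax, imm32 */
}

static inline u32 meta_index(void *func)
{
	return *(u32 *)(func - META_INSN_OFFSET + 1);	/* imm32 follows the opcode */
}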
In my research, MITIGATION_CALL_DEPTH_TRACKING consumes the tail
9 bytes of the function padding, and FINEIBT+CFI_CLANG consumes the
head 7 bytes. So there is no space left for us if
MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled. Therefore,
the following logic is used:
1. use the head 5 bytes if CFI_CLANG is not enabled
2. use the tail 5 bytes if MITIGATION_CALL_DEPTH_TRACKING is not enabled
3. compile the kernel with 5 bytes of extra padding if
MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled.
In the third case, the kernel is compiled with 21 bytes of function
padding, which means the functions are no longer 16-byte aligned. In [2],
I tested the performance of the kernel with different paddings, and the
extra 5 bytes don't seem to have any impact on performance. However,
making the kernel functions not 16-byte aligned is a big change, and I'm
not sure whether it has any other influence. Another choice is to compile
the kernel with 32-byte alignment if there is no space available for us
in the function padding, but that increases the text size by ~5%. (And
I'm not sure which method to use.)
The beneficiaries of this feature can be BPF and ftrace. For BPF, we can
implement a "dynamic trampoline" and add tracing multi-link support based
on this feature; a rough usage sketch of the interface follows below. For
ftrace, we can optimize its performance based on it.
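As a rough illustration of how a consumer could use the interface added by
this patch (the tracer names are hypothetical; only func_meta_get() and
func_meta_put() come from the code below):

#include <linux/errno.h>
#include <linux/func_meta.h>

/* Hypothetical consumer: attach/detach some tracing state to a function. */
static int my_tracer_attach(void *func)
{
	struct func_meta *meta;

	/* allocates (or reuses) an entry and pokes the index insn into
	 * the padding of 'func'
	 */
	meta = func_meta_get(func);
	if (!meta)
		return -ENOMEM;

	/* ... remember 'meta' and use it from the tracer ... */
	return 0;
}

static void my_tracer_detach(void *func, struct func_meta *meta)
{
	/* drops the reference; the padding insn is restored to NOPs when
	 * the last user goes away
	 */
	func_meta_put(func, meta);
}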
This commit is not complete, as synchronization of func_metas is not
handled yet :/
Link: https://lore.kernel.org/bpf/CADxym3anLzM6cAkn_z71GDd_VeKiqqk1ts=xuiP7pr4PO6USPA@mail.gmail.com/ [1]
Link: https://lore.kernel.org/bpf/CADxym3af+CU5Mx8myB8UowdXSc3wJOqWyH4oyq+eXKahXBTXyg@mail.gmail.com/ [2]
Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
---
arch/x86/Kconfig | 15 ++++
arch/x86/Makefile | 17 ++--
include/linux/func_meta.h | 17 ++++
kernel/trace/Makefile | 1 +
kernel/trace/func_meta.c | 184 ++++++++++++++++++++++++++++++++++++++
5 files changed, 228 insertions(+), 6 deletions(-)
create mode 100644 include/linux/func_meta.h
create mode 100644 kernel/trace/func_meta.c
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6df7779ed6da..0ff3cb74cfc0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2510,6 +2510,21 @@ config PREFIX_SYMBOLS
def_bool y
depends on CALL_PADDING && !CFI_CLANG
+config FUNCTION_METADATA
+ bool "Enable function metadata support"
+ select CALL_PADDING
+ default y
+ help
+ This feature allows us to set metadata for kernel functions, and
+ to get the metadata of a function by its address without any
+ cost.
+
+ The index of the metadata is stored in the function padding,
+ which consumes 5 bytes. The spare space of the padding is
+ enough for us with CALL_PADDING and FUNCTION_ALIGNMENT_16B if
+ CALL_THUNKS or CFI_CLANG is not enabled. Otherwise, we need an extra
+ 5 bytes in the function padding, which increases the text size by ~1%.
+
menuconfig CPU_MITIGATIONS
bool "Mitigations for CPU vulnerabilities"
default y
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5b773b34768d..05689c9a8942 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -240,13 +240,18 @@ ifdef CONFIG_MITIGATION_SLS
endif
ifdef CONFIG_CALL_PADDING
-PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
-KBUILD_CFLAGS += $(PADDING_CFLAGS)
-export PADDING_CFLAGS
+ __padding_bytes := $(CONFIG_FUNCTION_PADDING_BYTES)
+ ifneq ($(and $(CONFIG_FUNCTION_METADATA),$(CONFIG_CALL_THUNKS),$(CONFIG_CFI_CLANG)),)
+ __padding_bytes := $(shell echo $(__padding_bytes) + 5 | bc)
+ endif
+
+ PADDING_CFLAGS := -fpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
+ KBUILD_CFLAGS += $(PADDING_CFLAGS)
+ export PADDING_CFLAGS
-PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
-KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
-export PADDING_RUSTFLAGS
+ PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
+ KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
+ export PADDING_RUSTFLAGS
endif
KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
diff --git a/include/linux/func_meta.h b/include/linux/func_meta.h
new file mode 100644
index 000000000000..840cbd858c47
--- /dev/null
+++ b/include/linux/func_meta.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_FUNC_META_H
+#define _LINUX_FUNC_META_H
+
+#include <linux/kernel.h>
+
+struct func_meta {
+ int users;
+ void *func;
+};
+
+extern struct func_meta *func_metas;
+
+struct func_meta *func_meta_get(void *ip);
+void func_meta_put(void *ip, struct func_meta *meta);
+
+#endif
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 057cd975d014..4042c168dcfc 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -106,6 +106,7 @@ obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
obj-$(CONFIG_FPROBE) += fprobe.o
obj-$(CONFIG_RETHOOK) += rethook.o
obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
+obj-$(CONFIG_FUNCTION_METADATA) += func_meta.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
obj-$(CONFIG_RV) += rv/
diff --git a/kernel/trace/func_meta.c b/kernel/trace/func_meta.c
new file mode 100644
index 000000000000..9ce77da81e71
--- /dev/null
+++ b/kernel/trace/func_meta.c
@@ -0,0 +1,184 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/slab.h>
+#include <linux/memory.h>
+#include <linux/func_meta.h>
+#include <linux/text-patching.h>
+
+#define FUNC_META_INSN_SIZE 5
+
+#ifdef CONFIG_CFI_CLANG
+ #ifdef CONFIG_CALL_THUNKS
+ /* use the extra 5 bytes that we reserved */
+ #define FUNC_META_INSN_OFFSET (CONFIG_FUNCTION_PADDING_BYTES + 5)
+ #define FUNC_META_DATA_OFFSET (CONFIG_FUNCTION_PADDING_BYTES + 4)
+ #else
+ /* use the space that CALL_THUNKS is supposed to use */
+ #define FUNC_META_INSN_OFFSET (5)
+ #define FUNC_META_DATA_OFFSET (4)
+ #endif
+#else
+ /* use the space that CFI_CLANG is supposed to use */
+ #define FUNC_META_INSN_OFFSET (CONFIG_FUNCTION_PADDING_BYTES)
+ #define FUNC_META_DATA_OFFSET (CONFIG_FUNCTION_PADDING_BYTES - 1)
+#endif
+
+static u32 func_meta_count = 32, func_meta_used;
+struct func_meta *func_metas;
+EXPORT_SYMBOL_GPL(func_metas);
+
+static DEFINE_MUTEX(func_meta_lock);
+
+static u32 func_meta_get_index(void *ip)
+{
+ return *(u32 *)(ip - FUNC_META_DATA_OFFSET);
+}
+
+static bool func_meta_exist(void *ip)
+{
+ return *(u8 *)(ip - FUNC_META_INSN_OFFSET) == 0xB8;
+}
+
+static void func_meta_init(struct func_meta *metas, u32 start, u32 end)
+{
+ u32 i;
+
+ for (i = start; i < end; i++)
+ metas[i].users = 0;
+}
+
+/* get next usable function metadata */
+static struct func_meta *func_meta_get_next(u32 *index)
+{
+ struct func_meta *tmp;
+ u32 i;
+
+ if (func_metas == NULL) {
+ func_metas = kmalloc_array(func_meta_count, sizeof(*tmp),
+ GFP_KERNEL);
+ if (!func_metas)
+ return NULL;
+ func_meta_init(func_metas, 0, func_meta_count);
+ }
+
+ /* maybe we can manage the used function metadata entry with a bit
+ * map ?
+ */
+ for (i = 0; i < func_meta_count; i++) {
+ if (!func_metas[i].users) {
+ func_meta_used++;
+ *index = i;
+ func_metas[i].users++;
+ return func_metas + i;
+ }
+ }
+
+ tmp = kmalloc_array(func_meta_count * 2, sizeof(*tmp), GFP_KERNEL);
+ if (!tmp)
+ return NULL;
+
+ memcpy(tmp, func_metas, func_meta_count * sizeof(*tmp));
+ kfree(func_metas);
+
+ /* TODO: we need a way to update func_metas synchronously, RCU ? */
+ func_metas = tmp;
+ func_meta_init(func_metas, func_meta_count, func_meta_count * 2);
+
+ tmp += func_meta_count;
+ *index = func_meta_count;
+ func_meta_count <<= 1;
+ func_meta_used++;
+ tmp->users++;
+
+ return tmp;
+}
+
+static int func_meta_text_poke(void *ip, u32 index, bool nop)
+{
+ const u8 nop_insn[FUNC_META_INSN_SIZE] = { BYTES_NOP1, BYTES_NOP1,
+ BYTES_NOP1, BYTES_NOP1,
+ BYTES_NOP1 };
+ u8 insn[FUNC_META_INSN_SIZE] = { 0xB8, };
+ const u8 *prog;
+ void *target;
+ int ret = 0;
+
+ target = ip - FUNC_META_INSN_OFFSET;
+ mutex_lock(&text_mutex);
+ if (nop) {
+ if (!memcmp(target, nop_insn, FUNC_META_INSN_SIZE))
+ goto out;
+ prog = nop_insn;
+ } else {
+ *(u32 *)(insn + 1) = index;
+ if (!memcmp(target, insn, FUNC_META_INSN_SIZE))
+ goto out;
+
+ if (memcmp(target, nop_insn, FUNC_META_INSN_SIZE)) {
+ ret = -EBUSY;
+ goto out;
+ }
+ prog = insn;
+ }
+ text_poke(target, prog, FUNC_META_INSN_SIZE);
+ text_poke_sync();
+out:
+ mutex_unlock(&text_mutex);
+ return ret;
+}
+
+static void __func_meta_put(void *ip, struct func_meta *meta)
+{
+ if (WARN_ON_ONCE(meta->users <= 0))
+ return;
+
+ meta->users--;
+ if (meta->users > 0)
+ return;
+
+ /* TODO: we need a way to shrink the array "func_metas" */
+ func_meta_used--;
+ if (!func_meta_exist(ip))
+ return;
+
+ func_meta_text_poke(ip, 0, true);
+}
+
+void func_meta_put(void *ip, struct func_meta *meta)
+{
+ mutex_lock(&func_meta_lock);
+ __func_meta_put(ip, meta);
+ mutex_unlock(&func_meta_lock);
+}
+EXPORT_SYMBOL_GPL(func_meta_put);
+
+struct func_meta *func_meta_get(void *ip)
+{
+ struct func_meta *tmp = NULL;
+ u32 index;
+
+ mutex_lock(&func_meta_lock);
+ if (func_meta_exist(ip)) {
+ index = func_meta_get_index(ip);
+ if (WARN_ON_ONCE(index >= func_meta_count))
+ goto out;
+
+ tmp = &func_metas[index];
+ tmp->users++;
+ goto out;
+ }
+
+ tmp = func_meta_get_next(&index);
+ if (!tmp)
+ goto out;
+
+ if (func_meta_text_poke(ip, index, false)) {
+ __func_meta_put(ip, tmp);
+ goto out;
+ }
+ tmp->func = ip;
+out:
+ mutex_unlock(&func_meta_lock);
+ return tmp;
+}
+EXPORT_SYMBOL_GPL(func_meta_get);
--
2.39.5
On Mon, 10 Feb 2025 18:40:34 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:
> With CONFIG_CALL_PADDING enabled, there will be 16-bytes(or more) padding
> space before all the kernel functions. And some kernel features can use
> it, such as MITIGATION_CALL_DEPTH_TRACKING, CFI_CLANG, FINEIBT, etc.
Please start your change log off with why you are doing this. Then you can
go into the details of what you are doing.
Explain what and how kernel features can use this.
>
> In this commit, we add supporting to store per-function metadata in the
Don't ever say "In this commit" in a change long. We know this is a commit.
The above can be written like:
"Support per-function metadata storage.." Although that should be after you
explain why this is being submitted.
> function padding, and previous discussion can be found in [1]. Generally
> speaking, we have two way to implement this feature:
>
> 1. create a function metadata array, and prepend a 5-bytes insn
> "mov %eax, 0x12345678", and store the insn to the function padding.
> And 0x12345678 means the index of the function metadata in the array.
> By this way, 5-bytes will be consumed in the function padding.
>
> 2. prepend a 10-bytes insn "mov %rax, 0x12345678aabbccdd" and store
> the insn to the function padding, and 0x12345678aabbccdd means the address
> of the function metadata.
>
> Compared with way 2, way 1 consume less space, but we need to do more work
> on the global function metadata array. And in this commit, we implemented
> the way 1.
>
> In my research, MITIGATION_CALL_DEPTH_TRACKING will consume the tail
> 9-bytes in the function padding, and FINEIBT+CFI_CLANG will consume
> the head 7-bytes. So there will be no space for us if
> MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled. So I have
> following logic:
> 1. use the head 5-bytes if CFI_CLANG is not enabled
> 2. use the tail 5-bytes if MITIGATION_CALL_DEPTH_TRACKING is not enabled
> 3. compile the kernel with extra 5-bytes padding if
> MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled.
>
> In the third case, we compile the kernel with a function padding of
> 21-bytes, which means the real function is not 16-bytes aligned any more.
> And in [2], I tested the performance of the kernel with different padding,
> and it seems that extra 5-bytes don't have impact on the performance.
> However, it's a huge change to make the kernel function unaligned in
> 16-bytes, and I'm sure if there are any other influence. So another choice
> is to compile the kernel with 32-bytes aligned if there is no space
> available for us in the function padding. But this will increase the text
> size ~5%. (And I'm not sure with method to use.)
>
> The beneficiaries of this feature can be BPF and ftrace. For BPF, we can
> implement a "dynamic trampoline" and add tracing multi-link supporting
> base on this feature. And for ftrace, we can optimize its performance
> base on this feature.
>
> This commit is not complete, as the synchronous of func_metas is not
> considered :/
>
> Link: https://lore.kernel.org/bpf/CADxym3anLzM6cAkn_z71GDd_VeKiqqk1ts=xuiP7pr4PO6USPA@mail.gmail.com/ [1]
> Link: https://lore.kernel.org/bpf/CADxym3af+CU5Mx8myB8UowdXSc3wJOqWyH4oyq+eXKahXBTXyg@mail.gmail.com/ [2]
> Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> ---
> arch/x86/Kconfig | 15 ++++
> arch/x86/Makefile | 17 ++--
> include/linux/func_meta.h | 17 ++++
> kernel/trace/Makefile | 1 +
> kernel/trace/func_meta.c | 184 ++++++++++++++++++++++++++++++++++++++
> 5 files changed, 228 insertions(+), 6 deletions(-)
> create mode 100644 include/linux/func_meta.h
> create mode 100644 kernel/trace/func_meta.c
>
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 6df7779ed6da..0ff3cb74cfc0 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -2510,6 +2510,21 @@ config PREFIX_SYMBOLS
> def_bool y
> depends on CALL_PADDING && !CFI_CLANG
>
> +config FUNCTION_METADATA
> + bool "Enable function metadata support"
> + select CALL_PADDING
> + default y
> + help
> + This feature allow us to set metadata for kernel functions, and
Who's "us"?
> + get the metadata of the function by its address without any
> + costing.
> +
> + The index of the metadata will be stored in the function padding,
> + which will consume 5-bytes. The spare space of the padding
> + is enough for us with CALL_PADDING and FUNCTION_ALIGNMENT_16B if
> + CALL_THUNKS or CFI_CLANG not enabled. Or, we need extra 5-bytes
> + in the function padding, which will increases text ~1%.
> +
> menuconfig CPU_MITIGATIONS
> bool "Mitigations for CPU vulnerabilities"
> default y
> diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> index 5b773b34768d..05689c9a8942 100644
> --- a/arch/x86/Makefile
> +++ b/arch/x86/Makefile
> @@ -240,13 +240,18 @@ ifdef CONFIG_MITIGATION_SLS
> endif
>
> ifdef CONFIG_CALL_PADDING
> -PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
> -KBUILD_CFLAGS += $(PADDING_CFLAGS)
> -export PADDING_CFLAGS
> + __padding_bytes := $(CONFIG_FUNCTION_PADDING_BYTES)
> + ifneq ($(and $(CONFIG_FUNCTION_METADATA),$(CONFIG_CALL_THUNKS),$(CONFIG_CFI_CLANG)),)
> + __padding_bytes := $(shell echo $(__padding_bytes) + 5 | bc)
> + endif
> +
> + PADDING_CFLAGS := -fpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
> + KBUILD_CFLAGS += $(PADDING_CFLAGS)
> + export PADDING_CFLAGS
Arm64 and other archs add meta data before the functions too. Can we have
an effort to perhaps share these methods?
>
> -PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
> -KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
> -export PADDING_RUSTFLAGS
> + PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
> + KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
> + export PADDING_RUSTFLAGS
> endif
>
> KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
> diff --git a/include/linux/func_meta.h b/include/linux/func_meta.h
> new file mode 100644
> index 000000000000..840cbd858c47
> --- /dev/null
> +++ b/include/linux/func_meta.h
If this is x86 only, why is this in generic code?
> @@ -0,0 +1,17 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _LINUX_FUNC_META_H
> +#define _LINUX_FUNC_META_H
> +
> +#include <linux/kernel.h>
> +
> +struct func_meta {
> + int users;
> + void *func;
> +};
> +
> +extern struct func_meta *func_metas;
> +
> +struct func_meta *func_meta_get(void *ip);
> +void func_meta_put(void *ip, struct func_meta *meta);
> +
> +#endif
> diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> index 057cd975d014..4042c168dcfc 100644
> --- a/kernel/trace/Makefile
> +++ b/kernel/trace/Makefile
Same here.
> @@ -106,6 +106,7 @@ obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
> obj-$(CONFIG_FPROBE) += fprobe.o
> obj-$(CONFIG_RETHOOK) += rethook.o
> obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
> +obj-$(CONFIG_FUNCTION_METADATA) += func_meta.o
>
> obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
> obj-$(CONFIG_RV) += rv/
> diff --git a/kernel/trace/func_meta.c b/kernel/trace/func_meta.c
> new file mode 100644
> index 000000000000..9ce77da81e71
> --- /dev/null
> +++ b/kernel/trace/func_meta.c
Unless this file will support arm64 meta data and other architectures
besides x86, then it should not be in the kernel/trace directory.
-- Steve
> @@ -0,0 +1,184 @@
> +// SPDX-License-Identifier: GPL-2.0
> +
> +#include <linux/slab.h>
> +#include <linux/memory.h>
> +#include <linux/func_meta.h>
> +#include <linux/text-patching.h>
> +
On Tue, Feb 11, 2025 at 7:05 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Mon, 10 Feb 2025 18:40:34 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> > With CONFIG_CALL_PADDING enabled, there will be 16-bytes(or more) padding
> > space before all the kernel functions. And some kernel features can use
> > it, such as MITIGATION_CALL_DEPTH_TRACKING, CFI_CLANG, FINEIBT, etc.
>
> Please start your change log off with why you are doing this. Then you can
> go into the details of what you are doing.
>
> Explain what and how kernel features can use this.
Yeah, you are right, and I'll add the following message to the
start of the log:
For now, there isn't a way to set and get per-function metadata with
a low overhead, which is not convenient for some situations. Take the
BPF trampoline for example: we need to create a trampoline for each
kernel function, as we have to store information about the function,
such as the BPF progs, the function arg count, etc., in the trampoline.
The performance cost and memory consumption of creating all these
trampolines can be high. With per-function metadata storage support, we
can store the BPF progs, function arg count, etc., in the metadata and
create a single global BPF trampoline for all kernel functions. In the
global trampoline, we can get the information that we need from the
function metadata through the ip (function address) with almost no overhead.
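To make that concrete, a hypothetical sketch of the lookup such a global
trampoline could do is below; the -4 offset assumes the !CFI_CLANG layout of
this patch, and any BPF-related fields would need extensions that struct
func_meta does not have yet:

#include <linux/func_meta.h>

/*
 * Hypothetical sketch: find the metadata of the function the global
 * trampoline was entered from.  The -4 offset (imm32 of the padding's
 * "mov %eax, imm32") is an assumption matching the !CFI_CLANG case.
 */
static __always_inline struct func_meta *meta_of_traced_func(unsigned long ip)
{
	u32 index = *(u32 *)(ip - 4);	/* index stored by func_meta_get() */

	return &func_metas[index];
}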
Another beneficiary can be ftrace. For now, all the kernel functions that
are enabled by dynamic ftrace are added to a filter hash, and a hash
lookup happens when the traced functions are called, which has an
impact on performance, see
__ftrace_ops_list_func() -> ftrace_ops_test(). With per-function metadata
support, we can store in the metadata whether the ftrace ops is enabled
for a given kernel function.
>
> >
> > In this commit, we add supporting to store per-function metadata in the
>
> Don't ever say "In this commit" in a change long. We know this is a commit.
> The above can be written like:
>
> "Support per-function metadata storage.." Although that should be after you
> explain why this is being submitted.
Okay, looks much better!
>
> > function padding, and previous discussion can be found in [1]. Generally
> > speaking, we have two way to implement this feature:
> >
> > 1. create a function metadata array, and prepend a 5-bytes insn
> > "mov %eax, 0x12345678", and store the insn to the function padding.
> > And 0x12345678 means the index of the function metadata in the array.
> > By this way, 5-bytes will be consumed in the function padding.
> >
> > 2. prepend a 10-bytes insn "mov %rax, 0x12345678aabbccdd" and store
> > the insn to the function padding, and 0x12345678aabbccdd means the address
> > of the function metadata.
> >
> > Compared with way 2, way 1 consume less space, but we need to do more work
> > on the global function metadata array. And in this commit, we implemented
> > the way 1.
> >
> > In my research, MITIGATION_CALL_DEPTH_TRACKING will consume the tail
> > 9-bytes in the function padding, and FINEIBT+CFI_CLANG will consume
> > the head 7-bytes. So there will be no space for us if
> > MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled. So I have
> > following logic:
> > 1. use the head 5-bytes if CFI_CLANG is not enabled
> > 2. use the tail 5-bytes if MITIGATION_CALL_DEPTH_TRACKING is not enabled
> > 3. compile the kernel with extra 5-bytes padding if
> > MITIGATION_CALL_DEPTH_TRACKING and CFI_CLANG are both enabled.
> >
> > In the third case, we compile the kernel with a function padding of
> > 21-bytes, which means the real function is not 16-bytes aligned any more.
> > And in [2], I tested the performance of the kernel with different padding,
> > and it seems that extra 5-bytes don't have impact on the performance.
> > However, it's a huge change to make the kernel function unaligned in
> > 16-bytes, and I'm sure if there are any other influence. So another choice
> > is to compile the kernel with 32-bytes aligned if there is no space
> > available for us in the function padding. But this will increase the text
> > size ~5%. (And I'm not sure with method to use.)
> >
> > The beneficiaries of this feature can be BPF and ftrace. For BPF, we can
> > implement a "dynamic trampoline" and add tracing multi-link supporting
> > base on this feature. And for ftrace, we can optimize its performance
> > base on this feature.
> >
> > This commit is not complete, as the synchronous of func_metas is not
> > considered :/
> >
> > Link: https://lore.kernel.org/bpf/CADxym3anLzM6cAkn_z71GDd_VeKiqqk1ts=xuiP7pr4PO6USPA@mail.gmail.com/ [1]
> > Link: https://lore.kernel.org/bpf/CADxym3af+CU5Mx8myB8UowdXSc3wJOqWyH4oyq+eXKahXBTXyg@mail.gmail.com/ [2]
> > Signed-off-by: Menglong Dong <dongml2@chinatelecom.cn>
> > ---
> > arch/x86/Kconfig | 15 ++++
> > arch/x86/Makefile | 17 ++--
> > include/linux/func_meta.h | 17 ++++
> > kernel/trace/Makefile | 1 +
> > kernel/trace/func_meta.c | 184 ++++++++++++++++++++++++++++++++++++++
> > 5 files changed, 228 insertions(+), 6 deletions(-)
> > create mode 100644 include/linux/func_meta.h
> > create mode 100644 kernel/trace/func_meta.c
> >
> > diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> > index 6df7779ed6da..0ff3cb74cfc0 100644
> > --- a/arch/x86/Kconfig
> > +++ b/arch/x86/Kconfig
> > @@ -2510,6 +2510,21 @@ config PREFIX_SYMBOLS
> > def_bool y
> > depends on CALL_PADDING && !CFI_CLANG
> >
> > +config FUNCTION_METADATA
> > + bool "Enable function metadata support"
> > + select CALL_PADDING
> > + default y
> > + help
> > + This feature allow us to set metadata for kernel functions, and
>
> Who's "us"?
The user of this feature? Maybe change it to:
Support per-function metadata storage......
Just like what you suggested above :/
>
> > + get the metadata of the function by its address without any
> > + costing.
> > +
> > + The index of the metadata will be stored in the function padding,
> > + which will consume 5-bytes. The spare space of the padding
> > + is enough for us with CALL_PADDING and FUNCTION_ALIGNMENT_16B if
> > + CALL_THUNKS or CFI_CLANG not enabled. Or, we need extra 5-bytes
> > + in the function padding, which will increases text ~1%.
> > +
> > menuconfig CPU_MITIGATIONS
> > bool "Mitigations for CPU vulnerabilities"
> > default y
> > diff --git a/arch/x86/Makefile b/arch/x86/Makefile
> > index 5b773b34768d..05689c9a8942 100644
> > --- a/arch/x86/Makefile
> > +++ b/arch/x86/Makefile
> > @@ -240,13 +240,18 @@ ifdef CONFIG_MITIGATION_SLS
> > endif
> >
> > ifdef CONFIG_CALL_PADDING
> > -PADDING_CFLAGS := -fpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
> > -KBUILD_CFLAGS += $(PADDING_CFLAGS)
> > -export PADDING_CFLAGS
> > + __padding_bytes := $(CONFIG_FUNCTION_PADDING_BYTES)
> > + ifneq ($(and $(CONFIG_FUNCTION_METADATA),$(CONFIG_CALL_THUNKS),$(CONFIG_CFI_CLANG)),)
> > + __padding_bytes := $(shell echo $(__padding_bytes) + 5 | bc)
> > + endif
> > +
> > + PADDING_CFLAGS := -fpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
> > + KBUILD_CFLAGS += $(PADDING_CFLAGS)
> > + export PADDING_CFLAGS
>
> Arm64 and other archs add meta data before the functions too. Can we have
> an effort to perhaps share these methods?
I have not done research on arm64 yet. AFAIK, arm64 functions are 16-byte
aligned, so the way we handle this can be a little different there, as
making kernel functions not 16-byte aligned could have a huge influence.
And I'm hesitant about 2 points, as I described in the commit log:
1. prepend a 5-byte insn or a 10-byte one? The 10-byte variant is much
simpler to code, but consumes more padding space.
2. add 5/10 extra padding bytes, or select FUNCTION_ALIGNMENT_32B?
The latter makes the text size bigger, but is less intrusive.
Does anyone have any idea about the choices above? I would
appreciate hearing some advice here :/
And yes, once the final solution is determined, the other arches
(at least arm64) can support this feature too.
>
> >
> > -PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(CONFIG_FUNCTION_PADDING_BYTES),$(CONFIG_FUNCTION_PADDING_BYTES)
> > -KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
> > -export PADDING_RUSTFLAGS
> > + PADDING_RUSTFLAGS := -Zpatchable-function-entry=$(__padding_bytes),$(__padding_bytes)
> > + KBUILD_RUSTFLAGS += $(PADDING_RUSTFLAGS)
> > + export PADDING_RUSTFLAGS
> > endif
> >
> > KBUILD_LDFLAGS += -m elf_$(UTS_MACHINE)
> > diff --git a/include/linux/func_meta.h b/include/linux/func_meta.h
> > new file mode 100644
> > index 000000000000..840cbd858c47
> > --- /dev/null
> > +++ b/include/linux/func_meta.h
>
> If this is x86 only, why is this in generic code?
>
> > @@ -0,0 +1,17 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _LINUX_FUNC_META_H
> > +#define _LINUX_FUNC_META_H
> > +
> > +#include <linux/kernel.h>
> > +
> > +struct func_meta {
> > + int users;
> > + void *func;
> > +};
> > +
> > +extern struct func_meta *func_metas;
> > +
> > +struct func_meta *func_meta_get(void *ip);
> > +void func_meta_put(void *ip, struct func_meta *meta);
> > +
> > +#endif
> > diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
> > index 057cd975d014..4042c168dcfc 100644
> > --- a/kernel/trace/Makefile
> > +++ b/kernel/trace/Makefile
>
> Same here.
>
> > @@ -106,6 +106,7 @@ obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o
> > obj-$(CONFIG_FPROBE) += fprobe.o
> > obj-$(CONFIG_RETHOOK) += rethook.o
> > obj-$(CONFIG_FPROBE_EVENTS) += trace_fprobe.o
> > +obj-$(CONFIG_FUNCTION_METADATA) += func_meta.o
> >
> > obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
> > obj-$(CONFIG_RV) += rv/
> > diff --git a/kernel/trace/func_meta.c b/kernel/trace/func_meta.c
> > new file mode 100644
> > index 000000000000..9ce77da81e71
> > --- /dev/null
> > +++ b/kernel/trace/func_meta.c
>
> Unless this file will support arm64 meta data and other architectures
> besides x86, then it should not be in the kernel/trace directory.
Yeah, I am planning to support arm64 metadata (maybe in
the next commit?). Some code in this file is x86-specific, and
I should move it to the arch/x86 directory :/
Thanks!
Menglong Dong
>
> -- Steve
>
>
> > @@ -0,0 +1,184 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +
> > +#include <linux/slab.h>
> > +#include <linux/memory.h>
> > +#include <linux/func_meta.h>
> > +#include <linux/text-patching.h>
> > +
On Tue, 11 Feb 2025 20:03:38 +0800
Menglong Dong <menglong8.dong@gmail.com> wrote:

> Another beneficiary can be ftrace. For now, all the kernel functions that
> are enabled by dynamic ftrace will be added to a filter hash. And hash
> lookup will happen when then traced functions are called, which has an
> impact on the performance, see
> __ftrace_ops_list_func() -> ftrace_ops_test(). With the per-function metadata
> support, we can store the information that if the ftrace ops is enabled on the
> kernel function to the metadata.

Note, ftrace only uses ftrace_ops_list if there's more than one callback
attached to the same function. Otherwise it calls directly to a single
trampoline, and is rather efficient. No meta data needed.

> > Arm64 and other archs add meta data before the functions too. Can we have
> > an effort to perhaps share these methods?
>
> I have not done research on arm64 yet. AFAIK, arm64 insn is 16-bytes aligned,
> so the way we process can be a little different here, as making kernel function
> non 16-bytes aligned can have a huge influence.

Arm64 already uses the meta data before every function. That's where it
stores a pointer to the ftrace_ops. So in ftrace, when there's a single
callback attached to a function in arm64, it jumps to a ftrace trampoline,
that will reference the function's meta data to find the ftrace_ops to use
for that callback.

If more than one callback is attached to the same function, then it acts
just like x86 and does the loop.

-- Steve
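(For reference, a rough sketch of the arm64 mechanism Steve describes, i.e.
DYNAMIC_FTRACE_WITH_CALL_OPS; the exact offset of the literal is an
illustrative assumption here, not the real arm64 layout:

#include <linux/ftrace.h>

/*
 * Illustrative only: the patchable-function-entry padding holds a
 * pointer-sized literal that ftrace fills with the ftrace_ops attached
 * to the function, so the trampoline can load it directly.
 */
static inline struct ftrace_ops *ops_from_padding(unsigned long entry)
{
	/* assumed: the ops pointer sits immediately before the entry point */
	return *(struct ftrace_ops **)(entry - sizeof(struct ftrace_ops *));
}
)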
On Wed, Feb 12, 2025 at 12:24 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 11 Feb 2025 20:03:38 +0800
> Menglong Dong <menglong8.dong@gmail.com> wrote:
>
> > Another beneficiary can be ftrace. For now, all the kernel functions that
> > are enabled by dynamic ftrace will be added to a filter hash. And hash
> > lookup will happen when then traced functions are called, which has an
> > impact on the performance, see
> > __ftrace_ops_list_func() -> ftrace_ops_test(). With the per-function metadata
> > support, we can store the information that if the ftrace ops is enabled on the
> > kernel function to the metadata.
>
> Note, ftrace only uses ftrace_ops_list if there's more than one callback
> attached to the same function. Otherwise it calls directly to a single
> trampoline, and is rather efficient. No meta data needed.

Sorry that the log didn't describe it accurately; the multi-callback case is
just what I meant. I'm not sure if this feature is suitable for that case, so
let me just remove this part from the commit log, and we can revisit it later :/

> > > Arm64 and other archs add meta data before the functions too. Can we have
> > > an effort to perhaps share these methods?
> >
> > I have not done research on arm64 yet. AFAIK, arm64 insn is 16-bytes aligned,
> > so the way we process can be a little different here, as making kernel function
> > non 16-bytes aligned can have a huge influence.
>
> Arm64 already uses the meta data before every function. That's where it
> stores a pointer to the ftrace_ops. So in ftrace, when there's a single
> callback attached to a function in arm64, it jumps to a ftrace trampoline,
> that will reference the function's meta data to find the ftrace_ops to use
> for that callback.
>
> If more than one callback is attached to the same function, then it acts
> just like x86 and does the loop.

Thank you for your explanation. It seems quite simple to implement the
function metadata on arm64. Let me dig deeper into arm64, and I'll implement
it together with x86 in the next version if possible.

Thanks!
Menglong Dong

> -- Steve