Use separate per-call-site kmem_cache or kmem_buckets. These are
allocated on demand to avoid wasting memory for unused caches.
A few caches need to be allocated very early to support allocating the
caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
GFP_ATOMIC allocations are currently left to be allocated from
KMALLOC_NORMAL.
With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
text that compares the features.
Improvements needed:
- Retain call site gfp flags in alloc_tag meta field to:
  - pre-allocate all GFP_ATOMIC caches (since their caches cannot
    be allocated on demand unless we want them to be GFP_ATOMIC
    themselves...)
  - Separate MEMCG allocations as well
- Allocate individual caches within kmem_buckets on demand to
  further reduce memory usage overhead.
Signed-off-by: Kees Cook <kees@kernel.org>
---
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Kent Overstreet <kent.overstreet@linux.dev>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
Cc: linux-mm@kvack.org
---
include/linux/alloc_tag.h | 8 +++
lib/alloc_tag.c | 121 +++++++++++++++++++++++++++++++++++---
mm/Kconfig | 19 +++++-
mm/slab_common.c | 1 +
mm/slub.c | 31 +++++++++-
5 files changed, 170 insertions(+), 10 deletions(-)
diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
index f5d8c5849b82..c95628f9b049 100644
--- a/include/linux/alloc_tag.h
+++ b/include/linux/alloc_tag.h
@@ -24,6 +24,7 @@ struct alloc_tag_counters {
struct alloc_meta {
/* 0 means non-slab, SIZE_MAX means dynamic, and everything else is fixed-size. */
size_t sized;
+ void *cache;
};
#define ALLOC_META_INIT(_size) { \
.sized = (__builtin_constant_p(_size) ? (_size) : SIZE_MAX), \
@@ -216,6 +217,13 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
#endif /* CONFIG_MEM_ALLOC_PROFILING */
+#ifdef CONFIG_SLAB_PER_SITE
+void alloc_tag_early_walk(void);
+void alloc_tag_site_init(struct codetag *ct, bool ondemand);
+#else
+static inline void alloc_tag_early_walk(void) {}
+#endif
+
#define alloc_hooks_tag(_tag, _do_alloc) \
({ \
struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \
diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
index 6d2cb72bf269..e8a66a7c4a6b 100644
--- a/lib/alloc_tag.c
+++ b/lib/alloc_tag.c
@@ -157,6 +157,89 @@ static void __init procfs_init(void)
proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op);
}
+#ifdef CONFIG_SLAB_PER_SITE
+static bool ondemand_ready;
+
+void alloc_tag_site_init(struct codetag *ct, bool ondemand)
+{
+ struct alloc_tag *tag = ct_to_alloc_tag(ct);
+ char *name;
+ void *p, *old;
+
+ /* Only handle kmalloc allocations. */
+ if (!tag->meta.sized)
+ return;
+
+ /* Must be ready for on-demand allocations. */
+ if (ondemand && !ondemand_ready)
+ return;
+
+ old = READ_ONCE(tag->meta.cache);
+ /* Already allocated? */
+ if (old)
+ return;
+
+ if (tag->meta.sized < SIZE_MAX) {
+ /* Fixed-size allocations. */
+ name = kasprintf(GFP_KERNEL, "f:%zu:%s:%d", tag->meta.sized, ct->function, ct->lineno);
+ if (WARN_ON_ONCE(!name))
+ return;
+ /*
+ * As with KMALLOC_NORMAL, the entire allocation needs to be
+ * open to usercopy access. :(
+ */
+ p = kmem_cache_create_usercopy(name, tag->meta.sized, 0,
+ SLAB_NO_MERGE, 0, tag->meta.sized,
+ NULL);
+ } else {
+ /* Dynamically-sized allocations. */
+ name = kasprintf(GFP_KERNEL, "d:%s:%d", ct->function, ct->lineno);
+ if (WARN_ON_ONCE(!name))
+ return;
+ p = kmem_buckets_create(name, SLAB_NO_MERGE, 0, UINT_MAX, NULL);
+ }
+ if (p) {
+ if (unlikely(!try_cmpxchg(&tag->meta.cache, &old, p))) {
+ /* We lost the allocation race; clean up. */
+ if (tag->meta.sized < SIZE_MAX)
+ kmem_cache_destroy(p);
+ else
+ kmem_buckets_destroy(p);
+ }
+ }
+ kfree(name);
+}
+
+static void alloc_tag_site_init_early(struct codetag *ct)
+{
+ /* Explicitly initialize the caches needed to initialize caches. */
+ if (strcmp(ct->function, "kstrdup") == 0 ||
+ strcmp(ct->function, "kvasprintf") == 0 ||
+ strcmp(ct->function, "pcpu_mem_zalloc") == 0)
+ alloc_tag_site_init(ct, false);
+
+ /* TODO: pre-allocate GFP_ATOMIC caches here. */
+}
+#endif
+
+static void alloc_tag_module_load(struct codetag_type *cttype,
+ struct codetag_module *cmod)
+{
+#ifdef CONFIG_SLAB_PER_SITE
+ struct codetag_iterator iter;
+ struct codetag *ct;
+
+ iter = codetag_get_ct_iter(cttype);
+ for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
+ if (iter.cmod != cmod)
+ continue;
+
+ /* TODO: pre-allocate GFP_ATOMIC caches here. */
+ //alloc_tag_site_init(ct, false);
+ }
+#endif
+}
+
static bool alloc_tag_module_unload(struct codetag_type *cttype,
struct codetag_module *cmod)
{
@@ -175,8 +258,21 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
if (WARN(counter.bytes,
"%s:%u module %s func:%s has %llu allocated at module unload",
- ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
+ ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) {
module_unused = false;
+ }
+#ifdef CONFIG_SLAB_PER_SITE
+ else if (tag->meta.sized) {
+ /* Remove the allocated caches, if possible. */
+ void *p = READ_ONCE(tag->meta.cache);
+
+ WRITE_ONCE(tag->meta.cache, NULL);
+ if (tag->meta.sized < SIZE_MAX)
+ kmem_cache_destroy(p);
+ else
+ kmem_buckets_destroy(p);
+ }
+#endif
}
return module_unused;
@@ -260,15 +356,16 @@ static void __init sysctl_init(void)
static inline void sysctl_init(void) {}
#endif /* CONFIG_SYSCTL */
+static const struct codetag_type_desc alloc_tag_desc = {
+ .section = "alloc_tags",
+ .tag_size = sizeof(struct alloc_tag),
+ .module_load = alloc_tag_module_load,
+ .module_unload = alloc_tag_module_unload,
+};
+
static int __init alloc_tag_init(void)
{
- const struct codetag_type_desc desc = {
- .section = "alloc_tags",
- .tag_size = sizeof(struct alloc_tag),
- .module_unload = alloc_tag_module_unload,
- };
-
- alloc_tag_cttype = codetag_register_type(&desc);
+ alloc_tag_cttype = codetag_register_type(&alloc_tag_desc);
if (IS_ERR(alloc_tag_cttype))
return PTR_ERR(alloc_tag_cttype);
@@ -278,3 +375,11 @@ static int __init alloc_tag_init(void)
return 0;
}
module_init(alloc_tag_init);
+
+#ifdef CONFIG_SLAB_PER_SITE
+void alloc_tag_early_walk(void)
+{
+ codetag_early_walk(&alloc_tag_desc, alloc_tag_site_init_early);
+ ondemand_ready = true;
+}
+#endif
diff --git a/mm/Kconfig b/mm/Kconfig
index 855c63c3270d..4f01cb6dd32e 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -302,7 +302,20 @@ config SLAB_PER_SITE
default SLAB_FREELIST_HARDENED
select SLAB_BUCKETS
help
- Track sizes of kmalloc() call sites.
+ As a defense against shared-cache "type confusion" use-after-free
+ attacks, every kmalloc()-family call allocates from a separate
+ kmem_cache (or when dynamically sized, kmem_buckets). Attackers
+ will no longer be able to groom malicious objects via similarly
+ sized allocations that share the same cache as the target object.
+
+ This increases the "at rest" kmalloc slab memory usage by
+ roughly 5x (around 7MiB), and adds the potential for greater
+ long-term memory fragmentation. However, some workloads
+ actually see performance improvements when single allocation
+ sites are hot.
+
+ For a similar defense, see CONFIG_RANDOM_KMALLOC_CACHES, which
+ has less memory usage overhead, but is probabilistic.
config SLUB_STATS
default n
@@ -331,6 +344,7 @@ config SLUB_CPU_PARTIAL
config RANDOM_KMALLOC_CACHES
default n
depends on !SLUB_TINY
+ depends on !SLAB_PER_SITE
bool "Randomize slab caches for normal kmalloc"
help
A hardening feature that creates multiple copies of slab caches for
@@ -345,6 +359,9 @@ config RANDOM_KMALLOC_CACHES
limited degree of memory and CPU overhead that relates to hardware and
system workload.
+ For a similar defense, see CONFIG_SLAB_PER_SITE, which is
+ deterministic, but has greater memory usage overhead.
+
endmenu # Slab allocator options
config SHUFFLE_PAGE_ALLOCATOR
diff --git a/mm/slab_common.c b/mm/slab_common.c
index fc698cba0ebe..09506bfa972c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1040,6 +1040,7 @@ void __init create_kmalloc_caches(void)
kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
sizeof(kmem_buckets),
0, SLAB_NO_MERGE, NULL);
+ alloc_tag_early_walk();
}
/**
diff --git a/mm/slub.c b/mm/slub.c
index 3520acaf9afa..d14102c4b4d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4135,6 +4135,35 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
}
EXPORT_SYMBOL(__kmalloc_large_node_noprof);
+static __always_inline
+struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
+ unsigned long caller)
+{
+#ifdef CONFIG_SLAB_PER_SITE
+ struct alloc_tag *tag = current->alloc_tag;
+
+ if (!b && tag && tag->meta.sized &&
+ kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
+ (flags & GFP_ATOMIC) != GFP_ATOMIC) {
+ void *p = READ_ONCE(tag->meta.cache);
+
+ if (!p && slab_state >= UP) {
+ alloc_tag_site_init(&tag->ct, true);
+ p = READ_ONCE(tag->meta.cache);
+ }
+
+ if (tag->meta.sized < SIZE_MAX) {
+ if (p)
+ return p;
+ /* Otherwise continue with default buckets. */
+ } else {
+ b = p;
+ }
+ }
+#endif
+ return kmalloc_slab(size, b, flags, caller);
+}
+
static __always_inline
void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
unsigned long caller)
@@ -4152,7 +4181,7 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
if (unlikely(!size))
return ZERO_SIZE_PTR;
- s = kmalloc_slab(size, b, flags, caller);
+ s = choose_slab(size, b, flags, caller);
ret = slab_alloc_node(s, NULL, flags, node, caller, size);
ret = kasan_kmalloc(s, ret, size, flags);
--
2.34.1
On Fri, Aug 9, 2024 at 12:33 AM Kees Cook <kees@kernel.org> wrote:
>
> Use separate per-call-site kmem_cache or kmem_buckets. These are
> allocated on demand to avoid wasting memory for unused caches.
>
> A few caches need to be allocated very early to support allocating the
> caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> GFP_ATOMIC allocations are currently left to be allocated from
> KMALLOC_NORMAL.
>
> With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
>
> Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> text that compares the features.
>
> Improvements needed:
> - Retain call site gfp flags in alloc_tag meta field to:
> - pre-allocate all GFP_ATOMIC caches (since their caches cannot
> be allocated on demand unless we want them to be GFP_ATOMIC
> themselves...)
I'm currently working on a feature to identify allocations with
__GFP_ACCOUNT known at compile time (similar to how you handle the
size in the previous patch). Might be something you can reuse/extend.
> - Separate MEMCG allocations as well
Do you mean allocations with __GFP_ACCOUNT or something else?
> - Allocate individual caches within kmem_buckets on demand to
> further reduce memory usage overhead.
>
> Signed-off-by: Kees Cook <kees@kernel.org>
> ---
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Roman Gushchin <roman.gushchin@linux.dev>
> Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> Cc: linux-mm@kvack.org
> ---
> include/linux/alloc_tag.h | 8 +++
> lib/alloc_tag.c | 121 +++++++++++++++++++++++++++++++++++---
> mm/Kconfig | 19 +++++-
> mm/slab_common.c | 1 +
> mm/slub.c | 31 +++++++++-
> 5 files changed, 170 insertions(+), 10 deletions(-)
>
> diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h
> index f5d8c5849b82..c95628f9b049 100644
> --- a/include/linux/alloc_tag.h
> +++ b/include/linux/alloc_tag.h
> @@ -24,6 +24,7 @@ struct alloc_tag_counters {
> struct alloc_meta {
> /* 0 means non-slab, SIZE_MAX means dynamic, and everything else is fixed-size. */
> size_t sized;
> + void *cache;
I see now where that meta.cache in the previous patch came from...
That part should be moved here.
> };
> #define ALLOC_META_INIT(_size) { \
> .sized = (__builtin_constant_p(_size) ? (_size) : SIZE_MAX), \
> @@ -216,6 +217,13 @@ static inline void alloc_tag_sub(union codetag_ref *ref, size_t bytes) {}
>
> #endif /* CONFIG_MEM_ALLOC_PROFILING */
>
> +#ifdef CONFIG_SLAB_PER_SITE
> +void alloc_tag_early_walk(void);
> +void alloc_tag_site_init(struct codetag *ct, bool ondemand);
> +#else
> +static inline void alloc_tag_early_walk(void) {}
> +#endif
> +
> #define alloc_hooks_tag(_tag, _do_alloc) \
> ({ \
> struct alloc_tag * __maybe_unused _old = alloc_tag_save(_tag); \
> diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c
> index 6d2cb72bf269..e8a66a7c4a6b 100644
> --- a/lib/alloc_tag.c
> +++ b/lib/alloc_tag.c
> @@ -157,6 +157,89 @@ static void __init procfs_init(void)
> proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op);
> }
>
> +#ifdef CONFIG_SLAB_PER_SITE
> +static bool ondemand_ready;
> +
> +void alloc_tag_site_init(struct codetag *ct, bool ondemand)
> +{
> + struct alloc_tag *tag = ct_to_alloc_tag(ct);
> + char *name;
> + void *p, *old;
> +
> + /* Only handle kmalloc allocations. */
> + if (!tag->meta.sized)
> + return;
> +
> + /* Must be ready for on-demand allocations. */
> + if (ondemand && !ondemand_ready)
> + return;
> +
> + old = READ_ONCE(tag->meta.cache);
> + /* Already allocated? */
> + if (old)
> + return;
> +
> + if (tag->meta.sized < SIZE_MAX) {
> + /* Fixed-size allocations. */
> + name = kasprintf(GFP_KERNEL, "f:%zu:%s:%d", tag->meta.sized, ct->function, ct->lineno);
> + if (WARN_ON_ONCE(!name))
> + return;
> + /*
> + * As with KMALLOC_NORMAL, the entire allocation needs to be
> + * open to usercopy access. :(
> + */
> + p = kmem_cache_create_usercopy(name, tag->meta.sized, 0,
> + SLAB_NO_MERGE, 0, tag->meta.sized,
> + NULL);
> + } else {
> + /* Dynamically-sized allocations. */
> + name = kasprintf(GFP_KERNEL, "d:%s:%d", ct->function, ct->lineno);
> + if (WARN_ON_ONCE(!name))
> + return;
> + p = kmem_buckets_create(name, SLAB_NO_MERGE, 0, UINT_MAX, NULL);
> + }
> + if (p) {
> + if (unlikely(!try_cmpxchg(&tag->meta.cache, &old, p))) {
> + /* We lost the allocation race; clean up. */
> + if (tag->meta.sized < SIZE_MAX)
> + kmem_cache_destroy(p);
> + else
> + kmem_buckets_destroy(p);
> + }
> + }
> + kfree(name);
> +}
> +
> +static void alloc_tag_site_init_early(struct codetag *ct)
> +{
> + /* Explicitly initialize the caches needed to initialize caches. */
> + if (strcmp(ct->function, "kstrdup") == 0 ||
> + strcmp(ct->function, "kvasprintf") == 0 ||
> + strcmp(ct->function, "pcpu_mem_zalloc") == 0)
I hope we can find a better way to distinguish these allocations.
Maybe have a specialized hook for them, like alloc_hooks_early() which
sets a bit inside ct->flags to distinguish them?
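Something like this, perhaps (untested sketch; the CODETAG_FLAG_EARLY
bit and the alloc_hooks_early() name are invented here):

	static void alloc_tag_site_init_early(struct codetag *ct)
	{
		/* Early-boot sites are marked via a codetag flag
		 * instead of matched by function name. */
		if (ct->flags & CODETAG_FLAG_EARLY)
			alloc_tag_site_init(ct, false);
	}

where alloc_hooks_early() would be an alloc_hooks() variant whose tag
is emitted with CODETAG_FLAG_EARLY already set.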
> + alloc_tag_site_init(ct, false);
> +
> + /* TODO: pre-allocate GFP_ATOMIC caches here. */
You could pre-allocate GFP_ATOMIC caches during
alloc_tag_module_load() only if gfp_flags are known at compile time I
think. I guess for the dynamic case choose_slab() will fall back to
kmalloc_slab()?
> +}
> +#endif
> +
> +static void alloc_tag_module_load(struct codetag_type *cttype,
> + struct codetag_module *cmod)
> +{
> +#ifdef CONFIG_SLAB_PER_SITE
> + struct codetag_iterator iter;
> + struct codetag *ct;
> +
> + iter = codetag_get_ct_iter(cttype);
> + for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) {
> + if (iter.cmod != cmod)
> + continue;
> +
> + /* TODO: pre-allocate GFP_ATOMIC caches here. */
> + //alloc_tag_site_init(ct, false);
> + }
> +#endif
> +}
> +
> static bool alloc_tag_module_unload(struct codetag_type *cttype,
> struct codetag_module *cmod)
> {
> @@ -175,8 +258,21 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
>
> if (WARN(counter.bytes,
> "%s:%u module %s func:%s has %llu allocated at module unload",
> - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) {
> module_unused = false;
> + }
> +#ifdef CONFIG_SLAB_PER_SITE
> + else if (tag->meta.sized) {
> + /* Remove the allocated caches, if possible. */
> + void *p = READ_ONCE(tag->meta.cache);
> +
> + WRITE_ONCE(tag->meta.cache, NULL);
I'm guessing you are not using try_cmpxchg() the same way you did in
alloc_tag_site_init() because a race with any other user is impossible
at module unload time? If so, a comment mentioning that would be
good.
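Something as short as this would do (illustrative):

	/*
	 * No try_cmpxchg() needed: nothing else can race to install
	 * this site's cache while the module is being unloaded.
	 */
	WRITE_ONCE(tag->meta.cache, NULL);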
> + if (tag->meta.sized < SIZE_MAX)
> + kmem_cache_destroy(p);
> + else
> + kmem_buckets_destroy(p);
> + }
> +#endif
> }
>
> return module_unused;
> @@ -260,15 +356,16 @@ static void __init sysctl_init(void)
> static inline void sysctl_init(void) {}
> #endif /* CONFIG_SYSCTL */
>
> +static const struct codetag_type_desc alloc_tag_desc = {
> + .section = "alloc_tags",
> + .tag_size = sizeof(struct alloc_tag),
> + .module_load = alloc_tag_module_load,
> + .module_unload = alloc_tag_module_unload,
> +};
> +
> static int __init alloc_tag_init(void)
> {
> - const struct codetag_type_desc desc = {
> - .section = "alloc_tags",
> - .tag_size = sizeof(struct alloc_tag),
> - .module_unload = alloc_tag_module_unload,
> - };
> -
> - alloc_tag_cttype = codetag_register_type(&desc);
> + alloc_tag_cttype = codetag_register_type(&alloc_tag_desc);
> if (IS_ERR(alloc_tag_cttype))
> return PTR_ERR(alloc_tag_cttype);
>
> @@ -278,3 +375,11 @@ static int __init alloc_tag_init(void)
> return 0;
> }
> module_init(alloc_tag_init);
> +
> +#ifdef CONFIG_SLAB_PER_SITE
> +void alloc_tag_early_walk(void)
> +{
> + codetag_early_walk(&alloc_tag_desc, alloc_tag_site_init_early);
> + ondemand_ready = true;
> +}
> +#endif
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 855c63c3270d..4f01cb6dd32e 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -302,7 +302,20 @@ config SLAB_PER_SITE
> default SLAB_FREELIST_HARDENED
> select SLAB_BUCKETS
> help
> - Track sizes of kmalloc() call sites.
> + As a defense against shared-cache "type confusion" use-after-free
> + attacks, every kmalloc()-family call allocates from a separate
> + kmem_cache (or when dynamically sized, kmem_buckets). Attackers
> + will no longer be able to groom malicious objects via similarly
> + sized allocations that share the same cache as the target object.
> +
> + This increases the "at rest" kmalloc slab memory usage by
> + roughly 5x (around 7MiB), and adds the potential for greater
> + long-term memory fragmentation. However, some workloads
> + actually see performance improvements when single allocation
> + sites are hot.
I hope you provide the performance and overhead data in the cover
letter when you post v1.
> +
> + For a similar defense, see CONFIG_RANDOM_KMALLOC_CACHES, which
> + has less memory usage overhead, but is probabilistic.
>
> config SLUB_STATS
> default n
> @@ -331,6 +344,7 @@ config SLUB_CPU_PARTIAL
> config RANDOM_KMALLOC_CACHES
> default n
> depends on !SLUB_TINY
> + depends on !SLAB_PER_SITE
> bool "Randomize slab caches for normal kmalloc"
> help
> A hardening feature that creates multiple copies of slab caches for
> @@ -345,6 +359,9 @@ config RANDOM_KMALLOC_CACHES
> limited degree of memory and CPU overhead that relates to hardware and
> system workload.
>
> + For a similar defense, see CONFIG_SLAB_PER_SITE, which is
> + deterministic, but has greater memory usage overhead.
> +
> endmenu # Slab allocator options
>
> config SHUFFLE_PAGE_ALLOCATOR
> diff --git a/mm/slab_common.c b/mm/slab_common.c
> index fc698cba0ebe..09506bfa972c 100644
> --- a/mm/slab_common.c
> +++ b/mm/slab_common.c
> @@ -1040,6 +1040,7 @@ void __init create_kmalloc_caches(void)
> kmem_buckets_cache = kmem_cache_create("kmalloc_buckets",
> sizeof(kmem_buckets),
> 0, SLAB_NO_MERGE, NULL);
> + alloc_tag_early_walk();
> }
>
> /**
> diff --git a/mm/slub.c b/mm/slub.c
> index 3520acaf9afa..d14102c4b4d7 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4135,6 +4135,35 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
> }
> EXPORT_SYMBOL(__kmalloc_large_node_noprof);
>
> +static __always_inline
> +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> + unsigned long caller)
> +{
> +#ifdef CONFIG_SLAB_PER_SITE
> + struct alloc_tag *tag = current->alloc_tag;
> +
> + if (!b && tag && tag->meta.sized &&
> + kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
> + (flags & GFP_ATOMIC) != GFP_ATOMIC) {
What if the allocation is GFP_ATOMIC but a previous allocation from
the same location (same tag) happened without GFP_ATOMIC and
tag->meta.cache was allocated? Why not use that existing cache?
Same if the tag->meta.cache was pre-allocated.
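One way to allow that (untested sketch) would be to drop GFP_ATOMIC
from the outer condition and only gate the on-demand *creation* on it:

	if (!b && tag && tag->meta.sized &&
	    kmalloc_type(flags, caller) == KMALLOC_NORMAL) {
		void *p = READ_ONCE(tag->meta.cache);

		/* Creating a cache can sleep; skip that for atomic
		 * allocations, but still use an existing cache. */
		if (!p && slab_state >= UP &&
		    (flags & GFP_ATOMIC) != GFP_ATOMIC) {
			alloc_tag_site_init(&tag->ct, true);
			p = READ_ONCE(tag->meta.cache);
		}

		if (tag->meta.sized < SIZE_MAX) {
			if (p)
				return p;
			/* Otherwise continue with default buckets. */
		} else {
			b = p;
		}
	}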
> + void *p = READ_ONCE(tag->meta.cache);
> +
> + if (!p && slab_state >= UP) {
> + alloc_tag_site_init(&tag->ct, true);
> + p = READ_ONCE(tag->meta.cache);
> + }
> +
> + if (tag->meta.sized < SIZE_MAX) {
> + if (p)
> + return p;
> + /* Otherwise continue with default buckets. */
> + } else {
> + b = p;
> + }
> + }
> +#endif
> + return kmalloc_slab(size, b, flags, caller);
> +}
> +
> static __always_inline
> void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
> unsigned long caller)
> @@ -4152,7 +4181,7 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
> if (unlikely(!size))
> return ZERO_SIZE_PTR;
>
> - s = kmalloc_slab(size, b, flags, caller);
> + s = choose_slab(size, b, flags, caller);
>
> ret = slab_alloc_node(s, NULL, flags, node, caller, size);
> ret = kasan_kmalloc(s, ret, size, flags);
> --
> 2.34.1
>
On Thu, Aug 29, 2024 at 10:03:56AM -0700, Suren Baghdasaryan wrote:
> On Fri, Aug 9, 2024 at 12:33 AM Kees Cook <kees@kernel.org> wrote:
> >
> > Use separate per-call-site kmem_cache or kmem_buckets. These are
> > allocated on demand to avoid wasting memory for unused caches.
> >
> > A few caches need to be allocated very early to support allocating the
> > caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> > GFP_ATOMIC allocations are currently left to be allocated from
> > KMALLOC_NORMAL.
> >
> > With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
> >
> > Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> > CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> > text that compares the features.
> >
> > Improvements needed:
> > - Retain call site gfp flags in alloc_tag meta field to:
> > - pre-allocate all GFP_ATOMIC caches (since their caches cannot
> > be allocated on demand unless we want them to be GFP_ATOMIC
> > themselves...)
>
> I'm currently working on a feature to identify allocations with
> __GFP_ACCOUNT known at compile time (similar to how you handle the
> size in the previous patch). Might be something you can reuse/extend.
Great, yes! I'd love to check it out.
> > - Separate MEMCG allocations as well
>
> Do you mean allocations with __GFP_ACCOUNT or something else?
I do, yes.
> > +static void alloc_tag_site_init_early(struct codetag *ct)
> > +{
> > + /* Explicitly initialize the caches needed to initialize caches. */
> > + if (strcmp(ct->function, "kstrdup") == 0 ||
> > + strcmp(ct->function, "kvasprintf") == 0 ||
> > + strcmp(ct->function, "pcpu_mem_zalloc") == 0)
>
> I hope we can find a better way to distinguish these allocations.
> Maybe have a specialized hook for them, like alloc_hooks_early() which
> sets a bit inside ct->flags to distinguish them?
That might be possible. I'll see how that ends up looking. I don't want
to even further fragment the alloc_hooks_... variants.
>
> > + alloc_tag_site_init(ct, false);
> > +
> > + /* TODO: pre-allocate GFP_ATOMIC caches here. */
>
> You could pre-allocate GFP_ATOMIC caches during
> alloc_tag_module_load() only if gfp_flags are known at compile time I
> think. I guess for the dynamic case choose_slab() will fall back to
> kmalloc_slab()?
Right, yes. I'd do it like the size checking: if we know at compile
time, we can depend on it, otherwise it's a run-time fallback.
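Something along these lines, mirroring how ALLOC_META_INIT() captures
the size (the .gfp field and the ALLOC_META_INIT_GFP() name are made
up here):

	struct alloc_meta {
		/* 0 means non-slab, SIZE_MAX means dynamic. */
		size_t sized;
		/* 0 means gfp flags unknown at compile time. */
		gfp_t gfp;
		void *cache;
	};

	#define ALLOC_META_INIT_GFP(_size, _gfp) { \
		.sized = (__builtin_constant_p(_size) ? (_size) : SIZE_MAX), \
		.gfp = (__builtin_constant_p(_gfp) ? (_gfp) : 0), \
	}

Then alloc_tag_module_load() could pre-allocate caches for any site
whose known gfp flags include GFP_ATOMIC.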
>
> > @@ -175,8 +258,21 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
> >
> > if (WARN(counter.bytes,
> > "%s:%u module %s func:%s has %llu allocated at module unload",
> > - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) {
> > module_unused = false;
> > + }
> > +#ifdef CONFIG_SLAB_PER_SITE
> > + else if (tag->meta.sized) {
> > + /* Remove the allocated caches, if possible. */
> > + void *p = READ_ONCE(tag->meta.cache);
> > +
> > + WRITE_ONCE(tag->meta.cache, NULL);
>
> I'm guessing you are not using try_cmpxchg() the same way you did in
> alloc_tag_site_init() because a race with any other user is impossible
> at module unload time? If so, a comment mentioning that would be
> good.
Correct. It should not be possible. But yes, I will add a comment.
> > diff --git a/mm/Kconfig b/mm/Kconfig
> > index 855c63c3270d..4f01cb6dd32e 100644
> > --- a/mm/Kconfig
> > +++ b/mm/Kconfig
> > @@ -302,7 +302,20 @@ config SLAB_PER_SITE
> > default SLAB_FREELIST_HARDENED
> > select SLAB_BUCKETS
> > help
> > - Track sizes of kmalloc() call sites.
> > + As a defense against shared-cache "type confusion" use-after-free
> > + attacks, every kmalloc()-family call allocates from a separate
> > + kmem_cache (or when dynamically sized, kmem_buckets). Attackers
> > + will no longer be able to groom malicious objects via similarly
> > + sized allocations that share the same cache as the target object.
> > +
> > + This increases the "at rest" kmalloc slab memory usage by
> > + roughly 5x (around 7MiB), and adds the potential for greater
> > + long-term memory fragmentation. However, some workloads
> > + actually see performance improvements when single allocation
> > + sites are hot.
>
> I hope you provide the performance and overhead data in the cover
> letter when you post v1.
That's my plan. It's always odd choosing workloads, but we do seem to
have a few 'regular' benchmarks (hackbench, kernel builds, etc). Is
there anything in particular you'd want to see?
> > +static __always_inline
> > +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> > + unsigned long caller)
> > +{
> > +#ifdef CONFIG_SLAB_PER_SITE
> > + struct alloc_tag *tag = current->alloc_tag;
> > +
> > + if (!b && tag && tag->meta.sized &&
> > + kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
> > + (flags & GFP_ATOMIC) != GFP_ATOMIC) {
>
> > What if the allocation is GFP_ATOMIC but a previous allocation from
> > the same location (same tag) happened without GFP_ATOMIC and
> > tag->meta.cache was allocated? Why not use that existing cache?
> Same if the tag->meta.cache was pre-allocated.
Maybe I was being too conservative in my understanding -- I thought that
I couldn't use those caches on the chance that they may already be full?
Or is that always the risk, and GFP_ATOMIC deals with that? If it would
be considered safe to attempt the allocation from the existing cache, then
yeah, I can adjust this check.
Thanks for looking these over!
-Kees
--
Kees Cook
On Wed, Sep 11, 2024 at 3:30 PM Kees Cook <kees@kernel.org> wrote:
>
> On Thu, Aug 29, 2024 at 10:03:56AM -0700, Suren Baghdasaryan wrote:
> > On Fri, Aug 9, 2024 at 12:33 AM Kees Cook <kees@kernel.org> wrote:
> > >
> > > Use separate per-call-site kmem_cache or kmem_buckets. These are
> > > allocated on demand to avoid wasting memory for unused caches.
> > >
> > > A few caches need to be allocated very early to support allocating the
> > > caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> > > GFP_ATOMIC allocations are currently left to be allocated from
> > > KMALLOC_NORMAL.
> > >
> > > With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
> > >
> > > Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> > > CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> > > text that compares the features.
> > >
> > > Improvements needed:
> > > - Retain call site gfp flags in alloc_tag meta field to:
> > > - pre-allocate all GFP_ATOMIC caches (since their caches cannot
> > > be allocated on demand unless we want them to be GFP_ATOMIC
> > > themselves...)
> >
> > I'm currently working on a feature to identify allocations with
> > __GFP_ACCOUNT known at compile time (similar to how you handle the
> > size in the previous patch). Might be something you can reuse/extend.
>
> Great, yes! I'd love to check it out.
>
> > > - Separate MEMCG allocations as well
> >
> > Do you mean allocations with __GFP_ACCOUNT or something else?
>
> I do, yes.
>
> > > +static void alloc_tag_site_init_early(struct codetag *ct)
> > > +{
> > > + /* Explicitly initialize the caches needed to initialize caches. */
> > > + if (strcmp(ct->function, "kstrdup") == 0 ||
> > > + strcmp(ct->function, "kvasprintf") == 0 ||
> > > + strcmp(ct->function, "pcpu_mem_zalloc") == 0)
> >
> > I hope we can find a better way to distinguish these allocations.
> > Maybe have a specialized hook for them, like alloc_hooks_early() which
> > sets a bit inside ct->flags to distinguish them?
>
> That might be possible. I'll see how that ends up looking. I don't want
> to even further fragment the alloc_hooks_... variants.
>
> >
> > > + alloc_tag_site_init(ct, false);
> > > +
> > > + /* TODO: pre-allocate GFP_ATOMIC caches here. */
> >
> > You could pre-allocate GFP_ATOMIC caches during
> > alloc_tag_module_load() only if gfp_flags are known at compile time I
> > think. I guess for the dynamic case choose_slab() will fall back to
> > kmalloc_slab()?
>
> Right, yes. I'd do it like the size checking: if we know at compile
> time, we can depend on it, otherwise it's a run-time fallback.
>
> >
> > > @@ -175,8 +258,21 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype,
> > >
> > > if (WARN(counter.bytes,
> > > "%s:%u module %s func:%s has %llu allocated at module unload",
> > > - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes))
> > > + ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) {
> > > module_unused = false;
> > > + }
> > > +#ifdef CONFIG_SLAB_PER_SITE
> > > + else if (tag->meta.sized) {
> > > + /* Remove the allocated caches, if possible. */
> > > + void *p = READ_ONCE(tag->meta.cache);
> > > +
> > > + WRITE_ONCE(tag->meta.cache, NULL);
> >
> > I'm guessing you are not using try_cmpxchg() the same way you did in
> > alloc_tag_site_init() because a race with any other user is impossible
> > at module unload time? If so, a comment mentioning that would be
> > good.
>
> Correct. It should not be possible. But yes, I will add a comment.
>
> > > diff --git a/mm/Kconfig b/mm/Kconfig
> > > index 855c63c3270d..4f01cb6dd32e 100644
> > > --- a/mm/Kconfig
> > > +++ b/mm/Kconfig
> > > @@ -302,7 +302,20 @@ config SLAB_PER_SITE
> > > default SLAB_FREELIST_HARDENED
> > > select SLAB_BUCKETS
> > > help
> > > - Track sizes of kmalloc() call sites.
> > > + As a defense against shared-cache "type confusion" use-after-free
> > > + attacks, every kmalloc()-family call allocates from a separate
> > > + kmem_cache (or when dynamically sized, kmem_buckets). Attackers
> > > + will no longer be able to groom malicious objects via similarly
> > > + sized allocations that share the same cache as the target object.
> > > +
> > > + This increases the "at rest" kmalloc slab memory usage by
> > > + roughly 5x (around 7MiB), and adds the potential for greater
> > > + long-term memory fragmentation. However, some workloads
> > > + actually see performance improvements when single allocation
> > > + sites are hot.
> >
> > I hope you provide the performance and overhead data in the cover
> > letter when you post v1.
>
> That's my plan. It's always odd choosing workloads, but we do seem to
> have a few 'regular' benchmarks (hackbench, kernel builds, etc). Is
> there anything in particular you'd want to see?
I have a stress test implemented as a loadable module to benchmark
slab and page allocation times (just a tight loop and timing it). I
can clean it up a bit and share with you.
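The slab half of it boils down to something like this (a minimal
sketch, not the actual module):

	#include <linux/module.h>
	#include <linux/slab.h>
	#include <linux/ktime.h>

	static int __init slab_bench_init(void)
	{
		const int iters = 1000000;
		ktime_t start = ktime_get();
		int i;

		for (i = 0; i < iters; i++) {
			/* Fixed-size allocation in a tight loop. */
			void *p = kmalloc(64, GFP_KERNEL);

			if (!p)
				return -ENOMEM;
			kfree(p);
		}
		pr_info("%d kmalloc/kfree pairs in %lld ns\n", iters,
			ktime_to_ns(ktime_sub(ktime_get(), start)));
		return 0;
	}
	module_init(slab_bench_init);
	MODULE_LICENSE("GPL");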
>
> > > +static __always_inline
> > > +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> > > + unsigned long caller)
> > > +{
> > > +#ifdef CONFIG_SLAB_PER_SITE
> > > + struct alloc_tag *tag = current->alloc_tag;
> > > +
> > > + if (!b && tag && tag->meta.sized &&
> > > + kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
> > > + (flags & GFP_ATOMIC) != GFP_ATOMIC) {
> >
> > What if the allocation is GFP_ATOMIC but a previous allocation from
> > the same location (same tag) happened without GFP_ATOMIC and
> > tag->meta.cache was allocated? Why not use that existing cache?
> > Same if the tag->meta.cache was pre-allocated.
>
> Maybe I was being too conservative in my understanding -- I thought that
> I couldn't use those caches on the chance that they may already be full?
> Or is that always the risk, and GFP_ATOMIC deals with that? If it would
> be considered safe to attempt the allocation from the existing cache, then
> yeah, I can adjust this check.
Well, you fall back to kmalloc_slab() which also might be full. So,
how would using an existing cache be different?
>
> Thanks for looking these over!
>
> -Kees
>
> --
> Kees Cook
Hi Kees,
On 2024/8/9 15:33, Kees Cook wrote:
> Use separate per-call-site kmem_cache or kmem_buckets. These are
> allocated on demand to avoid wasting memory for unused caches.
>
> A few caches need to be allocated very early to support allocating the
> caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> GFP_ATOMIC allocations are currently left to be allocated from
> KMALLOC_NORMAL.
>
> With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
>
> Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> text that compares the features.
>
> Improvements needed:
> - Retain call site gfp flags in alloc_tag meta field to:
> - pre-allocate all GFP_ATOMIC caches (since their caches cannot
> be allocated on demand unless we want them to be GFP_ATOMIC
> themselves...)
> - Separate MEMCG allocations as well
> - Allocate individual caches within kmem_buckets on demand to
> further reduce memory usage overhead.
>
> Signed-off-by: Kees Cook <kees@kernel.org>
> ---
> Cc: Suren Baghdasaryan <surenb@google.com>
> Cc: Kent Overstreet <kent.overstreet@linux.dev>
> Cc: Vlastimil Babka <vbabka@suse.cz>
> Cc: Christoph Lameter <cl@linux.com>
> Cc: Pekka Enberg <penberg@kernel.org>
> Cc: David Rientjes <rientjes@google.com>
> Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> Cc: Andrew Morton <akpm@linux-foundation.org>
> Cc: Roman Gushchin <roman.gushchin@linux.dev>
> Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> Cc: linux-mm@kvack.org
> ---
> include/linux/alloc_tag.h | 8 +++
> lib/alloc_tag.c | 121 +++++++++++++++++++++++++++++++++++---
> mm/Kconfig | 19 +++++-
> mm/slab_common.c | 1 +
> mm/slub.c | 31 +++++++++-
> 5 files changed, 170 insertions(+), 10 deletions(-)
>
[...]
> diff --git a/mm/slub.c b/mm/slub.c
> index 3520acaf9afa..d14102c4b4d7 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4135,6 +4135,35 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
> }
> EXPORT_SYMBOL(__kmalloc_large_node_noprof);
>
> +static __always_inline
> +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> + unsigned long caller)
> +{
> +#ifdef CONFIG_SLAB_PER_SITE
> + struct alloc_tag *tag = current->alloc_tag;
There is a compile error here if CONFIG_MEM_ALLOC_PROFILING is disabled;
I hit it when testing this patchset.
mm/slub.c: In function ‘choose_slab’:
mm/slub.c:4187:40: error: ‘struct task_struct’ has no member named
‘alloc_tag’
4187 | struct alloc_tag *tag = current->alloc_tag;
| ^~
CC mm/page_reporting.o
Maybe CONFIG_SLAB_PER_SITE should depend on CONFIG_MEM_ALLOC_PROFILING?
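i.e. something like this (sketch; the prompt string is a guess, the
default/select lines are from your patch):

	config SLAB_PER_SITE
		bool "Separate kmalloc caches per call site"
		depends on MEM_ALLOC_PROFILING
		default SLAB_FREELIST_HARDENED
		select SLAB_BUCKETS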
> +
> + if (!b && tag && tag->meta.sized &&
> + kmalloc_type(flags, caller) == KMALLOC_NORMAL &&
> + (flags & GFP_ATOMIC) != GFP_ATOMIC) {
> + void *p = READ_ONCE(tag->meta.cache);
> +
> + if (!p && slab_state >= UP) {
> + alloc_tag_site_init(&tag->ct, true);
> + p = READ_ONCE(tag->meta.cache);
> + }
> +
> + if (tag->meta.sized < SIZE_MAX) {
> + if (p)
> + return p;
> + /* Otherwise continue with default buckets. */
> + } else {
> + b = p;
> + }
> + }
> +#endif
> + return kmalloc_slab(size, b, flags, caller);
> +}
> +
> static __always_inline
> void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
> unsigned long caller)
> @@ -4152,7 +4181,7 @@ void *__do_kmalloc_node(size_t size, kmem_buckets *b, gfp_t flags, int node,
> if (unlikely(!size))
> return ZERO_SIZE_PTR;
>
> - s = kmalloc_slab(size, b, flags, caller);
> + s = choose_slab(size, b, flags, caller);
>
> ret = slab_alloc_node(s, NULL, flags, node, caller, size);
> ret = kasan_kmalloc(s, ret, size, flags);
On Sat, Aug 17, 2024 at 09:30:58AM +0800, Xiu Jianfeng wrote:
> Hi Kees,
>
> On 2024/8/9 15:33, Kees Cook wrote:
> > Use separate per-call-site kmem_cache or kmem_buckets. These are
> > allocated on demand to avoid wasting memory for unused caches.
> >
> > A few caches need to be allocated very early to support allocating the
> > caches themselves: kstrdup(), kvasprintf(), and pcpu_mem_zalloc(). Any
> > GFP_ATOMIC allocations are currently left to be allocated from
> > KMALLOC_NORMAL.
> >
> > With a distro config, /proc/slabinfo grows from ~400 entries to ~2200.
> >
> > Since this feature (CONFIG_SLAB_PER_SITE) is redundant to
> > CONFIG_RANDOM_KMALLOC_CACHES, mark it as incompatible. Add Kconfig help
> > text that compares the features.
> >
> > Improvements needed:
> > - Retain call site gfp flags in alloc_tag meta field to:
> > - pre-allocate all GFP_ATOMIC caches (since their caches cannot
> > be allocated on demand unless we want them to be GFP_ATOMIC
> > themselves...)
> > - Separate MEMCG allocations as well
> > - Allocate individual caches within kmem_buckets on demand to
> > further reduce memory usage overhead.
> >
> > Signed-off-by: Kees Cook <kees@kernel.org>
> > ---
> > Cc: Suren Baghdasaryan <surenb@google.com>
> > Cc: Kent Overstreet <kent.overstreet@linux.dev>
> > Cc: Vlastimil Babka <vbabka@suse.cz>
> > Cc: Christoph Lameter <cl@linux.com>
> > Cc: Pekka Enberg <penberg@kernel.org>
> > Cc: David Rientjes <rientjes@google.com>
> > Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
> > Cc: Andrew Morton <akpm@linux-foundation.org>
> > Cc: Roman Gushchin <roman.gushchin@linux.dev>
> > Cc: Hyeonggon Yoo <42.hyeyoo@gmail.com>
> > Cc: linux-mm@kvack.org
> > ---
> > include/linux/alloc_tag.h | 8 +++
> > lib/alloc_tag.c | 121 +++++++++++++++++++++++++++++++++++---
> > mm/Kconfig | 19 +++++-
> > mm/slab_common.c | 1 +
> > mm/slub.c | 31 +++++++++-
> > 5 files changed, 170 insertions(+), 10 deletions(-)
> >
>
> [...]
>
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 3520acaf9afa..d14102c4b4d7 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -4135,6 +4135,35 @@ void *__kmalloc_large_node_noprof(size_t size, gfp_t flags, int node)
> > }
> > EXPORT_SYMBOL(__kmalloc_large_node_noprof);
> >
> > +static __always_inline
> > +struct kmem_cache *choose_slab(size_t size, kmem_buckets *b, gfp_t flags,
> > + unsigned long caller)
> > +{
> > +#ifdef CONFIG_SLAB_PER_SITE
> > + struct alloc_tag *tag = current->alloc_tag;
>
> There is a compile error here if CONFIG_MEM_ALLOC_PROFILING is disabled;
> I hit it when testing this patchset.
>
> mm/slub.c: In function ‘choose_slab’:
> mm/slub.c:4187:40: error: ‘struct task_struct’ has no member named
> ‘alloc_tag’
> 4187 | struct alloc_tag *tag = current->alloc_tag;
> | ^~
> CC mm/page_reporting.o
>
> Maybe CONFIG_SLAB_PER_SITE should depend on CONFIG_MEM_ALLOC_PROFILING?
Thanks! I tried to make the Kconfig use the right dependencies, but I
clearly missed something. There is also some weird behavior between
"depends" and "select". I will get this fixed for the next version.
-Kees
--
Kees Cook