The leftover space in a slab is always smaller than s->size, and
kmem caches for large objects that are not power-of-two sizes tend to have
a greater amount of leftover space per slab. In some cases, the leftover
space is larger than the size of the slabobj_ext array for the slab.
An excellent example of such a cache is ext4_inode_cache. On my system,
the object size is 1144, with a preferred order of 3, 28 objects per slab,
and 736 bytes of leftover space per slab.
Since the size of the slabobj_ext array is only 224 bytes (w/o mem
profiling) or 448 bytes (w/ mem profiling) per slab, the entire array
fits within the leftover space.
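For reference, the numbers add up as follows (assuming 4 KiB pages, so an
order-3 slab is 32768 bytes, and 8 bytes per slabobj_ext entry, or 16 with
mem profiling):

  used by objects: 28 * 1144            = 32032 bytes
  leftover:        32768 - 32032        =   736 bytes
  slabobj_ext:     28 * 8  (w/o prof.)  =   224 bytes
                   28 * 16 (w/  prof.)  =   448 bytes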
When the leftover space is large enough, allocate the slabobj_ext array
from it instead of using kcalloc(). The array is always allocated when a
new slab is created, because implementing lazy allocation correctly is
difficult without expensive synchronization.
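With the array placed in the leftover space, the slab layout looks roughly
like this (red_left_pad is zero unless red zoning is enabled):

  +--------------+----------+-- ... --+------------+---------------------+-----+
  | red_left_pad | object 0 |         | object N-1 | slabobj_ext[0..N-1] | pad |
  +--------------+----------+-- ... --+------------+---------------------+-----+
                                                   ^
                     ALIGN(red_left_pad + N * s->size, sizeof(struct slabobj_ext))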
To avoid unnecessary overhead when neither MEMCG (with SLAB_ACCOUNT) nor
MEM_ALLOC_PROFILING is in use for the cache, allocate the slabobj_ext
array only when at least one of them is enabled at the time the slab is
created.
[ MEMCG=y, MEM_ALLOC_PROFILING=n ]
Before patch (creating 2M directories on ext4):
Slab: 3575348 kB
SReclaimable: 3137804 kB
SUnreclaim: 437544 kB
After patch (creating 2M directories on ext4):
Slab: 3558236 kB
SReclaimable: 3139268 kB
SUnreclaim: 418968 kB (-18.14 MiB)
Enjoy the memory savings!
Signed-off-by: Harry Yoo <harry.yoo@oracle.com>
---
mm/slub.c | 147 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 142 insertions(+), 5 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index 13acc9437ef5..8101df5fdccf 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -884,6 +884,94 @@ static inline unsigned int get_orig_size(struct kmem_cache *s, void *object)
return *(unsigned int *)p;
}
+#ifdef CONFIG_SLAB_OBJ_EXT
+
+/*
+ * Check if memory cgroup or memory allocation profiling is enabled.
+ * If enabled, SLUB tries to reduce memory overhead of accounting
+ * slab objects. If neither is enabled when this function is called,
+ * the optimization is simply skipped to avoid affecting caches that do not
+ * need slabobj_ext metadata.
+ *
+ * However, this may disable optimization when memory cgroup or memory
+ * allocation profiling is used, but slabs are created too early
+ * even before those subsystems are initialized.
+ */
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+ if (!mem_cgroup_disabled() && (s->flags & SLAB_ACCOUNT))
+ return true;
+
+ if (mem_alloc_profiling_enabled())
+ return true;
+
+ return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+ return sizeof(struct slabobj_ext) * slab->objects;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+ struct slab *slab)
+{
+ unsigned long objext_offset;
+
+ objext_offset = s->red_left_pad + s->size * slab->objects;
+ objext_offset = ALIGN(objext_offset, sizeof(struct slabobj_ext));
+ return objext_offset;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+ struct slab *slab)
+{
+ unsigned long objext_offset = obj_exts_offset_in_slab(s, slab);
+ unsigned long objext_size = obj_exts_size_in_slab(slab);
+
+ return objext_offset + objext_size <= slab_size(slab);
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+ unsigned long obj_exts;
+
+ if (!obj_exts_fit_within_slab_leftover(s, slab))
+ return false;
+
+ obj_exts = (unsigned long)slab_address(slab);
+ obj_exts += obj_exts_offset_in_slab(s, slab);
+ return obj_exts == slab_obj_exts(slab);
+}
+#else
+static inline bool need_slab_obj_exts(struct kmem_cache *s)
+{
+ return false;
+}
+
+static inline unsigned int obj_exts_size_in_slab(struct slab *slab)
+{
+ return 0;
+}
+
+static inline unsigned long obj_exts_offset_in_slab(struct kmem_cache *s,
+ struct slab *slab)
+{
+ return 0;
+}
+
+static inline bool obj_exts_fit_within_slab_leftover(struct kmem_cache *s,
+ struct slab *slab)
+{
+ return false;
+}
+
+static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
+{
+ return false;
+}
+#endif
+
#ifdef CONFIG_SLUB_DEBUG
/*
@@ -1404,7 +1492,15 @@ slab_pad_check(struct kmem_cache *s, struct slab *slab)
start = slab_address(slab);
length = slab_size(slab);
end = start + length;
- remainder = length % s->size;
+
+ if (obj_exts_in_slab(s, slab)) {
+ remainder = length;
+ remainder -= obj_exts_offset_in_slab(s, slab);
+ remainder -= obj_exts_size_in_slab(slab);
+ } else {
+ remainder = length % s->size;
+ }
+
if (!remainder)
return;
@@ -2154,6 +2250,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
if (!obj_exts)
return;
+ if (obj_exts_in_slab(slab->slab_cache, slab)) {
+ slab->obj_exts = 0;
+ return;
+ }
+
/*
* obj_exts was created with __GFP_NO_OBJ_EXT flag, therefore its
* corresponding extension will be NULL. alloc_tag_sub() will throw a
@@ -2169,6 +2270,31 @@ static inline void free_slab_obj_exts(struct slab *slab)
slab->obj_exts = 0;
}
+/*
+ * Try to allocate slabobj_ext array from unused space.
+ * This function must be called on a freshly allocated slab to prevent
+ * concurrency problems.
+ */
+static void alloc_slab_obj_exts_early(struct kmem_cache *s, struct slab *slab)
+{
+ void *addr;
+
+ if (!need_slab_obj_exts(s))
+ return;
+
+ metadata_access_enable();
+ if (obj_exts_fit_within_slab_leftover(s, slab)) {
+ addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab);
+ addr = kasan_reset_tag(addr);
+ memset(addr, 0, obj_exts_size_in_slab(slab));
+ slab->obj_exts = (unsigned long)addr;
+ if (IS_ENABLED(CONFIG_MEMCG))
+ slab->obj_exts |= MEMCG_DATA_OBJEXTS;
+ slab_set_stride(slab, sizeof(struct slabobj_ext));
+ }
+ metadata_access_disable();
+}
+
#else /* CONFIG_SLAB_OBJ_EXT */
static inline void init_slab_obj_exts(struct slab *slab)
@@ -2185,6 +2311,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
{
}
+static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
+ struct slab *slab)
+{
+}
+
#endif /* CONFIG_SLAB_OBJ_EXT */
#ifdef CONFIG_MEM_ALLOC_PROFILING
@@ -3155,7 +3286,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
- if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
+ if (memcg_kmem_online() &&
+ (s->flags & SLAB_ACCOUNT) &&
+ !slab_obj_exts(slab))
alloc_slab_obj_exts(slab, s, gfp, true);
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
@@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
slab->objects = oo_objects(oo);
slab->inuse = 0;
slab->frozen = 0;
- init_slab_obj_exts(slab);
-
- account_slab(slab, oo_order(oo), s, flags);
slab->slab_cache = s;
@@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
start = slab_address(slab);
setup_slab_debug(s, slab, start);
+ init_slab_obj_exts(slab);
+ /*
+ * Poison the slab before initializing the slabobj_ext array
+ * to prevent the array from being overwritten.
+ */
+ alloc_slab_obj_exts_early(s, slab);
+ account_slab(slab, oo_order(oo), s, flags);
shuffle = shuffle_freelist(s, slab);
--
2.43.0
On 10/27/25 1:28 PM, Harry Yoo wrote:
>
> +#ifdef CONFIG_SLAB_OBJ_EXT
> +
> +/*
> + * Check if memory cgroup or memory allocation profiling is enabled.
> + * If enabled, SLUB tries to reduce memory overhead of accounting
> + * slab objects. If neither is enabled when this function is called,
> + * the optimization is simply skipped to avoid affecting caches that do not
> + * need slabobj_ext metadata.
> + *
> + * However, this may disable optimization when memory cgroup or memory
> + * allocation profiling is used, but slabs are created too early
> + * even before those subsystems are initialized.
> + */
> +static inline bool need_slab_obj_exts(struct kmem_cache *s)
> +{
> + if (!mem_cgroup_disabled() && (s->flags & SLAB_ACCOUNT))
Shouldn't this be !memcg_kmem_online() check?
In case of disabled kmem accounting via 'cgroup.memory=nokmem'
> + return true;
> +
> + if (mem_alloc_profiling_enabled())
> + return true;
> +
> + return false;
> +}
> +
On Wed, Oct 29, 2025 at 07:45:32PM +0100, Andrey Ryabinin wrote:
>
>
> On 10/27/25 1:28 PM, Harry Yoo wrote:
>
> >
> > +#ifdef CONFIG_SLAB_OBJ_EXT
> > +
> > +/*
> > + * Check if memory cgroup or memory allocation profiling is enabled.
> > + * If enabled, SLUB tries to reduce memory overhead of accounting
> > + * slab objects. If neither is enabled when this function is called,
> > + * the optimization is simply skipped to avoid affecting caches that do not
> > + * need slabobj_ext metadata.
> > + *
> > + * However, this may disable optimization when memory cgroup or memory
> > + * allocation profiling is used, but slabs are created too early
> > + * even before those subsystems are initialized.
> > + */
> > +static inline bool need_slab_obj_exts(struct kmem_cache *s)
> > +{
> > + if (!mem_cgroup_disabled() && (s->flags & SLAB_ACCOUNT))
>
> Shouldn't this be !memcg_kmem_online() check?
> In case of disabled kmem accounting via 'cgroup.memory=nokmem'
Good catch. Will fix, thanks!
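IOW the check would become something like:

	if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
		return true;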
> > + return true;
> > +
> > + if (mem_alloc_profiling_enabled())
> > + return true;
> > +
> > + return false;
> > +}
> > +
--
Cheers,
Harry / Hyeonggon
On Mon, Oct 27, 2025 at 5:29 AM Harry Yoo <harry.yoo@oracle.com> wrote:
>
> [...]
> +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
> +{
> + unsigned long obj_exts;
> +
> + if (!obj_exts_fit_within_slab_leftover(s, slab))
> + return false;
> +
> + obj_exts = (unsigned long)slab_address(slab);
> + obj_exts += obj_exts_offset_in_slab(s, slab);
> + return obj_exts == slab_obj_exts(slab);
You can check that slab_obj_exts(slab) is not NULL before making the
above calculations.
> +}
> [...]
> @@ -3155,7 +3286,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
> static __always_inline void account_slab(struct slab *slab, int order,
> struct kmem_cache *s, gfp_t gfp)
> {
> - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
> + if (memcg_kmem_online() &&
> + (s->flags & SLAB_ACCOUNT) &&
> + !slab_obj_exts(slab))
> alloc_slab_obj_exts(slab, s, gfp, true);
Don't you need to add a check for !obj_exts_in_slab() inside
alloc_slab_obj_exts() to avoid allocating slab->obj_exts?
>
> mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
> @@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > slab->objects = oo_objects(oo);
> slab->inuse = 0;
> slab->frozen = 0;
> - init_slab_obj_exts(slab);
> -
> - account_slab(slab, oo_order(oo), s, flags);
>
> slab->slab_cache = s;
>
> @@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> start = slab_address(slab);
>
> setup_slab_debug(s, slab, start);
> + init_slab_obj_exts(slab);
> + /*
> + * Poison the slab before initializing the slabobj_ext array
> + * to prevent the array from being overwritten.
> + */
> + alloc_slab_obj_exts_early(s, slab);
> + account_slab(slab, oo_order(oo), s, flags);
alloc_slab_obj_exts() is called in 2 other places:
1. __memcg_slab_post_alloc_hook()
2. prepare_slab_obj_exts_hook()
Don't you need alloc_slab_obj_exts_early() there as well?
>
> shuffle = shuffle_freelist(s, slab);
>
> --
> 2.43.0
>
On Tue, Oct 28, 2025 at 08:07:42PM -0700, Suren Baghdasaryan wrote:
> On Mon, Oct 27, 2025 at 5:29 AM Harry Yoo <harry.yoo@oracle.com> wrote:
> >
> > [...]
> >
> > diff --git a/mm/slub.c b/mm/slub.c
> > index 13acc9437ef5..8101df5fdccf 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
> > +{
> > + unsigned long obj_exts;
> > +
> > + if (!obj_exts_fit_within_slab_leftover(s, slab))
> > + return false;
> > +
> > + obj_exts = (unsigned long)slab_address(slab);
> > + obj_exts += obj_exts_offset_in_slab(s, slab);
> > + return obj_exts == slab_obj_exts(slab);
>
> You can check that slab_obj_exts(slab) is not NULL before making the
> above calculations.
Did you mean this?
if (!slab_obj_exts(slab))
return false;
If so, yes that makes sense.
> > @@ -2185,6 +2311,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
> > {
> > }
> >
> > +static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
> > + struct slab *slab)
> > +{
> > +}
> > +
> > #endif /* CONFIG_SLAB_OBJ_EXT */
> >
> > #ifdef CONFIG_MEM_ALLOC_PROFILING
> > @@ -3155,7 +3286,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
> > static __always_inline void account_slab(struct slab *slab, int order,
> > struct kmem_cache *s, gfp_t gfp)
> > {
> > - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
> > + if (memcg_kmem_online() &&
> > + (s->flags & SLAB_ACCOUNT) &&
> > + !slab_obj_exts(slab))
> > alloc_slab_obj_exts(slab, s, gfp, true);
>
> Don't you need to add a check for !obj_exts_in_slab() inside
> alloc_slab_obj_exts() to avoid allocating slab->obj_exts?
slab_obj_exts() should have returned a nonzero value
and then we don't call alloc_slab_obj_exts()?
> > mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
> > @@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > slab->objects = oo_objects(oo);
> > slab->inuse = 0;
> > slab->frozen = 0;
> > - init_slab_obj_exts(slab);
> > -
> > - account_slab(slab, oo_order(oo), s, flags);
> >
> > slab->slab_cache = s;
> >
> > @@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > start = slab_address(slab);
> >
> > setup_slab_debug(s, slab, start);
> > + init_slab_obj_exts(slab);
> > + /*
> > + * Poison the slab before initializing the slabobj_ext array
> > + * to prevent the array from being overwritten.
> > + */
> > + alloc_slab_obj_exts_early(s, slab);
> > + account_slab(slab, oo_order(oo), s, flags);
>
> alloc_slab_obj_exts() is called in 2 other places:
> 1. __memcg_slab_post_alloc_hook()
> 2. prepare_slab_obj_exts_hook()
>
> Don't you need alloc_slab_obj_exts_early() there as well?
That's good point, and I thought it's difficult to address
concurrency problem without using a per-slab lock.
Thread A                                 Thread B
- sees slab->obj_exts == 0
                                         - sees slab->obj_exts == 0
- allocates the vector from unused space
  and initializes it.
- try cmpxchg()
                                         - allocates the vector
                                           from unused space and
                                           initializes it.
                                           (the vector is already
                                           in use and it's overwritten!)

                                         - try cmpxchg()
But since this is slowpath, using slab_{lock,unlock}() here is probably
fine. What do you think?
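Something like this completely untested sketch, reusing the helpers from
the patch (exact slab_lock() interface aside):

/*
 * Untested sketch: place the vector in the leftover space after the slab
 * has already been allocated, serialized by the slab lock so that two
 * racing callers cannot both initialize the same area.
 */
static bool alloc_slab_obj_exts_late(struct kmem_cache *s, struct slab *slab)
{
	void *addr;
	bool placed = false;

	if (!obj_exts_fit_within_slab_leftover(s, slab))
		return false;

	slab_lock(slab);
	if (!slab_obj_exts(slab)) {
		metadata_access_enable();
		addr = slab_address(slab) + obj_exts_offset_in_slab(s, slab);
		addr = kasan_reset_tag(addr);
		memset(addr, 0, obj_exts_size_in_slab(slab));
		/* may still want cmpxchg() here to publish against the kcalloc() path */
		slab->obj_exts = (unsigned long)addr;
		if (IS_ENABLED(CONFIG_MEMCG))
			slab->obj_exts |= MEMCG_DATA_OBJEXTS;
		slab_set_stride(slab, sizeof(struct slabobj_ext));
		metadata_access_disable();
		placed = true;
	}
	slab_unlock(slab);
	return placed;
}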
--
Cheers,
Harry / Hyeonggon
On Wed, Oct 29, 2025 at 1:00 AM Harry Yoo <harry.yoo@oracle.com> wrote:
>
> On Tue, Oct 28, 2025 at 08:07:42PM -0700, Suren Baghdasaryan wrote:
> > On Mon, Oct 27, 2025 at 5:29 AM Harry Yoo <harry.yoo@oracle.com> wrote:
> > >
> > > [...]
> > >
> > > diff --git a/mm/slub.c b/mm/slub.c
> > > index 13acc9437ef5..8101df5fdccf 100644
> > > --- a/mm/slub.c
> > > +++ b/mm/slub.c
> > > +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
> > > +{
> > > + unsigned long obj_exts;
> > > +
> > > + if (!obj_exts_fit_within_slab_leftover(s, slab))
> > > + return false;
> > > +
> > > + obj_exts = (unsigned long)slab_address(slab);
> > > + obj_exts += obj_exts_offset_in_slab(s, slab);
> > > + return obj_exts == slab_obj_exts(slab);
> >
> > You can check that slab_obj_exts(slab) is not NULL before making the
> > above calculations.
>
> Did you mean this?
>
> if (!slab_obj_exts(slab))
> return false;
Yes but you can store the returned value to reuse later in the last
"return obj_exts == slab_obj_exts(slab);" expression.
>
> If so, yes that makes sense.
>
> > > @@ -2185,6 +2311,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
> > > {
> > > }
> > >
> > > +static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
> > > + struct slab *slab)
> > > +{
> > > +}
> > > +
> > > #endif /* CONFIG_SLAB_OBJ_EXT */
> > >
> > > #ifdef CONFIG_MEM_ALLOC_PROFILING
> > > @@ -3155,7 +3286,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
> > > static __always_inline void account_slab(struct slab *slab, int order,
> > > struct kmem_cache *s, gfp_t gfp)
> > > {
> > > - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
> > > + if (memcg_kmem_online() &&
> > > + (s->flags & SLAB_ACCOUNT) &&
> > > + !slab_obj_exts(slab))
> > > alloc_slab_obj_exts(slab, s, gfp, true);
> >
> > Don't you need to add a check for !obj_exts_in_slab() inside
> > alloc_slab_obj_exts() to avoid allocating slab->obj_exts?
>
> slab_obj_exts() should have returned a nonzero value
> and then we don't call alloc_slab_obj_exts()?
Sorry, I mean that you would need to check
obj_exts_fit_within_slab_leftover() inside alloc_slab_obj_exts() to
avoid allocating the vector when obj_exts can fit inside the slab
itself. This is because alloc_slab_obj_exts() can be called from other
places as well. However, from your next comment, I realize that your
intention might have been to keep those other callers intact and
allocate the vector separately even if the obj_exts could have been
squeezed inside the slab. Is that correct?
>
> > > mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
> > > @@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > > slab->objects = oo_objects(oo);
> > > slab->inuse = 0;
> > > slab->frozen = 0;
> > > - init_slab_obj_exts(slab);
> > > -
> > > - account_slab(slab, oo_order(oo), s, flags);
> > >
> > > slab->slab_cache = s;
> > >
> > > @@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > > start = slab_address(slab);
> > >
> > > setup_slab_debug(s, slab, start);
> > > + init_slab_obj_exts(slab);
> > > + /*
> > > + * Poison the slab before initializing the slabobj_ext array
> > > + * to prevent the array from being overwritten.
> > > + */
> > > + alloc_slab_obj_exts_early(s, slab);
> > > + account_slab(slab, oo_order(oo), s, flags);
> >
> > alloc_slab_obj_exts() is called in 2 other places:
> > 1. __memcg_slab_post_alloc_hook()
> > 2. prepare_slab_obj_exts_hook()
> >
> > Don't you need alloc_slab_obj_exts_early() there as well?
>
> That's good point, and I thought it's difficult to address
> concurrency problem without using a per-slab lock.
>
> Thread A                                 Thread B
> - sees slab->obj_exts == 0
>                                          - sees slab->obj_exts == 0
> - allocates the vector from unused space
>   and initializes it.
> - try cmpxchg()
>                                          - allocates the vector
>                                            from unused space and
>                                            initializes it.
>                                            (the vector is already
>                                            in use and it's overwritten!)
>
>                                          - try cmpxchg()
>
> But since this is slowpath, using slab_{lock,unlock}() here is probably
> fine. What do you think?
Ok, was your original intent to leave these callers as is and allocate
the vector like we do today even if obj_exts fit inside the slab?
>
> --
> Cheers,
> Harry / Hyeonggon
On Wed, Oct 29, 2025 at 11:37:27AM -0700, Suren Baghdasaryan wrote:
> On Wed, Oct 29, 2025 at 1:00 AM Harry Yoo <harry.yoo@oracle.com> wrote:
> >
> > On Tue, Oct 28, 2025 at 08:07:42PM -0700, Suren Baghdasaryan wrote:
> > > On Mon, Oct 27, 2025 at 5:29 AM Harry Yoo <harry.yoo@oracle.com> wrote:
> > > >
> > > > [...]
> > > >
> > > > diff --git a/mm/slub.c b/mm/slub.c
> > > > index 13acc9437ef5..8101df5fdccf 100644
> > > > --- a/mm/slub.c
> > > > +++ b/mm/slub.c
> > > > +static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
> > > > +{
> > > > + unsigned long obj_exts;
> > > > +
> > > > + if (!obj_exts_fit_within_slab_leftover(s, slab))
> > > > + return false;
> > > > +
> > > > + obj_exts = (unsigned long)slab_address(slab);
> > > > + obj_exts += obj_exts_offset_in_slab(s, slab);
> > > > + return obj_exts == slab_obj_exts(slab);
> > >
> > > You can check that slab_obj_exts(slab) is not NULL before making the
> > > above calculations.
> >
> > Did you mean this?
> >
> > if (!slab_obj_exts(slab))
> > return false;
>
> Yes but you can store the returned value to reuse later in the last
> "return obj_exts == slab_obj_exts(slab);" expression.
Okay, will do.
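i.e. something like (untested):

static inline bool obj_exts_in_slab(struct kmem_cache *s, struct slab *slab)
{
	unsigned long obj_exts = slab_obj_exts(slab);

	if (!obj_exts)
		return false;

	if (!obj_exts_fit_within_slab_leftover(s, slab))
		return false;

	return obj_exts == (unsigned long)slab_address(slab) +
			   obj_exts_offset_in_slab(s, slab);
}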
> > If so, yes that makes sense.
> >
> > > > @@ -2185,6 +2311,11 @@ static inline void free_slab_obj_exts(struct slab *slab)
> > > > {
> > > > }
> > > >
> > > > +static inline void alloc_slab_obj_exts_early(struct kmem_cache *s,
> > > > + struct slab *slab)
> > > > +{
> > > > +}
> > > > +
> > > > #endif /* CONFIG_SLAB_OBJ_EXT */
> > > >
> > > > #ifdef CONFIG_MEM_ALLOC_PROFILING
> > > > @@ -3155,7 +3286,9 @@ static inline bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
> > > > static __always_inline void account_slab(struct slab *slab, int order,
> > > > struct kmem_cache *s, gfp_t gfp)
> > > > {
> > > > - if (memcg_kmem_online() && (s->flags & SLAB_ACCOUNT))
> > > > + if (memcg_kmem_online() &&
> > > > + (s->flags & SLAB_ACCOUNT) &&
> > > > + !slab_obj_exts(slab))
> > > > alloc_slab_obj_exts(slab, s, gfp, true);
> > >
> > > Don't you need to add a check for !obj_exts_in_slab() inside
> > > alloc_slab_obj_exts() to avoid allocating slab->obj_exts?
> >
> > slab_obj_exts() should have returned a nonzero value
> > and then we don't call alloc_slab_obj_exts()?
>
> Sorry, I mean that you would need to check
> obj_exts_fit_within_slab_leftover() inside alloc_slab_obj_exts() to
> avoid allocating the vector when obj_exts can fit inside the slab
> itself. This is because alloc_slab_obj_exts() can be called from other
> places as well. However, from your next comment, I realize that your
> intention might have been to keep those other callers intact and
> allocate the vector separately even if the obj_exts could have been
> squeezed inside the slab. Is that correct?
Yes, that's correct!
> > > > mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
> > > > @@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > > > slab->objects = oo_objects(oo);
> > > > slab->inuse = 0;
> > > > slab->frozen = 0;
> > > > - init_slab_obj_exts(slab);
> > > > -
> > > > - account_slab(slab, oo_order(oo), s, flags);
> > > >
> > > > slab->slab_cache = s;
> > > >
> > > > @@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
> > > > start = slab_address(slab);
> > > >
> > > > setup_slab_debug(s, slab, start);
> > > > + init_slab_obj_exts(slab);
> > > > + /*
> > > > + * Poison the slab before initializing the slabobj_ext array
> > > > + * to prevent the array from being overwritten.
> > > > + */
> > > > + alloc_slab_obj_exts_early(s, slab);
> > > > + account_slab(slab, oo_order(oo), s, flags);
> > >
> > > alloc_slab_obj_exts() is called in 2 other places:
> > > 1. __memcg_slab_post_alloc_hook()
> > > 2. prepare_slab_obj_exts_hook()
> > >
> > > Don't you need alloc_slab_obj_exts_early() there as well?
> >
> > That's good point, and I thought it's difficult to address
> > concurrency problem without using a per-slab lock.
> >
> > Thread A                                 Thread B
> > - sees slab->obj_exts == 0
> >                                          - sees slab->obj_exts == 0
> > - allocates the vector from unused space
> >   and initializes it.
> > - try cmpxchg()
> >                                          - allocates the vector
> >                                            from unused space and
> >                                            initializes it.
> >                                            (the vector is already
> >                                            in use and it's overwritten!)
> >
> >                                          - try cmpxchg()
> >
> > But since this is slowpath, using slab_{lock,unlock}() here is probably
> > fine. What do you think?
>
> Ok, was your original intent to leave these callers as is and allocate
> the vector like we do today even if obj_exts fit inside the slab?
Yes that's what I intended, and maybe later we could allocate the vector
from the unused space even after the slab is allocated, as long as
it doesn't hurt performance.
> >
> > --
> > Cheers,
> > Harry / Hyeonggon
--
Cheers,
Harry / Hyeonggon
On 10/30/25 01:40, Harry Yoo wrote:
> On Wed, Oct 29, 2025 at 11:37:27AM -0700, Suren Baghdasaryan wrote:
>> > > > mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
>> > > > @@ -3219,9 +3352,6 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> > > > slab->objects = oo_objects(oo);
>> > > > slab->inuse = 0;
>> > > > slab->frozen = 0;
>> > > > - init_slab_obj_exts(slab);
>> > > > -
>> > > > - account_slab(slab, oo_order(oo), s, flags);
>> > > >
>> > > > slab->slab_cache = s;
>> > > >
>> > > > @@ -3230,6 +3360,13 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
>> > > > start = slab_address(slab);
>> > > >
>> > > > setup_slab_debug(s, slab, start);
>> > > > + init_slab_obj_exts(slab);
>> > > > + /*
>> > > > + * Poison the slab before initializing the slabobj_ext array
>> > > > + * to prevent the array from being overwritten.
>> > > > + */
>> > > > + alloc_slab_obj_exts_early(s, slab);
>> > > > + account_slab(slab, oo_order(oo), s, flags);
>> > >
>> > > alloc_slab_obj_exts() is called in 2 other places:
>> > > 1. __memcg_slab_post_alloc_hook()
>> > > 2. prepare_slab_obj_exts_hook()
>> > >
>> > > Don't you need alloc_slab_obj_exts_early() there as well?
>> >
>> > That's good point, and I thought it's difficult to address
>> > concurrency problem without using a per-slab lock.
>> >
>> > Thread A                                 Thread B
>> > - sees slab->obj_exts == 0
>> >                                          - sees slab->obj_exts == 0
>> > - allocates the vector from unused space
>> >   and initializes it.
>> > - try cmpxchg()
>> >                                          - allocates the vector
>> >                                            from unused space and
>> >                                            initializes it.
>> >                                            (the vector is already
>> >                                            in use and it's overwritten!)
>> >
>> >                                          - try cmpxchg()
>> >
>> > But since this is slowpath, using slab_{lock,unlock}() here is probably
>> > fine. What do you think?
>>
>> Ok, was your original intent to leave these callers as is and allocate
>> the vector like we do today even if obj_exts fit inside the slab?
>
> Yes that's what I intended, and maybe later we could allocate the vector
> from the unused space even after the slab is allocated, as long as
> it doesn't hurt performance.
It would be nice. I guess what can happen is there's a cache without
SLAB_ACCOUNT but then some allocations from that will use __GFP_ACCOUNT and
we need to allocate obj_exts on-demand, right?
>> >
>> > --
>> > Cheers,
>> > Harry / Hyeonggon
>