We now rely on sheaves as the percpu caching layer and can refill them
directly from partial or newly allocated slabs. Start removing the cpu
(partial) slabs code, first from allocation paths.
This means that any allocation not satisfied from percpu sheaves will
end up in ___slab_alloc(), where we remove the usage of cpu (partial)
slabs, so it will only perform get_partial() or new_slab().
In get_partial_node() we used to return a slab for freezing as the cpu
slab and to refill the partial slab. Now we only want to return a single
object and leave the slab on the list (unless it became full). We can't
simply reuse alloc_single_from_partial() as that assumes freeing uses
free_to_partial_list(). Instead we need to use __slab_update_freelist()
to work properly against a racing __slab_free().
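In essence, the new per-slab step is the following cmpxchg-style retry
(simplified from the hunk below; the debug/SLUB_TINY path is left out):

        do {
                object = slab->freelist;
                counters = slab->counters;
                new.freelist = get_freepointer(s, object);
                new.counters = counters;
                new.inuse++;
        } while (!__slab_update_freelist(s, slab,
                                         object, counters,
                                         new.freelist, new.counters,
                                         "get_partial_node"));

        /* taking the last object made the slab full, so unlist it */
        if (!new.freelist)
                remove_partial(n, slab);

Since the freelist and counters are compared and updated together, a
racing __slab_free() that touches either of them simply makes us retry.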
The rest of the changes is removing functions that no longer have any
callers.
Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/slub.c | 614 ++++++++------------------------------------------------------
1 file changed, 71 insertions(+), 543 deletions(-)
diff --git a/mm/slub.c b/mm/slub.c
index e2b052657d11..bd67336e7c1f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -245,7 +245,6 @@ static DEFINE_STATIC_KEY_FALSE(strict_numa);
struct partial_context {
gfp_t flags;
unsigned int orig_size;
- void *object;
unsigned int min_objects;
unsigned int max_objects;
struct list_head slabs;
@@ -598,36 +597,6 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object)
return freelist_ptr_decode(s, p, ptr_addr);
}
-static void prefetch_freepointer(const struct kmem_cache *s, void *object)
-{
- prefetchw(object + s->offset);
-}
-
-/*
- * When running under KMSAN, get_freepointer_safe() may return an uninitialized
- * pointer value in the case the current thread loses the race for the next
- * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in
- * slab_alloc_node() will fail, so the uninitialized value won't be used, but
- * KMSAN will still check all arguments of cmpxchg because of imperfect
- * handling of inline assembly.
- * To work around this problem, we apply __no_kmsan_checks to ensure that
- * get_freepointer_safe() returns initialized memory.
- */
-__no_kmsan_checks
-static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
-{
- unsigned long freepointer_addr;
- freeptr_t p;
-
- if (!debug_pagealloc_enabled_static())
- return get_freepointer(s, object);
-
- object = kasan_reset_tag(object);
- freepointer_addr = (unsigned long)object + s->offset;
- copy_from_kernel_nofault(&p, (freeptr_t *)freepointer_addr, sizeof(p));
- return freelist_ptr_decode(s, p, freepointer_addr);
-}
-
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
unsigned long freeptr_addr = (unsigned long)object + s->offset;
@@ -707,23 +676,11 @@ static void slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
nr_slabs = DIV_ROUND_UP(nr_objects * 2, oo_objects(s->oo));
s->cpu_partial_slabs = nr_slabs;
}
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
- return s->cpu_partial_slabs;
-}
-#else
-#ifdef SLAB_SUPPORTS_SYSFS
+#elif defined(SLAB_SUPPORTS_SYSFS)
static inline void
slub_set_cpu_partial(struct kmem_cache *s, unsigned int nr_objects)
{
}
-#endif
-
-static inline unsigned int slub_get_cpu_partial(struct kmem_cache *s)
-{
- return 0;
-}
#endif /* CONFIG_SLUB_CPU_PARTIAL */
/*
@@ -1075,7 +1032,7 @@ static void set_track_update(struct kmem_cache *s, void *object,
p->handle = handle;
#endif
p->addr = addr;
- p->cpu = smp_processor_id();
+ p->cpu = raw_smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
}
@@ -3575,15 +3532,15 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
}
/*
- * Try to allocate a partial slab from a specific node.
+ * Try to allocate object from a partial slab on a specific node.
*/
-static struct slab *get_partial_node(struct kmem_cache *s,
- struct kmem_cache_node *n,
- struct partial_context *pc)
+static void *get_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n,
+ struct partial_context *pc)
{
- struct slab *slab, *slab2, *partial = NULL;
+ struct slab *slab, *slab2;
unsigned long flags;
- unsigned int partial_slabs = 0;
+ void *object;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -3599,54 +3556,54 @@ static struct slab *get_partial_node(struct kmem_cache *s,
else if (!spin_trylock_irqsave(&n->list_lock, flags))
return NULL;
list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
+
+ unsigned long counters;
+ struct slab new;
+
if (!pfmemalloc_match(slab, pc->flags))
continue;
if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- void *object = alloc_single_from_partial(s, n, slab,
+ object = alloc_single_from_partial(s, n, slab,
pc->orig_size);
- if (object) {
- partial = slab;
- pc->object = object;
+ if (object)
break;
- }
continue;
}
- remove_partial(n, slab);
-
- if (!partial) {
- partial = slab;
- stat(s, ALLOC_FROM_PARTIAL);
-
- if ((slub_get_cpu_partial(s) == 0)) {
- break;
- }
- } else {
- put_cpu_partial(s, slab, 0);
- stat(s, CPU_PARTIAL_NODE);
-
- if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
- break;
- }
- }
+ /*
+ * get a single object from the slab. This might race against
+ * __slab_free(), which however has to take the list_lock if
+ * it's about to make the slab fully free.
+ */
+ do {
+ object = slab->freelist;
+ counters = slab->counters;
+ new.freelist = get_freepointer(s, object);
+ new.counters = counters;
+ new.inuse++;
+ } while (!__slab_update_freelist(s, slab,
+ object, counters,
+ new.freelist, new.counters,
+ "get_partial_node"));
+
+ if (!new.freelist)
+ remove_partial(n, slab);
}
spin_unlock_irqrestore(&n->list_lock, flags);
- return partial;
+ return object;
}
/*
- * Get a slab from somewhere. Search in increasing NUMA distances.
+ * Get an object from somewhere. Search in increasing NUMA distances.
*/
-static struct slab *get_any_partial(struct kmem_cache *s,
- struct partial_context *pc)
+static void *get_any_partial(struct kmem_cache *s, struct partial_context *pc)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
enum zone_type highest_zoneidx = gfp_zone(pc->flags);
- struct slab *slab;
unsigned int cpuset_mems_cookie;
/*
@@ -3681,8 +3638,8 @@ static struct slab *get_any_partial(struct kmem_cache *s,
if (n && cpuset_zone_allowed(zone, pc->flags) &&
n->nr_partial > s->min_partial) {
- slab = get_partial_node(s, n, pc);
- if (slab) {
+ void *object = get_partial_node(s, n, pc);
+ if (object) {
/*
* Don't check read_mems_allowed_retry()
* here - if mems_allowed was updated in
@@ -3690,7 +3647,7 @@ static struct slab *get_any_partial(struct kmem_cache *s,
* between allocation and the cpuset
* update
*/
- return slab;
+ return object;
}
}
}
@@ -3700,20 +3657,20 @@ static struct slab *get_any_partial(struct kmem_cache *s,
}
/*
- * Get a partial slab, lock it and return it.
+ * Get an object from a partial slab
*/
-static struct slab *get_partial(struct kmem_cache *s, int node,
- struct partial_context *pc)
+static void *get_partial(struct kmem_cache *s, int node,
+ struct partial_context *pc)
{
- struct slab *slab;
int searchnode = node;
+ void *object;
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- slab = get_partial_node(s, get_node(s, searchnode), pc);
- if (slab || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
- return slab;
+ object = get_partial_node(s, get_node(s, searchnode), pc);
+ if (object || (node != NUMA_NO_NODE && (pc->flags & __GFP_THISNODE)))
+ return object;
return get_any_partial(s, pc);
}
@@ -4272,19 +4229,6 @@ static int slub_cpu_dead(unsigned int cpu)
return 0;
}
-/*
- * Check if the objects in a per cpu structure fit numa
- * locality expectations.
- */
-static inline int node_match(struct slab *slab, int node)
-{
-#ifdef CONFIG_NUMA
- if (node != NUMA_NO_NODE && slab_nid(slab) != node)
- return 0;
-#endif
- return 1;
-}
-
#ifdef CONFIG_SLUB_DEBUG
static int count_free(struct slab *slab)
{
@@ -4469,39 +4413,6 @@ __update_cpu_freelist_fast(struct kmem_cache *s,
&old.full, new.full);
}
-/*
- * Check the slab->freelist and either transfer the freelist to the
- * per cpu freelist or deactivate the slab.
- *
- * The slab is still frozen if the return value is not NULL.
- *
- * If this function returns NULL then the slab has been unfrozen.
- */
-static inline void *get_freelist(struct kmem_cache *s, struct slab *slab)
-{
- struct slab new;
- unsigned long counters;
- void *freelist;
-
- lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
-
- do {
- freelist = slab->freelist;
- counters = slab->counters;
-
- new.counters = counters;
-
- new.inuse = slab->objects;
- new.frozen = freelist != NULL;
-
- } while (!__slab_update_freelist(s, slab,
- freelist, counters,
- NULL, new.counters,
- "get_freelist"));
-
- return freelist;
-}
-
/*
* Get the slab's freelist and do not freeze it.
*
@@ -4535,197 +4446,23 @@ static inline void *get_freelist_nofreeze(struct kmem_cache *s, struct slab *sla
}
/*
- * Freeze the partial slab and return the pointer to the freelist.
- */
-static inline void *freeze_slab(struct kmem_cache *s, struct slab *slab)
-{
- struct slab new;
- unsigned long counters;
- void *freelist;
-
- do {
- freelist = slab->freelist;
- counters = slab->counters;
-
- new.counters = counters;
- VM_BUG_ON(new.frozen);
-
- new.inuse = slab->objects;
- new.frozen = 1;
-
- } while (!slab_update_freelist(s, slab,
- freelist, counters,
- NULL, new.counters,
- "freeze_slab"));
-
- return freelist;
-}
-
-/*
- * Slow path. The lockless freelist is empty or we need to perform
- * debugging duties.
- *
- * Processing is still very fast if new objects have been freed to the
- * regular freelist. In that case we simply take over the regular freelist
- * as the lockless freelist and zap the regular freelist.
+ * Slow path. We failed to allocate via percpu sheaves or they are not available
+ * due to bootstrap or debugging enabled or SLUB_TINY.
*
- * If that is not working then we fall back to the partial lists. We take the
- * first element of the freelist as the object to allocate now and move the
- * rest of the freelist to the lockless freelist.
- *
- * And if we were unable to get a new slab from the partial slab lists then
- * we need to allocate a new slab. This is the slowest path since it involves
- * a call to the page allocator and the setup of a new slab.
- *
- * Version of __slab_alloc to use when we know that preemption is
- * already disabled (which is the case for bulk allocation).
+ * We try to allocate from partial slab lists and fall back to allocating a new
+ * slab.
*/
static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
+ unsigned long addr, unsigned int orig_size)
{
bool allow_spin = gfpflags_allow_spinning(gfpflags);
void *freelist;
struct slab *slab;
- unsigned long flags;
struct partial_context pc;
bool try_thisnode = true;
stat(s, ALLOC_SLOWPATH);
-reread_slab:
-
- slab = READ_ONCE(c->slab);
- if (!slab) {
- /*
- * if the node is not online or has no normal memory, just
- * ignore the node constraint
- */
- if (unlikely(node != NUMA_NO_NODE &&
- !node_isset(node, slab_nodes)))
- node = NUMA_NO_NODE;
- goto new_slab;
- }
-
- if (unlikely(!node_match(slab, node))) {
- /*
- * same as above but node_match() being false already
- * implies node != NUMA_NO_NODE.
- *
- * We don't strictly honor pfmemalloc and NUMA preferences
- * when !allow_spin because:
- *
- * 1. Most kmalloc() users allocate objects on the local node,
- * so kmalloc_nolock() tries not to interfere with them by
- * deactivating the cpu slab.
- *
- * 2. Deactivating due to NUMA or pfmemalloc mismatch may cause
- * unnecessary slab allocations even when n->partial list
- * is not empty.
- */
- if (!node_isset(node, slab_nodes) ||
- !allow_spin) {
- node = NUMA_NO_NODE;
- } else {
- stat(s, ALLOC_NODE_MISMATCH);
- goto deactivate_slab;
- }
- }
-
- /*
- * By rights, we should be searching for a slab page that was
- * PFMEMALLOC but right now, we are losing the pfmemalloc
- * information when the page leaves the per-cpu allocator
- */
- if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin))
- goto deactivate_slab;
-
- /* must check again c->slab in case we got preempted and it changed */
- local_lock_cpu_slab(s, flags);
-
- if (unlikely(slab != c->slab)) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- freelist = c->freelist;
- if (freelist)
- goto load_freelist;
-
- freelist = get_freelist(s, slab);
-
- if (!freelist) {
- c->slab = NULL;
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- stat(s, DEACTIVATE_BYPASS);
- goto new_slab;
- }
-
- stat(s, ALLOC_REFILL);
-
-load_freelist:
-
- lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
-
- /*
- * freelist is pointing to the list of objects to be used.
- * slab is pointing to the slab from which the objects are obtained.
- * That slab must be frozen for per cpu allocations to work.
- */
- VM_BUG_ON(!c->slab->frozen);
- c->freelist = get_freepointer(s, freelist);
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- return freelist;
-
-deactivate_slab:
-
- local_lock_cpu_slab(s, flags);
- if (slab != c->slab) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- freelist = c->freelist;
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
- local_unlock_cpu_slab(s, flags);
- deactivate_slab(s, slab, freelist);
-
-new_slab:
-
-#ifdef CONFIG_SLUB_CPU_PARTIAL
- while (slub_percpu_partial(c)) {
- local_lock_cpu_slab(s, flags);
- if (unlikely(c->slab)) {
- local_unlock_cpu_slab(s, flags);
- goto reread_slab;
- }
- if (unlikely(!slub_percpu_partial(c))) {
- local_unlock_cpu_slab(s, flags);
- /* we were preempted and partial list got empty */
- goto new_objects;
- }
-
- slab = slub_percpu_partial(c);
- slub_set_percpu_partial(c, slab);
-
- if (likely(node_match(slab, node) &&
- pfmemalloc_match(slab, gfpflags)) ||
- !allow_spin) {
- c->slab = slab;
- freelist = get_freelist(s, slab);
- VM_BUG_ON(!freelist);
- stat(s, CPU_PARTIAL_ALLOC);
- goto load_freelist;
- }
-
- local_unlock_cpu_slab(s, flags);
-
- slab->next = NULL;
- __put_partials(s, slab);
- }
-#endif
-
new_objects:
pc.flags = gfpflags;
@@ -4750,33 +4487,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
}
pc.orig_size = orig_size;
- slab = get_partial(s, node, &pc);
- if (slab) {
- if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- freelist = pc.object;
- /*
- * For debug caches here we had to go through
- * alloc_single_from_partial() so just store the
- * tracking info and return the object.
- *
- * Due to disabled preemption we need to disallow
- * blocking. The flags are further adjusted by
- * gfp_nested_mask() in stack_depot itself.
- */
- if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr,
- gfpflags & ~(__GFP_DIRECT_RECLAIM));
+ freelist = get_partial(s, node, &pc);
+ if (freelist) {
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+ set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
- return freelist;
- }
-
- freelist = freeze_slab(s, slab);
- goto retry_load_slab;
+ return freelist;
}
- slub_put_cpu_ptr(s->cpu_slab);
slab = new_slab(s, pc.flags, node);
- c = slub_get_cpu_ptr(s->cpu_slab);
if (unlikely(!slab)) {
if (node != NUMA_NO_NODE && !(gfpflags & __GFP_THISNODE)
@@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_SLAB);
- if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
- freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
-
- if (unlikely(!freelist))
- goto new_objects;
-
- if (s->flags & SLAB_STORE_USER)
- set_track(s, freelist, TRACK_ALLOC, addr,
- gfpflags & ~(__GFP_DIRECT_RECLAIM));
-
- return freelist;
- }
-
- /*
- * No other reference to the slab yet so we can
- * muck around with it freely without cmpxchg
- */
- freelist = slab->freelist;
- slab->freelist = NULL;
- slab->inuse = slab->objects;
- slab->frozen = 1;
-
- inc_slabs_node(s, slab_nid(slab), slab->objects);
+ freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
- if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
- /*
- * For !pfmemalloc_match() case we don't load freelist so that
- * we don't make further mismatched allocations easier.
- */
- deactivate_slab(s, slab, get_freepointer(s, freelist));
- return freelist;
- }
+ if (unlikely(!freelist))
+ goto new_objects;
-retry_load_slab:
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER))
+ set_track(s, freelist, TRACK_ALLOC, addr, gfpflags);
- local_lock_cpu_slab(s, flags);
- if (unlikely(c->slab)) {
- void *flush_freelist = c->freelist;
- struct slab *flush_slab = c->slab;
-
- c->slab = NULL;
- c->freelist = NULL;
- c->tid = next_tid(c->tid);
-
- local_unlock_cpu_slab(s, flags);
-
- if (unlikely(!allow_spin)) {
- /* Reentrant slub cannot take locks, defer */
- defer_deactivate_slab(flush_slab, flush_freelist);
- } else {
- deactivate_slab(s, flush_slab, flush_freelist);
- }
-
- stat(s, CPUSLAB_FLUSH);
-
- goto retry_load_slab;
- }
- c->slab = slab;
-
- goto load_freelist;
+ return freelist;
}
/*
* We disallow kprobes in ___slab_alloc() to prevent reentrance
@@ -4865,87 +4533,11 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*/
NOKPROBE_SYMBOL(___slab_alloc);
-/*
- * A wrapper for ___slab_alloc() for contexts where preemption is not yet
- * disabled. Compensates for possible cpu changes by refetching the per cpu area
- * pointer.
- */
-static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
- unsigned long addr, struct kmem_cache_cpu *c, unsigned int orig_size)
-{
- void *p;
-
-#ifdef CONFIG_PREEMPT_COUNT
- /*
- * We may have been preempted and rescheduled on a different
- * cpu before disabling preemption. Need to reload cpu area
- * pointer.
- */
- c = slub_get_cpu_ptr(s->cpu_slab);
-#endif
- if (unlikely(!gfpflags_allow_spinning(gfpflags))) {
- if (local_lock_is_locked(&s->cpu_slab->lock)) {
- /*
- * EBUSY is an internal signal to kmalloc_nolock() to
- * retry a different bucket. It's not propagated
- * to the caller.
- */
- p = ERR_PTR(-EBUSY);
- goto out;
- }
- }
- p = ___slab_alloc(s, gfpflags, node, addr, c, orig_size);
-out:
-#ifdef CONFIG_PREEMPT_COUNT
- slub_put_cpu_ptr(s->cpu_slab);
-#endif
- return p;
-}
-
static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
gfp_t gfpflags, int node, unsigned long addr, size_t orig_size)
{
- struct kmem_cache_cpu *c;
- struct slab *slab;
- unsigned long tid;
void *object;
-redo:
- /*
- * Must read kmem_cache cpu data via this cpu ptr. Preemption is
- * enabled. We may switch back and forth between cpus while
- * reading from one cpu area. That does not matter as long
- * as we end up on the original cpu again when doing the cmpxchg.
- *
- * We must guarantee that tid and kmem_cache_cpu are retrieved on the
- * same cpu. We read first the kmem_cache_cpu pointer and use it to read
- * the tid. If we are preempted and switched to another cpu between the
- * two reads, it's OK as the two are still associated with the same cpu
- * and cmpxchg later will validate the cpu.
- */
- c = raw_cpu_ptr(s->cpu_slab);
- tid = READ_ONCE(c->tid);
-
- /*
- * Irqless object alloc/free algorithm used here depends on sequence
- * of fetching cpu_slab's data. tid should be fetched before anything
- * on c to guarantee that object and slab associated with previous tid
- * won't be used with current tid. If we fetch tid first, object and
- * slab could be one associated with next tid and our alloc/free
- * request will be failed. In this case, we will retry. So, no problem.
- */
- barrier();
-
- /*
- * The transaction ids are globally unique per cpu and per operation on
- * a per cpu queue. Thus they can be guarantee that the cmpxchg_double
- * occurs on the right processor and that there was no operation on the
- * linked list in between.
- */
-
- object = c->freelist;
- slab = c->slab;
-
#ifdef CONFIG_NUMA
if (static_branch_unlikely(&strict_numa) &&
node == NUMA_NO_NODE) {
@@ -4954,47 +4546,20 @@ static __always_inline void *__slab_alloc_node(struct kmem_cache *s,
if (mpol) {
/*
- * Special BIND rule support. If existing slab
+ * Special BIND rule support. If the local node
* is in permitted set then do not redirect
* to a particular node.
* Otherwise we apply the memory policy to get
* the node we need to allocate on.
*/
- if (mpol->mode != MPOL_BIND || !slab ||
- !node_isset(slab_nid(slab), mpol->nodes))
-
+ if (mpol->mode != MPOL_BIND ||
+ !node_isset(numa_mem_id(), mpol->nodes))
node = mempolicy_slab_node();
}
}
#endif
- if (!USE_LOCKLESS_FAST_PATH() ||
- unlikely(!object || !slab || !node_match(slab, node))) {
- object = __slab_alloc(s, gfpflags, node, addr, c, orig_size);
- } else {
- void *next_object = get_freepointer_safe(s, object);
-
- /*
- * The cmpxchg will only match if there was no additional
- * operation and if we are on the right processor.
- *
- * The cmpxchg does the following atomically (without lock
- * semantics!)
- * 1. Relocate first pointer to the current per cpu area.
- * 2. Verify that tid and freelist have not been changed
- * 3. If they were not changed replace tid and freelist
- *
- * Since this is without lock semantics the protection is only
- * against code executing on this cpu *not* from access by
- * other cpus.
- */
- if (unlikely(!__update_cpu_freelist_fast(s, object, next_object, tid))) {
- note_cmpxchg_failure("slab_alloc", s, tid);
- goto redo;
- }
- prefetch_freepointer(s, next_object);
- stat(s, ALLOC_FASTPATH);
- }
+ object = ___slab_alloc(s, gfpflags, node, addr, orig_size);
return object;
}
@@ -7638,62 +7203,25 @@ static inline
int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
- struct kmem_cache_cpu *c;
- unsigned long irqflags;
int i;
/*
- * Drain objects in the per cpu slab, while disabling local
- * IRQs, which protects against PREEMPT and interrupts
- * handlers invoking normal fastpath.
+ * TODO: this might be more efficient (if necessary) by reusing
+ * __refill_objects()
*/
- c = slub_get_cpu_ptr(s->cpu_slab);
- local_lock_irqsave(&s->cpu_slab->lock, irqflags);
-
for (i = 0; i < size; i++) {
- void *object = c->freelist;
- if (unlikely(!object)) {
- /*
- * We may have removed an object from c->freelist using
- * the fastpath in the previous iteration; in that case,
- * c->tid has not been bumped yet.
- * Since ___slab_alloc() may reenable interrupts while
- * allocating memory, we should bump c->tid now.
- */
- c->tid = next_tid(c->tid);
+ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_,
+ s->object_size);
+ if (unlikely(!p[i]))
+ goto error;
- local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
-
- /*
- * Invoking slow path likely have side-effect
- * of re-populating per CPU c->freelist
- */
- p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE,
- _RET_IP_, c, s->object_size);
- if (unlikely(!p[i]))
- goto error;
-
- c = this_cpu_ptr(s->cpu_slab);
- maybe_wipe_obj_freeptr(s, p[i]);
-
- local_lock_irqsave(&s->cpu_slab->lock, irqflags);
-
- continue; /* goto for-loop */
- }
- c->freelist = get_freepointer(s, object);
- p[i] = object;
maybe_wipe_obj_freeptr(s, p[i]);
- stat(s, ALLOC_FASTPATH);
}
- c->tid = next_tid(c->tid);
- local_unlock_irqrestore(&s->cpu_slab->lock, irqflags);
- slub_put_cpu_ptr(s->cpu_slab);
return i;
error:
- slub_put_cpu_ptr(s->cpu_slab);
__kmem_cache_free_bulk(s, i, p);
return 0;
--
2.51.1
On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
> We now rely on sheaves as the percpu caching layer and can refill them
> directly from partial or newly allocated slabs. Start removing the cpu
> (partial) slabs code, first from allocation paths.
>
> This means that any allocation not satisfied from percpu sheaves will
> end up in ___slab_alloc(), where we remove the usage of cpu (partial)
> slabs, so it will only perform get_partial() or new_slab().
>
> In get_partial_node() we used to return a slab for freezing as the cpu
> slab and to refill the partial slab. Now we only want to return a single
> object and leave the slab on the list (unless it became full). We can't
> simply reuse alloc_single_from_partial() as that assumes freeing uses
> free_to_partial_list(). Instead we need to use __slab_update_freelist()
> to work properly against a racing __slab_free().
>
> The rest of the changes is removing functions that no longer have any
> callers.
>
> Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> mm/slub.c | 614 ++++++++------------------------------------------------------
> 1 file changed, 71 insertions(+), 543 deletions(-)
> diff --git a/mm/slub.c b/mm/slub.c
> index e2b052657d11..bd67336e7c1f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>
> stat(s, ALLOC_SLAB);
>
> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> -
> - if (unlikely(!freelist))
> - goto new_objects;
> -
> - if (s->flags & SLAB_STORE_USER)
> - set_track(s, freelist, TRACK_ALLOC, addr,
> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
> -
> - return freelist;
> - }
> -
> - /*
> - * No other reference to the slab yet so we can
> - * muck around with it freely without cmpxchg
> - */
> - freelist = slab->freelist;
> - slab->freelist = NULL;
> - slab->inuse = slab->objects;
> - slab->frozen = 1;
> -
> - inc_slabs_node(s, slab_nid(slab), slab->objects);
> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
>
> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
> - /*
> - * For !pfmemalloc_match() case we don't load freelist so that
> - * we don't make further mismatched allocations easier.
> - */
> - deactivate_slab(s, slab, get_freepointer(s, freelist));
> - return freelist;
> - }
> + if (unlikely(!freelist))
> + goto new_objects;
We may end up in an endless loop in !allow_spin case?
(e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
held in the process context on the same CPU)
Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
free the slab, goto new_objects, and repeat.
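IOW roughly this control flow (illustrative only, not the actual code):

new_objects:
        ...
        slab = new_slab(s, pc.flags, node);          /* succeeds */
        freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
        /* !allow_spin: trylock of n->list_lock fails, the new slab is freed */
        if (unlikely(!freelist))
                goto new_objects;                    /* and round we go again */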
--
Cheers,
Harry / Hyeonggon
On 10/30/25 05:32, Harry Yoo wrote:
> On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
>> diff --git a/mm/slub.c b/mm/slub.c
>> index e2b052657d11..bd67336e7c1f 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>>
>> stat(s, ALLOC_SLAB);
>>
>> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
>> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
>> -
>> - if (unlikely(!freelist))
>> - goto new_objects;
>> -
>> - if (s->flags & SLAB_STORE_USER)
>> - set_track(s, freelist, TRACK_ALLOC, addr,
>> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
>> -
>> - return freelist;
>> - }
>> -
>> - /*
>> - * No other reference to the slab yet so we can
>> - * muck around with it freely without cmpxchg
>> - */
>> - freelist = slab->freelist;
>> - slab->freelist = NULL;
>> - slab->inuse = slab->objects;
>> - slab->frozen = 1;
>> -
>> - inc_slabs_node(s, slab_nid(slab), slab->objects);
>> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
>>
>> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
>> - /*
>> - * For !pfmemalloc_match() case we don't load freelist so that
>> - * we don't make further mismatched allocations easier.
>> - */
>> - deactivate_slab(s, slab, get_freepointer(s, freelist));
>> - return freelist;
>> - }
>> + if (unlikely(!freelist))
>> + goto new_objects;
>
> We may end up in an endless loop in !allow_spin case?
> (e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
> held in the process context on the same CPU)
>
> Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
> free the slab, goto new_objects, and repeat.
Ugh, yeah. However, AFAICS this possibility already exists prior to this
patch, only it's limited to SLUB_TINY/kmem_cache_debug(s). But we should fix
it in 6.18 then.
How? Grab the single object and defer deactivation of the slab minus one
object? Would work except for kmem_cache_debug(s) we open again a race for
inconsistency check failure, and we have to undo the simple slab freeing fix
and handle the accounting issue differently again.
Fail the allocation for the debug case to avoid the consistency check
issues? Would it be acceptable for kmalloc_nolock() users?
On Thu, Oct 30, 2025 at 6:09 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 10/30/25 05:32, Harry Yoo wrote:
> > On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
> >> diff --git a/mm/slub.c b/mm/slub.c
> >> index e2b052657d11..bd67336e7c1f 100644
> >> --- a/mm/slub.c
> >> +++ b/mm/slub.c
> >> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >>
> >> stat(s, ALLOC_SLAB);
> >>
> >> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> >> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >> -
> >> - if (unlikely(!freelist))
> >> - goto new_objects;
> >> -
> >> - if (s->flags & SLAB_STORE_USER)
> >> - set_track(s, freelist, TRACK_ALLOC, addr,
> >> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
> >> -
> >> - return freelist;
> >> - }
> >> -
> >> - /*
> >> - * No other reference to the slab yet so we can
> >> - * muck around with it freely without cmpxchg
> >> - */
> >> - freelist = slab->freelist;
> >> - slab->freelist = NULL;
> >> - slab->inuse = slab->objects;
> >> - slab->frozen = 1;
> >> -
> >> - inc_slabs_node(s, slab_nid(slab), slab->objects);
> >> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >>
> >> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
> >> - /*
> >> - * For !pfmemalloc_match() case we don't load freelist so that
> >> - * we don't make further mismatched allocations easier.
> >> - */
> >> - deactivate_slab(s, slab, get_freepointer(s, freelist));
> >> - return freelist;
> >> - }
> >> + if (unlikely(!freelist))
> >> + goto new_objects;
> >
> > We may end up in an endless loop in !allow_spin case?
> > (e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
> > held in the process context on the same CPU)
> >
> > Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
> > free the slab, goto new_objects, and repeat.
>
> Ugh, yeah. However, AFAICS this possibility already exists prior to this
> patch, only it's limited to SLUB_TINY/kmem_cache_debug(s). But we should fix
> it in 6.18 then.
> How? Grab the single object and defer deactivation of the slab minus one
> object? Would work except for kmem_cache_debug(s) we open again a race for
> inconsistency check failure, and we have to undo the simple slab freeing fix
> and handle the accounting issue differently again.
> Fail the allocation for the debug case to avoid the consistency check
> issues? Would it be acceptable for kmalloc_nolock() users?
You mean something like:
diff --git a/mm/slub.c b/mm/slub.c
index a8fcc7e6f25a..e9a8b75f31d7 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -4658,8 +4658,11 @@ static void *___slab_alloc(struct kmem_cache
*s, gfp_t gfpflags, int node,
if (kmem_cache_debug(s)) {
freelist = alloc_single_from_new_slab(s, slab,
orig_size, gfpflags);
- if (unlikely(!freelist))
+ if (unlikely(!freelist)) {
+ if (!allow_spin)
+ return NULL;
goto new_objects;
+ }
or I misunderstood the issue?
On 10/30/25 16:27, Alexei Starovoitov wrote:
> On Thu, Oct 30, 2025 at 6:09 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 10/30/25 05:32, Harry Yoo wrote:
>> > On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
>> >> diff --git a/mm/slub.c b/mm/slub.c
>> >> index e2b052657d11..bd67336e7c1f 100644
>> >> --- a/mm/slub.c
>> >> +++ b/mm/slub.c
>> >> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
>> >>
>> >> stat(s, ALLOC_SLAB);
>> >>
>> >> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
>> >> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
>> >> -
>> >> - if (unlikely(!freelist))
>> >> - goto new_objects;
>> >> -
>> >> - if (s->flags & SLAB_STORE_USER)
>> >> - set_track(s, freelist, TRACK_ALLOC, addr,
>> >> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
>> >> -
>> >> - return freelist;
>> >> - }
>> >> -
>> >> - /*
>> >> - * No other reference to the slab yet so we can
>> >> - * muck around with it freely without cmpxchg
>> >> - */
>> >> - freelist = slab->freelist;
>> >> - slab->freelist = NULL;
>> >> - slab->inuse = slab->objects;
>> >> - slab->frozen = 1;
>> >> -
>> >> - inc_slabs_node(s, slab_nid(slab), slab->objects);
>> >> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
>> >>
>> >> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
>> >> - /*
>> >> - * For !pfmemalloc_match() case we don't load freelist so that
>> >> - * we don't make further mismatched allocations easier.
>> >> - */
>> >> - deactivate_slab(s, slab, get_freepointer(s, freelist));
>> >> - return freelist;
>> >> - }
>> >> + if (unlikely(!freelist))
>> >> + goto new_objects;
>> >
>> > We may end up in an endless loop in !allow_spin case?
>> > (e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
>> > held in the process context on the same CPU)
>> >
>> > Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
>> > free the slab, goto new_objects, and repeat.
>>
>> Ugh, yeah. However, AFAICS this possibility already exists prior to this
>> patch, only it's limited to SLUB_TINY/kmem_cache_debug(s). But we should fix
>> it in 6.18 then.
>> How? Grab the single object and defer deactivation of the slab minus one
>> object? Would work except for kmem_cache_debug(s) we open again a race for
>> inconsistency check failure, and we have to undo the simple slab freeing fix
>> and handle the accounting issue differently again.
>> Fail the allocation for the debug case to avoid the consistency check
>> issues? Would it be acceptable for kmalloc_nolock() users?
>
> You mean something like:
> diff --git a/mm/slub.c b/mm/slub.c
> index a8fcc7e6f25a..e9a8b75f31d7 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
> @@ -4658,8 +4658,11 @@ static void *___slab_alloc(struct kmem_cache
> *s, gfp_t gfpflags, int node,
> if (kmem_cache_debug(s)) {
> freelist = alloc_single_from_new_slab(s, slab,
> orig_size, gfpflags);
>
> - if (unlikely(!freelist))
> + if (unlikely(!freelist)) {
> + if (!allow_spin)
> + return NULL;
> goto new_objects;
> + }
>
> or I misunderstood the issue?
Yeah that would be the easiest solution, if you can accept the occasional
allocation failures.
On Thu, Oct 30, 2025 at 04:35:52PM +0100, Vlastimil Babka wrote:
> On 10/30/25 16:27, Alexei Starovoitov wrote:
> > On Thu, Oct 30, 2025 at 6:09 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> On 10/30/25 05:32, Harry Yoo wrote:
> >> > On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
> >> >> diff --git a/mm/slub.c b/mm/slub.c
> >> >> index e2b052657d11..bd67336e7c1f 100644
> >> >> --- a/mm/slub.c
> >> >> +++ b/mm/slub.c
> >> >> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >> >>
> >> >> stat(s, ALLOC_SLAB);
> >> >>
> >> >> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> >> >> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >> >> -
> >> >> - if (unlikely(!freelist))
> >> >> - goto new_objects;
> >> >> -
> >> >> - if (s->flags & SLAB_STORE_USER)
> >> >> - set_track(s, freelist, TRACK_ALLOC, addr,
> >> >> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
> >> >> -
> >> >> - return freelist;
> >> >> - }
> >> >> -
> >> >> - /*
> >> >> - * No other reference to the slab yet so we can
> >> >> - * muck around with it freely without cmpxchg
> >> >> - */
> >> >> - freelist = slab->freelist;
> >> >> - slab->freelist = NULL;
> >> >> - slab->inuse = slab->objects;
> >> >> - slab->frozen = 1;
> >> >> -
> >> >> - inc_slabs_node(s, slab_nid(slab), slab->objects);
> >> >> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >> >>
> >> >> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
> >> >> - /*
> >> >> - * For !pfmemalloc_match() case we don't load freelist so that
> >> >> - * we don't make further mismatched allocations easier.
> >> >> - */
> >> >> - deactivate_slab(s, slab, get_freepointer(s, freelist));
> >> >> - return freelist;
> >> >> - }
> >> >> + if (unlikely(!freelist))
> >> >> + goto new_objects;
> >> >
> >> > We may end up in an endless loop in !allow_spin case?
> >> > (e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
> >> > held in the process context on the same CPU)
> >> >
> >> > Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
> >> > free the slab, goto new_objects, and repeat.
> >>
> >> Ugh, yeah. However, AFAICS this possibility already exists prior to this
> >> patch, only it's limited to SLUB_TINY/kmem_cache_debug(s). But we should fix
> >> it in 6.18 then.
Oops, right ;)
> >> How? Grab the single object and defer deactivation of the slab minus one
> >> object? Would work except for kmem_cache_debug(s) we open again a race for
> >> inconsistency check failure, and we have to undo the simple slab freeing fix
> >> and handle the accounting issue differently again.
> >> Fail the allocation for the debug case to avoid the consistency check
> >> issues? Would it be acceptable for kmalloc_nolock() users?
I think this should work (and is simple)!
> > You mean something like:
> > diff --git a/mm/slub.c b/mm/slub.c
> > index a8fcc7e6f25a..e9a8b75f31d7 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -4658,8 +4658,11 @@ static void *___slab_alloc(struct kmem_cache
> > *s, gfp_t gfpflags, int node,
> > if (kmem_cache_debug(s)) {
> > freelist = alloc_single_from_new_slab(s, slab,
> > orig_size, gfpflags);
> >
> > - if (unlikely(!freelist))
> > + if (unlikely(!freelist)) {
> > + if (!allow_spin)
> > + return NULL;
> > goto new_objects;
> > + }
> >
> > or I misunderstood the issue?
>
> Yeah that would be the easiest solution, if you can accept the occasional
> allocation failures.
Looks good to me.
--
Cheers,
Harry / Hyeonggon
On Thu, Oct 30, 2025 at 8:35 AM Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 10/30/25 16:27, Alexei Starovoitov wrote:
> > On Thu, Oct 30, 2025 at 6:09 AM Vlastimil Babka <vbabka@suse.cz> wrote:
> >>
> >> On 10/30/25 05:32, Harry Yoo wrote:
> >> > On Thu, Oct 23, 2025 at 03:52:32PM +0200, Vlastimil Babka wrote:
> >> >> diff --git a/mm/slub.c b/mm/slub.c
> >> >> index e2b052657d11..bd67336e7c1f 100644
> >> >> --- a/mm/slub.c
> >> >> +++ b/mm/slub.c
> >> >> @@ -4790,66 +4509,15 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
> >> >>
> >> >> stat(s, ALLOC_SLAB);
> >> >>
> >> >> - if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> >> >> - freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >> >> -
> >> >> - if (unlikely(!freelist))
> >> >> - goto new_objects;
> >> >> -
> >> >> - if (s->flags & SLAB_STORE_USER)
> >> >> - set_track(s, freelist, TRACK_ALLOC, addr,
> >> >> - gfpflags & ~(__GFP_DIRECT_RECLAIM));
> >> >> -
> >> >> - return freelist;
> >> >> - }
> >> >> -
> >> >> - /*
> >> >> - * No other reference to the slab yet so we can
> >> >> - * muck around with it freely without cmpxchg
> >> >> - */
> >> >> - freelist = slab->freelist;
> >> >> - slab->freelist = NULL;
> >> >> - slab->inuse = slab->objects;
> >> >> - slab->frozen = 1;
> >> >> -
> >> >> - inc_slabs_node(s, slab_nid(slab), slab->objects);
> >> >> + freelist = alloc_single_from_new_slab(s, slab, orig_size, gfpflags);
> >> >>
> >> >> - if (unlikely(!pfmemalloc_match(slab, gfpflags) && allow_spin)) {
> >> >> - /*
> >> >> - * For !pfmemalloc_match() case we don't load freelist so that
> >> >> - * we don't make further mismatched allocations easier.
> >> >> - */
> >> >> - deactivate_slab(s, slab, get_freepointer(s, freelist));
> >> >> - return freelist;
> >> >> - }
> >> >> + if (unlikely(!freelist))
> >> >> + goto new_objects;
> >> >
> >> > We may end up in an endless loop in !allow_spin case?
> >> > (e.g., kmalloc_nolock() is called in NMI context and n->list_lock is
> >> > held in the process context on the same CPU)
> >> >
> >> > Allocate a new slab, but somebody is holding n->list_lock, so trylock fails,
> >> > free the slab, goto new_objects, and repeat.
> >>
> >> Ugh, yeah. However, AFAICS this possibility already exists prior to this
> >> patch, only it's limited to SLUB_TINY/kmem_cache_debug(s). But we should fix
> >> it in 6.18 then.
> >> How? Grab the single object and defer deactivation of the slab minus one
> >> object? Would work except for kmem_cache_debug(s) we open again a race for
> >> inconsistency check failure, and we have to undo the simple slab freeing fix
> >> and handle the accounting issue differently again.
> >> Fail the allocation for the debug case to avoid the consistency check
> >> issues? Would it be acceptable for kmalloc_nolock() users?
> >
> > You mean something like:
> > diff --git a/mm/slub.c b/mm/slub.c
> > index a8fcc7e6f25a..e9a8b75f31d7 100644
> > --- a/mm/slub.c
> > +++ b/mm/slub.c
> > @@ -4658,8 +4658,11 @@ static void *___slab_alloc(struct kmem_cache
> > *s, gfp_t gfpflags, int node,
> > if (kmem_cache_debug(s)) {
> > freelist = alloc_single_from_new_slab(s, slab,
> > orig_size, gfpflags);
> >
> > - if (unlikely(!freelist))
> > + if (unlikely(!freelist)) {
> > + if (!allow_spin)
> > + return NULL;
> > goto new_objects;
> > + }
> >
> > or I misunderstood the issue?
>
> Yeah that would be the easiest solution, if you can accept the occasional
> allocation failures.
yeah. not worried about the slub debug case.
Let's reassess when the sheaves conversion is over.
On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
> We now rely on sheaves as the percpu caching layer and can refill them
> directly from partial or newly allocated slabs. Start removing the cpu
> (partial) slabs code, first from allocation paths.
>
> This means that any allocation not satisfied from percpu sheaves will
> end up in ___slab_alloc(), where we remove the usage of cpu (partial)
> slabs, so it will only perform get_partial() or new_slab().
>
> In get_partial_node() we used to return a slab for freezing as the cpu
> slab and to refill the partial slab. Now we only want to return a single
> object and leave the slab on the list (unless it became full). We can't
> simply reuse alloc_single_from_partial() as that assumes freeing uses
> free_to_partial_list(). Instead we need to use __slab_update_freelist()
> to work properly against a racing __slab_free().
>
> The rest of the changes is removing functions that no longer have any
> callers.
>
Hi Vlastimil,
We're trying out the AI patch review automation on the BPF commits and it had
some questions about a few of these. Since the goal is to actually test the
automation, I'm putting them in unedited, but I did try and make sure they
were valid before sending.
This one seems pretty unlikely but still worth fixing:
>
> diff --git a/mm/slub.c b/mm/slub.c
> index e2b052657d11..bd67336e7c1f 100644
> --- a/mm/slub.c
> +++ b/mm/slub.c
[ ... ]
> @@ -3575,15 +3532,15 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
> }
>
> /*
> - * Try to allocate a partial slab from a specific node.
> + * Try to allocate object from a partial slab on a specific node.
> */
> -static struct slab *get_partial_node(struct kmem_cache *s,
> - struct kmem_cache_node *n,
> - struct partial_context *pc)
> +static void *get_partial_node(struct kmem_cache *s,
> + struct kmem_cache_node *n,
> + struct partial_context *pc)
> {
> - struct slab *slab, *slab2, *partial = NULL;
> + struct slab *slab, *slab2;
> unsigned long flags;
> - unsigned int partial_slabs = 0;
> + void *object;
>
> /*
> * Racy check. If we mistakenly see no partial slabs then we
> @@ -3599,54 +3556,54 @@ static struct slab *get_partial_node(struct kmem_cache *s,
> else if (!spin_trylock_irqsave(&n->list_lock, flags))
> return NULL;
> list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> +
> + unsigned long counters;
> + struct slab new;
> +
> if (!pfmemalloc_match(slab, pc->flags))
> continue;
Can get_partial_node() return an uninitialized pointer? The variable
'object' is declared but never initialized. If all slabs in the partial
list fail the pfmemalloc_match() check, the loop completes without
setting 'object', then returns it at the end of the function.
In the previous version, the equivalent 'partial' variable was explicitly
initialized to NULL. When all slabs were skipped, NULL was returned.
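Initializing it would restore that behavior, e.g. (untested, just to
illustrate the point):

        void *object = NULL;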
>
> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> - void *object = alloc_single_from_partial(s, n, slab,
> + object = alloc_single_from_partial(s, n, slab,
> pc->orig_size);
> - if (object) {
> - partial = slab;
> - pc->object = object;
> + if (object)
> break;
> - }
> continue;
> }
>
> - remove_partial(n, slab);
> -
> - if (!partial) {
> - partial = slab;
> - stat(s, ALLOC_FROM_PARTIAL);
> -
> - if ((slub_get_cpu_partial(s) == 0)) {
> - break;
> - }
> - } else {
> - put_cpu_partial(s, slab, 0);
> - stat(s, CPU_PARTIAL_NODE);
> -
> - if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
> - break;
> - }
> - }
> + /*
> + * get a single object from the slab. This might race against
> + * __slab_free(), which however has to take the list_lock if
> + * it's about to make the slab fully free.
> + */
> + do {
> + object = slab->freelist;
> + counters = slab->counters;
> + new.freelist = get_freepointer(s, object);
> + new.counters = counters;
> + new.inuse++;
> + } while (!__slab_update_freelist(s, slab,
> + object, counters,
> + new.freelist, new.counters,
> + "get_partial_node"));
> +
> + if (!new.freelist)
> + remove_partial(n, slab);
> }
> spin_unlock_irqrestore(&n->list_lock, flags);
> - return partial;
> + return object;
> }
[ ... ]
On 10/24/25 16:29, Chris Mason wrote:
> On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
>> We now rely on sheaves as the percpu caching layer and can refill them
>> directly from partial or newly allocated slabs. Start removing the cpu
>> (partial) slabs code, first from allocation paths.
>>
>> This means that any allocation not satisfied from percpu sheaves will
>> end up in ___slab_alloc(), where we remove the usage of cpu (partial)
>> slabs, so it will only perform get_partial() or new_slab().
>>
>> In get_partial_node() we used to return a slab for freezing as the cpu
>> slab and to refill the partial slab. Now we only want to return a single
>> object and leave the slab on the list (unless it became full). We can't
>> simply reuse alloc_single_from_partial() as that assumes freeing uses
>> free_to_partial_list(). Instead we need to use __slab_update_freelist()
>> to work properly against a racing __slab_free().
>>
>> The rest of the changes is removing functions that no longer have any
>> callers.
>>
>
> Hi Vlastimil,
>
> We're trying out the AI patch review automation on the BPF commits and it had
> some questions about a few of these. Since the goal is to actually test the
> automation, I'm putting them in unedited, but I did try and make sure they
> were valid before sending.
>
> This one seems pretty unlikely but still worth fixing:
>
>>
>> diff --git a/mm/slub.c b/mm/slub.c
>> index e2b052657d11..bd67336e7c1f 100644
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>
> [ ... ]
>
>> @@ -3575,15 +3532,15 @@ static bool get_partial_node_bulk(struct kmem_cache *s,
>> }
>>
>> /*
>> - * Try to allocate a partial slab from a specific node.
>> + * Try to allocate object from a partial slab on a specific node.
>> */
>> -static struct slab *get_partial_node(struct kmem_cache *s,
>> - struct kmem_cache_node *n,
>> - struct partial_context *pc)
>> +static void *get_partial_node(struct kmem_cache *s,
>> + struct kmem_cache_node *n,
>> + struct partial_context *pc)
>> {
>> - struct slab *slab, *slab2, *partial = NULL;
>> + struct slab *slab, *slab2;
>> unsigned long flags;
>> - unsigned int partial_slabs = 0;
>> + void *object;
>>
>> /*
>> * Racy check. If we mistakenly see no partial slabs then we
>> @@ -3599,54 +3556,54 @@ static struct slab *get_partial_node(struct kmem_cache *s,
>> else if (!spin_trylock_irqsave(&n->list_lock, flags))
>> return NULL;
>> list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>> +
>> + unsigned long counters;
>> + struct slab new;
>> +
>> if (!pfmemalloc_match(slab, pc->flags))
>> continue;
>
> Can get_partial_node() return an uninitialized pointer? The variable
> 'object' is declared but never initialized. If all slabs in the partial
> list fail the pfmemalloc_match() check, the loop completes without
> setting 'object', then returns it at the end of the function.
>
> In the previous version, the equivalent 'partial' variable was explicitly
> initialized to NULL. When all slabs were skipped, NULL was returned.
By the way, there was another bug in this patch, causing a severe memory
leak, which the AI unfortunately didn't flag. Petr reported it during
performance testing and it took me more than a day to find it. Oh well :)
Wonder if things got better since then perhaps, and your or Roman's tools
would find it today? :)
>>
>> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
>> - void *object = alloc_single_from_partial(s, n, slab,
>> + object = alloc_single_from_partial(s, n, slab,
>> pc->orig_size);
>> - if (object) {
>> - partial = slab;
>> - pc->object = object;
>> + if (object)
>> break;
>> - }
>> continue;
>> }
>>
>> - remove_partial(n, slab);
>> -
>> - if (!partial) {
>> - partial = slab;
>> - stat(s, ALLOC_FROM_PARTIAL);
>> -
>> - if ((slub_get_cpu_partial(s) == 0)) {
>> - break;
>> - }
>> - } else {
>> - put_cpu_partial(s, slab, 0);
>> - stat(s, CPU_PARTIAL_NODE);
>> -
>> - if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
>> - break;
>> - }
>> - }
>> + /*
>> + * get a single object from the slab. This might race against
>> + * __slab_free(), which however has to take the list_lock if
>> + * it's about to make the slab fully free.
>> + */
>> + do {
>> + object = slab->freelist;
>> + counters = slab->counters;
>> + new.freelist = get_freepointer(s, object);
>> + new.counters = counters;
>> + new.inuse++;
>> + } while (!__slab_update_freelist(s, slab,
>> + object, counters,
>> + new.freelist, new.counters,
>> + "get_partial_node"));
>> +
>> + if (!new.freelist)
>> + remove_partial(n, slab);
>> }
>> spin_unlock_irqrestore(&n->list_lock, flags);
>> - return partial;
>> + return object;
>> }
>
> [ ... ]
>
>
>
On 1/9/26 3:16 AM, Vlastimil Babka wrote:
> On 10/24/25 16:29, Chris Mason wrote:
>> On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
[ ... ]
> By the way, there was another bug in this patch, causing a severe memory
> leak, which the AI unfortunately didn't flag. Petr reported it during
> performance testing and it took me more than a day to find it. Oh well :)
>
> Wonder if things got better since then perhaps, and your or Roman's tools
> would find it today? :)
Yes and no. It didn't find the leak until I changed the prompt to say:
"there is a leak, find it". I'll see if I can improve things...
> diff --git a/mm/slub.c b/mm/slub.c
> --- a/mm/slub.c
> +++ b/mm/slub.c
[ ... ]
> @@ -3592,54 +3549,53 @@ static void *get_partial_node(struct kmem_cache *s,
> else if (!spin_trylock_irqsave(&n->list_lock, flags))
> return NULL;
> list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
> +
> + struct freelist_counters old, new;
> +
> if (!pfmemalloc_match(slab, pc->flags))
> continue;
>
> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
> - void *object = alloc_single_from_partial(s, n, slab,
> + object = alloc_single_from_partial(s, n, slab,
> pc->orig_size);
> - if (object) {
> - partial = slab;
> - pc->object = object;
> + if (object)
> break;
> - }
> continue;
> }
>
> + /*
> + * get a single object from the slab. This might race against
> + * __slab_free(), which however has to take the list_lock if
> + * it's about to make the slab fully free.
> + */
> + do {
> + old.freelist = slab->freelist;
> + old.counters = slab->counters;
> +
> + new.freelist = get_freepointer(s, old.freelist);
> + new.counters = old.counters;
> + new.inuse++;
> +
> + } while (!__slab_update_freelist(s, slab, &old, &new, "get_partial_node"));
> +
> + object = old.freelist;
> + if (!new.freelist)
> + remove_partial(n, slab);
Is there a missing break statement here? The debug path above breaks out
of the loop after successfully allocating an object, but this non-debug
path continues iterating through the partial list. Each iteration overwrites
the object variable, so previously allocated objects would be leaked.
The commit message says "Now we only want to return a single object" which
matches the debug path behavior, but the non-debug path appears to allocate
from every matching slab in the list.
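If that's right, presumably the non-debug path just needs to stop once it
has taken its object, e.g. (untested, illustrative only):

        object = old.freelist;
        if (!new.freelist)
                remove_partial(n, slab);
        break;  /* we have our single object */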
> }
> spin_unlock_irqrestore(&n->list_lock, flags);
> - return partial;
> + return object;
> }
On 1/10/26 14:20, Chris Mason wrote:
> On 1/9/26 3:16 AM, Vlastimil Babka wrote:
>> On 10/24/25 16:29, Chris Mason wrote:
>>> On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>
> [ ... ]
>
>> By the way, there was another bug in this patch, causing a severe memory
>> leak, which the AI unfortunately didn't flag. Petr reported it during
>> performance testing and it took me more than a day to find it. Oh well :)
>>
>> Wonder if things got better since then perhaps, and your or Roman's tools
>> would find it today? :)
>
> Yes and no. It didn't find the leak until I changed the prompt to say:
> "there is a leak, find it". I'll see if I can improve things...
Thanks. Hmm even if it has to be done like this, it could be a substantial
time saver vs finding the leak myself.
>> diff --git a/mm/slub.c b/mm/slub.c
>> --- a/mm/slub.c
>> +++ b/mm/slub.c
>
> [ ... ]
>
>> @@ -3592,54 +3549,53 @@ static void *get_partial_node(struct kmem_cache *s,
>> else if (!spin_trylock_irqsave(&n->list_lock, flags))
>> return NULL;
>> list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>> +
>> + struct freelist_counters old, new;
>> +
>> if (!pfmemalloc_match(slab, pc->flags))
>> continue;
>>
>> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
>> - void *object = alloc_single_from_partial(s, n, slab,
>> + object = alloc_single_from_partial(s, n, slab,
>> pc->orig_size);
>> - if (object) {
>> - partial = slab;
>> - pc->object = object;
>> + if (object)
>> break;
>> - }
>> continue;
>> }
>>
>> + /*
>> + * get a single object from the slab. This might race against
>> + * __slab_free(), which however has to take the list_lock if
>> + * it's about to make the slab fully free.
>> + */
>> + do {
>> + old.freelist = slab->freelist;
>> + old.counters = slab->counters;
>> +
>> + new.freelist = get_freepointer(s, old.freelist);
>> + new.counters = old.counters;
>> + new.inuse++;
>> +
>> + } while (!__slab_update_freelist(s, slab, &old, &new, "get_partial_node"));
>> +
>> + object = old.freelist;
>> + if (!new.freelist)
>> + remove_partial(n, slab);
>
> Is there a missing break statement here? The debug path above breaks out
> of the loop after successfully allocating an object, but this non-debug
> path continues iterating through the partial list. Each iteration overwrites
> the object variable, so previously allocated objects would be leaked.
>
> The commit message says "Now we only want to return a single object" which
> matches the debug path behavior, but the non-debug path appears to allocate
> from every matching slab in the list.
>
>> }
>> spin_unlock_irqrestore(&n->list_lock, flags);
>> - return partial;
>> + return object;
>> }
>
On 1/10/26 10:41 AM, Vlastimil Babka wrote:
> On 1/10/26 14:20, Chris Mason wrote:
>> On 1/9/26 3:16 AM, Vlastimil Babka wrote:
>>> On 10/24/25 16:29, Chris Mason wrote:
>>>> On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> [ ... ]
>>
>>> By the way, there was another bug in this patch, causing a severe memory
>>> leak, which the AI unfortunately didn't flag. Petr reported it during
>>> performance testing and it took me more than a day to find it. Oh well :)
>>>
>>> Wonder if things got better since then perhaps, and your or Roman's tools
>>> would find it today? :)
>>
>> Yes and no. It didn't find the leak until I changed the prompt to say:
>> "there is a leak, find it". I'll see if I can improve things...
>
> Thanks. Hmm even if it has to be done like this, it could be a substantial
> time saver vs finding the leak myself.

Finding the missing break on the first pass was tricky because claude
consistently focused on concerns about potential NULL pointers and
mostly ignored the loop flow control changes.

I think I've fixed things by expanding the loop analysis and also
forcing it to make a more fine grained list of changes to analyze before
it jumps into the review.

It caught the missing break 5 out of 6 times in a loop, so maybe?
That's probably the best I can get right now for a generic review, but
claude will almost always be more reliable with extra directions like
"there is a leak, find it" on top of the review prompt.

I've pushed out two new commits to:
https://github.com/masoncl/review-prompts

9a44c271 CS-001.md: pay more attention to loop control flow and memory
allocations
7fad3996 review-core.md: make change categories more fine grained

Thanks for flagging this, I think/hope it'll handle complex changes more
effectively now across the board.

-chris
Chris Mason <clm@meta.com> writes:
> On 1/10/26 10:41 AM, Vlastimil Babka wrote:
>> On 1/10/26 14:20, Chris Mason wrote:
>>> On 1/9/26 3:16 AM, Vlastimil Babka wrote:
>>>> On 10/24/25 16:29, Chris Mason wrote:
>>>>> On Thu, 23 Oct 2025 15:52:32 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:
>>>
>>> [ ... ]
>>>
>>>> By the way, there was another bug in this patch, causing a severe memory
>>>> leak, which the AI unfortunately didn't flag. Petr reported it during
>>>> performance testing and it took me more than a day to find it. Oh well :)
>>>>
>>>> Wonder if things got better since then perhaps, and your or Roman's tools
>>>> would find it today? :)
>>>
>>> Yes and no. It didn't find the leak until I changed the prompt to say:
>>> "there is a leak, find it". I'll see if I can improve things...
>>
>> Thanks. Hmm even if it has to be done like this, it could be a substantial
>> time saver vs finding the leak myself.
>
> Finding the missing break on the first pass was tricky because claude
> consistently focused on concerns about potential NULL pointers and
> mostly ignored the loop flow control changes.
>
> I think I've fixed things by expanding the loop analysis and also
> forcing it to make a more fine grained list of changes to analyze before
> it jumps into the review.
>
> It caught the missing break 5 out of 6 times in a loop, so maybe?
> That's probably the best I can get right now for a generic review, but
> claude will almost always be more reliable with extra directions like
> "there is a leak, find it" on top of the review prompt.
>
> I've pushed out two new commits to:
> https://github.com/masoncl/review-prompts
>
> 9a44c271 CS-001.md: pay more attention to loop control flow and memory
> allocations
> 7fad3996 review-core.md: make change categories more fine grained

It helped Gemini too. With these changes even the flash-3 model caught it
from the first attempt.

Thanks
On 10/24/25 16:29, Chris Mason wrote:
>> else if (!spin_trylock_irqsave(&n->list_lock, flags))
>> return NULL;
>> list_for_each_entry_safe(slab, slab2, &n->partial, slab_list) {
>> +
>> + unsigned long counters;
>> + struct slab new;
>> +
>> if (!pfmemalloc_match(slab, pc->flags))
>> continue;
>
> Can get_partial_node() return an uninitialized pointer? The variable
> 'object' is declared but never initialized. If all slabs in the partial
> list fail the pfmemalloc_match() check, the loop completes without
> setting 'object', then returns it at the end of the function.
>
> In the previous version, the equivalent 'partial' variable was explicitly
> initialized to NULL. When all slabs were skipped, NULL was returned.
Indeed, this can happen. Thanks!
>>
>> if (IS_ENABLED(CONFIG_SLUB_TINY) || kmem_cache_debug(s)) {
>> - void *object = alloc_single_from_partial(s, n, slab,
>> + object = alloc_single_from_partial(s, n, slab,
>> pc->orig_size);
>> - if (object) {
>> - partial = slab;
>> - pc->object = object;
>> + if (object)
>> break;
>> - }
>> continue;
>> }
>>
>> - remove_partial(n, slab);
>> -
>> - if (!partial) {
>> - partial = slab;
>> - stat(s, ALLOC_FROM_PARTIAL);
>> -
>> - if ((slub_get_cpu_partial(s) == 0)) {
>> - break;
>> - }
>> - } else {
>> - put_cpu_partial(s, slab, 0);
>> - stat(s, CPU_PARTIAL_NODE);
>> -
>> - if (++partial_slabs > slub_get_cpu_partial(s) / 2) {
>> - break;
>> - }
>> - }
>> + /*
>> + * get a single object from the slab. This might race against
>> + * __slab_free(), which however has to take the list_lock if
>> + * it's about to make the slab fully free.
>> + */
>> + do {
>> + object = slab->freelist;
>> + counters = slab->counters;
>> + new.freelist = get_freepointer(s, object);
>> + new.counters = counters;
>> + new.inuse++;
>> + } while (!__slab_update_freelist(s, slab,
>> + object, counters,
>> + new.freelist, new.counters,
>> + "get_partial_node"));
>> +
>> + if (!new.freelist)
>> + remove_partial(n, slab);
>> }
>> spin_unlock_irqrestore(&n->list_lock, flags);
>> - return partial;
>> + return object;
>> }
>
> [ ... ]
>
>