[PATCH v3 4/4] slub: apply new queue_percpu_work_on() interface

Marcelo Tosatti posted 4 patches 1 week, 3 days ago
[PATCH v3 4/4] slub: apply new queue_percpu_work_on() interface
Posted by Marcelo Tosatti 1 week, 3 days ago
Make use of the new qpw_{un,}lock*() and queue_percpu_work_on()
interface to improve performance & latency.

For functions that may be scheduled on a different CPU, replace
local_{un,}lock*() with qpw_{un,}lock*(), and replace schedule_work_on()
with queue_percpu_work_on(). Likewise, flush_work() is replaced with
flush_percpu_work().

This change requires allocating qpw_structs instead of work_structs,
and changing the parameters of a few functions to include a cpu parameter.

This should bring no relevant performance impact on non-QPW kernels:
for functions that may be scheduled on a different CPU, the local_*lock's
this_cpu_ptr() becomes a per_cpu_ptr(..., smp_processor_id()).

Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

---
 mm/slub.c |  146 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 74 insertions(+), 72 deletions(-)

Index: linux/mm/slub.c
===================================================================
--- linux.orig/mm/slub.c
+++ linux/mm/slub.c
@@ -50,6 +50,7 @@
 #include <linux/irq_work.h>
 #include <linux/kprobes.h>
 #include <linux/debugfs.h>
+#include <linux/qpw.h>
 #include <trace/events/kmem.h>
 
 #include "internal.h"
@@ -129,7 +130,7 @@
  *   For debug caches, all allocations are forced to go through a list_lock
  *   protected region to serialize against concurrent validation.
  *
- *   cpu_sheaves->lock (local_trylock)
+ *   cpu_sheaves->lock (qpw_trylock)
  *
  *   This lock protects fastpath operations on the percpu sheaves. On !RT it
  *   only disables preemption and does no atomic operations. As long as the main
@@ -157,7 +158,7 @@
  *   Interrupts are disabled as part of list_lock or barn lock operations, or
  *   around the slab_lock operation, in order to make the slab allocator safe
  *   to use in the context of an irq.
- *   Preemption is disabled as part of local_trylock operations.
+ *   Preemption is disabled as part of qpw_trylock operations.
  *   kmalloc_nolock() and kfree_nolock() are safe in NMI context but see
  *   their limitations.
  *
@@ -418,7 +419,7 @@ struct slab_sheaf {
 };
 
 struct slub_percpu_sheaves {
-	local_trylock_t lock;
+	qpw_trylock_t lock;
 	struct slab_sheaf *main; /* never NULL when unlocked */
 	struct slab_sheaf *spare; /* empty or full, may be NULL */
 	struct slab_sheaf *rcu_free; /* for batching kfree_rcu() */
@@ -480,7 +481,7 @@ static nodemask_t slab_nodes;
 static struct workqueue_struct *flushwq;
 
 struct slub_flush_work {
-	struct work_struct work;
+	struct qpw_struct qpw;
 	struct kmem_cache *s;
 	bool skip;
 };
@@ -2859,16 +2860,14 @@ static void __kmem_cache_free_bulk(struc
  *
  * Returns how many objects are remaining to be flushed
  */
-static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s)
+static unsigned int __sheaf_flush_main_batch(struct kmem_cache *s, int cpu)
 {
 	struct slub_percpu_sheaves *pcs;
 	unsigned int batch, remaining;
 	void *objects[PCS_BATCH_MAX];
 	struct slab_sheaf *sheaf;
 
-	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
-
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 	sheaf = pcs->main;
 
 	batch = min(PCS_BATCH_MAX, sheaf->size);
@@ -2878,7 +2877,7 @@ static unsigned int __sheaf_flush_main_b
 
 	remaining = sheaf->size;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	qpw_unlock(&s->cpu_sheaves->lock, cpu);
 
 	__kmem_cache_free_bulk(s, batch, &objects[0]);
 
@@ -2887,14 +2886,14 @@ static unsigned int __sheaf_flush_main_b
 	return remaining;
 }
 
-static void sheaf_flush_main(struct kmem_cache *s)
+static void sheaf_flush_main(struct kmem_cache *s, int cpu)
 {
 	unsigned int remaining;
 
 	do {
-		local_lock(&s->cpu_sheaves->lock);
+		qpw_lock(&s->cpu_sheaves->lock, cpu);
 
-		remaining = __sheaf_flush_main_batch(s);
+		remaining = __sheaf_flush_main_batch(s, cpu);
 
 	} while (remaining);
 }
@@ -2908,11 +2907,13 @@ static bool sheaf_try_flush_main(struct
 	bool ret = false;
 
 	do {
-		if (!local_trylock(&s->cpu_sheaves->lock))
+		if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 			return ret;
 
 		ret = true;
-		remaining = __sheaf_flush_main_batch(s);
+
+		qpw_lockdep_assert_held(&s->cpu_sheaves->lock);
+		remaining = __sheaf_flush_main_batch(s, smp_processor_id());
 
 	} while (remaining);
 
@@ -2989,13 +2990,13 @@ static void rcu_free_sheaf_nobarn(struct
  * flushing operations are rare so let's keep it simple and flush to slabs
  * directly, skipping the barn
  */
-static void pcs_flush_all(struct kmem_cache *s)
+static void pcs_flush_all(struct kmem_cache *s, int cpu)
 {
 	struct slub_percpu_sheaves *pcs;
 	struct slab_sheaf *spare, *rcu_free;
 
-	local_lock(&s->cpu_sheaves->lock);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	qpw_lock(&s->cpu_sheaves->lock, cpu);
+	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
 	spare = pcs->spare;
 	pcs->spare = NULL;
@@ -3003,7 +3004,7 @@ static void pcs_flush_all(struct kmem_ca
 	rcu_free = pcs->rcu_free;
 	pcs->rcu_free = NULL;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	qpw_unlock(&s->cpu_sheaves->lock, cpu);
 
 	if (spare) {
 		sheaf_flush_unused(s, spare);
@@ -3013,7 +3014,7 @@ static void pcs_flush_all(struct kmem_ca
 	if (rcu_free)
 		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
 
-	sheaf_flush_main(s);
+	sheaf_flush_main(s, cpu);
 }
 
 static void __pcs_flush_all_cpu(struct kmem_cache *s, unsigned int cpu)
@@ -3963,13 +3964,13 @@ static void flush_cpu_sheaves(struct wor
 {
 	struct kmem_cache *s;
 	struct slub_flush_work *sfw;
+	int cpu = qpw_get_cpu(w);
 
-	sfw = container_of(w, struct slub_flush_work, work);
-
+	sfw = &per_cpu(slub_flush, cpu);
 	s = sfw->s;
 
 	if (cache_has_sheaves(s))
-		pcs_flush_all(s);
+		pcs_flush_all(s, cpu);
 }
 
 static void flush_all_cpus_locked(struct kmem_cache *s)
@@ -3986,17 +3987,17 @@ static void flush_all_cpus_locked(struct
 			sfw->skip = true;
 			continue;
 		}
-		INIT_WORK(&sfw->work, flush_cpu_sheaves);
+		INIT_QPW(&sfw->qpw, flush_cpu_sheaves, cpu);
 		sfw->skip = false;
 		sfw->s = s;
-		queue_work_on(cpu, flushwq, &sfw->work);
+		queue_percpu_work_on(cpu, flushwq, &sfw->qpw);
 	}
 
 	for_each_online_cpu(cpu) {
 		sfw = &per_cpu(slub_flush, cpu);
 		if (sfw->skip)
 			continue;
-		flush_work(&sfw->work);
+		flush_percpu_work(&sfw->qpw);
 	}
 
 	mutex_unlock(&flush_lock);
@@ -4015,17 +4016,18 @@ static void flush_rcu_sheaf(struct work_
 	struct slab_sheaf *rcu_free;
 	struct slub_flush_work *sfw;
 	struct kmem_cache *s;
+	int cpu = qpw_get_cpu(w);
 
-	sfw = container_of(w, struct slub_flush_work, work);
+	sfw = &per_cpu(slub_flush, cpu);
 	s = sfw->s;
 
-	local_lock(&s->cpu_sheaves->lock);
-	pcs = this_cpu_ptr(s->cpu_sheaves);
+	qpw_lock(&s->cpu_sheaves->lock, cpu);
+	pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
 	rcu_free = pcs->rcu_free;
 	pcs->rcu_free = NULL;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	qpw_unlock(&s->cpu_sheaves->lock, cpu);
 
 	if (rcu_free)
 		call_rcu(&rcu_free->rcu_head, rcu_free_sheaf_nobarn);
@@ -4050,14 +4052,14 @@ void flush_rcu_sheaves_on_cache(struct k
 		 * sure the __kfree_rcu_sheaf() finished its call_rcu()
 		 */
 
-		INIT_WORK(&sfw->work, flush_rcu_sheaf);
+		INIT_QPW(&sfw->qpw, flush_rcu_sheaf, cpu);
 		sfw->s = s;
-		queue_work_on(cpu, flushwq, &sfw->work);
+		queue_percpu_work_on(cpu, flushwq, &sfw->qpw);
 	}
 
 	for_each_online_cpu(cpu) {
 		sfw = &per_cpu(slub_flush, cpu);
-		flush_work(&sfw->work);
+		flush_percpu_work(&sfw->qpw);
 	}
 
 	mutex_unlock(&flush_lock);
@@ -4565,11 +4567,11 @@ __pcs_replace_empty_main(struct kmem_cac
 	struct node_barn *barn;
 	bool allow_spin;
 
-	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+	qpw_lockdep_assert_held(&s->cpu_sheaves->lock);
 
 	/* Bootstrap or debug cache, back off */
 	if (unlikely(!cache_has_sheaves(s))) {
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 		return NULL;
 	}
 
@@ -4580,7 +4582,7 @@ __pcs_replace_empty_main(struct kmem_cac
 
 	barn = get_barn(s);
 	if (!barn) {
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 		return NULL;
 	}
 
@@ -4605,7 +4607,7 @@ __pcs_replace_empty_main(struct kmem_cac
 		}
 	}
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 	pcs = NULL;
 
 	if (!allow_spin)
@@ -4629,7 +4631,7 @@ __pcs_replace_empty_main(struct kmem_cac
 	if (!full)
 		return NULL;
 
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		goto barn_put;
 	pcs = this_cpu_ptr(s->cpu_sheaves);
 
@@ -4708,7 +4710,7 @@ void *alloc_from_pcs(struct kmem_cache *
 		return NULL;
 	}
 
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		return NULL;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -4728,7 +4730,7 @@ void *alloc_from_pcs(struct kmem_cache *
 		 * the current allocation or previous freeing process.
 		 */
 		if (page_to_nid(virt_to_page(object)) != node) {
-			local_unlock(&s->cpu_sheaves->lock);
+			local_qpw_unlock(&s->cpu_sheaves->lock);
 			stat(s, ALLOC_NODE_MISMATCH);
 			return NULL;
 		}
@@ -4736,7 +4738,7 @@ void *alloc_from_pcs(struct kmem_cache *
 
 	pcs->main->size--;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	stat(s, ALLOC_FASTPATH);
 
@@ -4753,7 +4755,7 @@ unsigned int alloc_from_pcs_bulk(struct
 	unsigned int batch;
 
 next_batch:
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		return allocated;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -4764,7 +4766,7 @@ next_batch:
 		struct node_barn *barn;
 
 		if (unlikely(!cache_has_sheaves(s))) {
-			local_unlock(&s->cpu_sheaves->lock);
+			local_qpw_unlock(&s->cpu_sheaves->lock);
 			return allocated;
 		}
 
@@ -4775,7 +4777,7 @@ next_batch:
 
 		barn = get_barn(s);
 		if (!barn) {
-			local_unlock(&s->cpu_sheaves->lock);
+			local_qpw_unlock(&s->cpu_sheaves->lock);
 			return allocated;
 		}
 
@@ -4790,7 +4792,7 @@ next_batch:
 
 		stat(s, BARN_GET_FAIL);
 
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 
 		/*
 		 * Once full sheaves in barn are depleted, let the bulk
@@ -4808,7 +4810,7 @@ do_alloc:
 	main->size -= batch;
 	memcpy(p, main->objects + main->size, batch * sizeof(void *));
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	stat_add(s, ALLOC_FASTPATH, batch);
 
@@ -4992,7 +4994,7 @@ kmem_cache_prefill_sheaf(struct kmem_cac
 		return sheaf;
 	}
 
-	local_lock(&s->cpu_sheaves->lock);
+	local_qpw_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
 
 	if (pcs->spare) {
@@ -5011,7 +5013,7 @@ kmem_cache_prefill_sheaf(struct kmem_cac
 			stat(s, BARN_GET_FAIL);
 	}
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 
 	if (!sheaf)
@@ -5055,7 +5057,7 @@ void kmem_cache_return_sheaf(struct kmem
 		return;
 	}
 
-	local_lock(&s->cpu_sheaves->lock);
+	local_qpw_lock(&s->cpu_sheaves->lock);
 	pcs = this_cpu_ptr(s->cpu_sheaves);
 	barn = get_barn(s);
 
@@ -5065,7 +5067,7 @@ void kmem_cache_return_sheaf(struct kmem
 		stat(s, SHEAF_RETURN_FAST);
 	}
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	if (!sheaf)
 		return;
@@ -5595,7 +5597,7 @@ static void __pcs_install_empty_sheaf(st
 		struct slub_percpu_sheaves *pcs, struct slab_sheaf *empty,
 		struct node_barn *barn)
 {
-	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+	qpw_lockdep_assert_held(&s->cpu_sheaves->lock);
 
 	/* This is what we expect to find if nobody interrupted us. */
 	if (likely(!pcs->spare)) {
@@ -5646,17 +5648,17 @@ __pcs_replace_full_main(struct kmem_cach
 	bool put_fail;
 
 restart:
-	lockdep_assert_held(this_cpu_ptr(&s->cpu_sheaves->lock));
+	qpw_lockdep_assert_held(&s->cpu_sheaves->lock);
 
 	/* Bootstrap or debug cache, back off */
 	if (unlikely(!cache_has_sheaves(s))) {
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 		return NULL;
 	}
 
 	barn = get_barn(s);
 	if (!barn) {
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 		return NULL;
 	}
 
@@ -5693,7 +5695,7 @@ restart:
 		stat(s, BARN_PUT_FAIL);
 
 		pcs->spare = NULL;
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 
 		sheaf_flush_unused(s, to_flush);
 		empty = to_flush;
@@ -5709,7 +5711,7 @@ restart:
 	put_fail = true;
 
 alloc_empty:
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	/*
 	 * alloc_empty_sheaf() doesn't support !allow_spin and it's
@@ -5729,7 +5731,7 @@ alloc_empty:
 	if (!sheaf_try_flush_main(s))
 		return NULL;
 
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		return NULL;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -5745,7 +5747,7 @@ alloc_empty:
 	return pcs;
 
 got_empty:
-	if (!local_trylock(&s->cpu_sheaves->lock)) {
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock)) {
 		barn_put_empty_sheaf(barn, empty);
 		return NULL;
 	}
@@ -5765,7 +5767,7 @@ bool free_to_pcs(struct kmem_cache *s, v
 {
 	struct slub_percpu_sheaves *pcs;
 
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		return false;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -5779,7 +5781,7 @@ bool free_to_pcs(struct kmem_cache *s, v
 
 	pcs->main->objects[pcs->main->size++] = object;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	stat(s, FREE_FASTPATH);
 
@@ -5869,7 +5871,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache
 
 	lock_map_acquire_try(&kfree_rcu_sheaf_map);
 
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		goto fail;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -5881,7 +5883,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache
 
 		/* Bootstrap or debug cache, fall back */
 		if (unlikely(!cache_has_sheaves(s))) {
-			local_unlock(&s->cpu_sheaves->lock);
+			local_qpw_unlock(&s->cpu_sheaves->lock);
 			goto fail;
 		}
 
@@ -5893,7 +5895,7 @@ bool __kfree_rcu_sheaf(struct kmem_cache
 
 		barn = get_barn(s);
 		if (!barn) {
-			local_unlock(&s->cpu_sheaves->lock);
+			local_qpw_unlock(&s->cpu_sheaves->lock);
 			goto fail;
 		}
 
@@ -5904,14 +5906,14 @@ bool __kfree_rcu_sheaf(struct kmem_cache
 			goto do_free;
 		}
 
-		local_unlock(&s->cpu_sheaves->lock);
+		local_qpw_unlock(&s->cpu_sheaves->lock);
 
 		empty = alloc_empty_sheaf(s, GFP_NOWAIT);
 
 		if (!empty)
 			goto fail;
 
-		if (!local_trylock(&s->cpu_sheaves->lock)) {
+		if (!local_qpw_trylock(&s->cpu_sheaves->lock)) {
 			barn_put_empty_sheaf(barn, empty);
 			goto fail;
 		}
@@ -5942,13 +5944,13 @@ do_free:
 	}
 
 	/*
-	 * we flush before local_unlock to make sure a racing
+	 * we flush before local_qpw_unlock to make sure a racing
 	 * flush_all_rcu_sheaves() doesn't miss this sheaf
 	 */
 	if (rcu_sheaf)
 		call_rcu(&rcu_sheaf->rcu_head, rcu_free_sheaf);
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	stat(s, FREE_RCU_SHEAF);
 	lock_map_release(&kfree_rcu_sheaf_map);
@@ -6004,7 +6006,7 @@ next_remote_batch:
 		goto flush_remote;
 
 next_batch:
-	if (!local_trylock(&s->cpu_sheaves->lock))
+	if (!local_qpw_trylock(&s->cpu_sheaves->lock))
 		goto fallback;
 
 	pcs = this_cpu_ptr(s->cpu_sheaves);
@@ -6047,7 +6049,7 @@ do_free:
 	memcpy(main->objects + main->size, p, batch * sizeof(void *));
 	main->size += batch;
 
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	stat_add(s, FREE_FASTPATH, batch);
 
@@ -6063,7 +6065,7 @@ do_free:
 	return;
 
 no_empty:
-	local_unlock(&s->cpu_sheaves->lock);
+	local_qpw_unlock(&s->cpu_sheaves->lock);
 
 	/*
 	 * if we depleted all empty sheaves in the barn or there are too
@@ -7468,7 +7470,7 @@ static int init_percpu_sheaves(struct km
 
 		pcs = per_cpu_ptr(s->cpu_sheaves, cpu);
 
-		local_trylock_init(&pcs->lock);
+		qpw_trylock_init(&pcs->lock);
 
 		/*
 		 * Bootstrap sheaf has zero size so fast-path allocation fails.
Re: [PATCH v3 4/4] slub: apply new queue_percpu_work_on() interface
Posted by Frederic Weisbecker 2 days, 2 hours ago
Le Mon, Mar 23, 2026 at 02:55:48PM -0300, Marcelo Tosatti a écrit :
> Make use of the new qpw_{un,}lock*() and queue_percpu_work_on()
> interface to improve performance & latency.
> 
> For functions that may be scheduled in a different cpu, replace
> local_{un,}lock*() by qpw_{un,}lock*(), and replace schedule_work_on() by
> queue_percpu_work_on(). The same happens for flush_work() and
> flush_percpu_work().
> 
> This change requires allocation of qpw_structs instead of a work_structs,
> and changing parameters of a few functions to include the cpu parameter.
> 
> This should bring no relevant performance impact on non-QPW kernels:
> For functions that may be scheduled in a different cpu, the local_*lock's
> this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()).
> 
> Signed-off-by: Leonardo Bras <leobras.c@gmail.com>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>

Again I can't review the details but I'm fine with the approach once
the namespace is fixed.

Acked-by: Frederic Weisbecker <frederic@kernel.org>

-- 
Frederic Weisbecker
SUSE Labs