Make use of the new qpw_{un,}lock*() and queue_percpu_work_on()
interface to improve performance & latency on PREEMPT_RT kernels.
For functions that may be scheduled on a different cpu, replace
local_{un,}lock*() by qpw_{un,}lock*(), and replace queue_work_on() by
queue_percpu_work_on(). Likewise, flush_work() is replaced by
flush_percpu_work().
The change requires allocation of qpw_structs instead of work_structs,
and changing parameters of a few functions to include the cpu parameter.
This should bring no relevant performance impact on non-RT kernels:
For functions that may be scheduled in a different cpu, the local_*lock's
this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()).
Signed-off-by: Leonardo Bras <leobras@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
mm/internal.h | 4 +-
mm/mlock.c | 71 ++++++++++++++++++++++++++++++++------------
mm/page_alloc.c | 2 -
mm/swap.c | 90 +++++++++++++++++++++++++++++++-------------------------
4 files changed, 108 insertions(+), 59 deletions(-)
Index: slab/mm/mlock.c
===================================================================
--- slab.orig/mm/mlock.c
+++ slab/mm/mlock.c
@@ -25,17 +25,16 @@
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
#include <linux/secretmem.h>
+#include <linux/qpw.h>
#include "internal.h"
struct mlock_fbatch {
- local_lock_t lock;
+ qpw_lock_t lock;
struct folio_batch fbatch;
};
-static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
+static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch);
bool can_do_mlock(void)
{
@@ -209,18 +208,25 @@ static void mlock_folio_batch(struct fol
folios_put(fbatch);
}
-void mlock_drain_local(void)
+void mlock_drain_cpu(int cpu)
{
struct folio_batch *fbatch;
- local_lock(&mlock_fbatch.lock);
- fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ qpw_lock(&mlock_fbatch.lock, cpu);
+ fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
if (folio_batch_count(fbatch))
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ qpw_unlock(&mlock_fbatch.lock, cpu);
}
-void mlock_drain_remote(int cpu)
+void mlock_drain_local(void)
+{
+ migrate_disable();
+ mlock_drain_cpu(smp_processor_id());
+ migrate_enable();
+}
+
+void mlock_drain_offline(int cpu)
{
struct folio_batch *fbatch;
@@ -242,9 +248,12 @@ bool need_mlock_drain(int cpu)
void mlock_folio(struct folio *folio)
{
struct folio_batch *fbatch;
+ int cpu;
- local_lock(&mlock_fbatch.lock);
- fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ migrate_disable();
+ cpu = smp_processor_id();
+ qpw_lock(&mlock_fbatch.lock, cpu);
+ fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
if (!folio_test_set_mlocked(folio)) {
int nr_pages = folio_nr_pages(folio);
@@ -257,7 +266,8 @@ void mlock_folio(struct folio *folio)
if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ qpw_unlock(&mlock_fbatch.lock, cpu);
+ migrate_enable();
}
/**
@@ -268,9 +278,13 @@ void mlock_new_folio(struct folio *folio
{
struct folio_batch *fbatch;
int nr_pages = folio_nr_pages(folio);
+ int cpu;
+
+ migrate_disable();
+ cpu = smp_processor_id();
+ qpw_lock(&mlock_fbatch.lock, cpu);
- local_lock(&mlock_fbatch.lock);
- fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
folio_set_mlocked(folio);
zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
@@ -280,7 +294,8 @@ void mlock_new_folio(struct folio *folio
if (!folio_batch_add(fbatch, mlock_new(folio)) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ migrate_enable();
+ qpw_unlock(&mlock_fbatch.lock, cpu);
}
/**
@@ -290,9 +305,13 @@ void mlock_new_folio(struct folio *folio
void munlock_folio(struct folio *folio)
{
struct folio_batch *fbatch;
+ int cpu;
- local_lock(&mlock_fbatch.lock);
- fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
+ migrate_disable();
+ cpu = smp_processor_id();
+ qpw_lock(&mlock_fbatch.lock, cpu);
+
+ fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
/*
* folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
* which will check whether the folio is multiply mlocked.
@@ -301,7 +320,8 @@ void munlock_folio(struct folio *folio)
if (!folio_batch_add(fbatch, folio) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
mlock_folio_batch(fbatch);
- local_unlock(&mlock_fbatch.lock);
+ qpw_unlock(&mlock_fbatch.lock, cpu);
+ migrate_enable();
}
static inline unsigned int folio_mlock_step(struct folio *folio,
@@ -823,3 +843,18 @@ void user_shm_unlock(size_t size, struct
spin_unlock(&shmlock_user_lock);
put_ucounts(ucounts);
}
+
+int __init mlock_init(void)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct mlock_fbatch *fbatch = &per_cpu(mlock_fbatch, cpu);
+
+ qpw_lock_init(&fbatch->lock);
+ }
+
+ return 0;
+}
+
+module_init(mlock_init);
Index: slab/mm/swap.c
===================================================================
--- slab.orig/mm/swap.c
+++ slab/mm/swap.c
@@ -35,7 +35,7 @@
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
-#include <linux/local_lock.h>
+#include <linux/qpw.h>
#include <linux/buffer_head.h>
#include "internal.h"
@@ -52,7 +52,7 @@ struct cpu_fbatches {
* The following folio batches are grouped together because they are protected
* by disabling preemption (and interrupts remain enabled).
*/
- local_lock_t lock;
+ qpw_lock_t lock;
struct folio_batch lru_add;
struct folio_batch lru_deactivate_file;
struct folio_batch lru_deactivate;
@@ -61,14 +61,11 @@ struct cpu_fbatches {
struct folio_batch lru_activate;
#endif
/* Protecting the following batches which require disabling interrupts */
- local_lock_t lock_irq;
+ qpw_lock_t lock_irq;
struct folio_batch lru_move_tail;
};
-static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
- .lock = INIT_LOCAL_LOCK(lock),
- .lock_irq = INIT_LOCAL_LOCK(lock_irq),
-};
+static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches);
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
unsigned long *flagsp)
@@ -183,22 +180,24 @@ static void __folio_batch_add_and_move(s
struct folio *folio, move_fn_t move_fn, bool disable_irq)
{
unsigned long flags;
+ int cpu;
folio_get(folio);
+ cpu = smp_processor_id();
if (disable_irq)
- local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ qpw_lock_irqsave(&cpu_fbatches.lock_irq, flags, cpu);
else
- local_lock(&cpu_fbatches.lock);
+ qpw_lock(&cpu_fbatches.lock, cpu);
- if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
+ if (!folio_batch_add(per_cpu_ptr(fbatch, cpu), folio) ||
!folio_may_be_lru_cached(folio) || lru_cache_disabled())
- folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
+ folio_batch_move_lru(per_cpu_ptr(fbatch, cpu), move_fn);
if (disable_irq)
- local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
+ qpw_unlock_irqrestore(&cpu_fbatches.lock_irq, flags, cpu);
else
- local_unlock(&cpu_fbatches.lock);
+ qpw_unlock(&cpu_fbatches.lock, cpu);
}
#define folio_batch_add_and_move(folio, op) \
@@ -358,9 +357,10 @@ static void __lru_cache_activate_folio(s
{
struct folio_batch *fbatch;
int i;
+ int cpu = smp_processor_id();
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
+ qpw_lock(&cpu_fbatches.lock, cpu);
+ fbatch = per_cpu_ptr(&cpu_fbatches.lru_add, cpu);
/*
* Search backwards on the optimistic assumption that the folio being
@@ -381,7 +381,7 @@ static void __lru_cache_activate_folio(s
}
}
- local_unlock(&cpu_fbatches.lock);
+ qpw_unlock(&cpu_fbatches.lock, cpu);
}
#ifdef CONFIG_LRU_GEN
@@ -653,9 +653,9 @@ void lru_add_drain_cpu(int cpu)
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ qpw_lock_irqsave(&cpu_fbatches.lock_irq, flags, cpu);
folio_batch_move_lru(fbatch, lru_move_tail);
- local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
+ qpw_unlock_irqrestore(&cpu_fbatches.lock_irq, flags, cpu);
}
fbatch = &fbatches->lru_deactivate_file;
@@ -733,10 +733,12 @@ void folio_mark_lazyfree(struct folio *f
void lru_add_drain(void)
{
- local_lock(&cpu_fbatches.lock);
- lru_add_drain_cpu(smp_processor_id());
- local_unlock(&cpu_fbatches.lock);
- mlock_drain_local();
+ int cpu = smp_processor_id();
+
+ qpw_lock(&cpu_fbatches.lock, cpu);
+ lru_add_drain_cpu(cpu);
+ qpw_unlock(&cpu_fbatches.lock, cpu);
+ mlock_drain_cpu(cpu);
}
/*
@@ -745,30 +747,32 @@ void lru_add_drain(void)
* the same cpu. It shouldn't be a problem in !SMP case since
* the core is only one and the locks will disable preemption.
*/
-static void lru_add_mm_drain(void)
+static void lru_add_mm_drain(int cpu)
{
- local_lock(&cpu_fbatches.lock);
- lru_add_drain_cpu(smp_processor_id());
- local_unlock(&cpu_fbatches.lock);
- mlock_drain_local();
+ qpw_lock(&cpu_fbatches.lock, cpu);
+ lru_add_drain_cpu(cpu);
+ qpw_unlock(&cpu_fbatches.lock, cpu);
+ mlock_drain_cpu(cpu);
}
void lru_add_drain_cpu_zone(struct zone *zone)
{
- local_lock(&cpu_fbatches.lock);
- lru_add_drain_cpu(smp_processor_id());
+ int cpu = smp_processor_id();
+
+ qpw_lock(&cpu_fbatches.lock, cpu);
+ lru_add_drain_cpu(cpu);
drain_local_pages(zone);
- local_unlock(&cpu_fbatches.lock);
- mlock_drain_local();
+ qpw_unlock(&cpu_fbatches.lock, cpu);
+ mlock_drain_cpu(cpu);
}
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+static DEFINE_PER_CPU(struct qpw_struct, lru_add_drain_qpw);
-static void lru_add_drain_per_cpu(struct work_struct *dummy)
+static void lru_add_drain_per_cpu(struct work_struct *w)
{
- lru_add_mm_drain();
+ lru_add_mm_drain(qpw_get_cpu(w));
}
static DEFINE_PER_CPU(struct work_struct, bh_add_drain_work);
@@ -883,12 +887,12 @@ static inline void __lru_add_drain_all(b
cpumask_clear(&has_mm_work);
cpumask_clear(&has_bh_work);
for_each_online_cpu(cpu) {
- struct work_struct *mm_work = &per_cpu(lru_add_drain_work, cpu);
+ struct qpw_struct *mm_qpw = &per_cpu(lru_add_drain_qpw, cpu);
struct work_struct *bh_work = &per_cpu(bh_add_drain_work, cpu);
if (cpu_needs_mm_drain(cpu)) {
- INIT_WORK(mm_work, lru_add_drain_per_cpu);
- queue_work_on(cpu, mm_percpu_wq, mm_work);
+ INIT_QPW(mm_qpw, lru_add_drain_per_cpu, cpu);
+ queue_percpu_work_on(cpu, mm_percpu_wq, mm_qpw);
__cpumask_set_cpu(cpu, &has_mm_work);
}
@@ -900,7 +904,7 @@ static inline void __lru_add_drain_all(b
}
for_each_cpu(cpu, &has_mm_work)
- flush_work(&per_cpu(lru_add_drain_work, cpu));
+ flush_percpu_work(&per_cpu(lru_add_drain_qpw, cpu));
for_each_cpu(cpu, &has_bh_work)
flush_work(&per_cpu(bh_add_drain_work, cpu));
@@ -950,7 +954,7 @@ void lru_cache_disable(void)
#ifdef CONFIG_SMP
__lru_add_drain_all(true);
#else
- lru_add_mm_drain();
+ lru_add_mm_drain(smp_processor_id());
invalidate_bh_lrus_cpu();
#endif
}
@@ -1124,6 +1128,7 @@ static const struct ctl_table swap_sysct
void __init swap_setup(void)
{
unsigned long megs = PAGES_TO_MB(totalram_pages());
+ unsigned int cpu;
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
@@ -1136,4 +1141,11 @@ void __init swap_setup(void)
*/
register_sysctl_init("vm", swap_sysctl_table);
+
+ for_each_possible_cpu(cpu) {
+ struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
+
+ qpw_lock_init(&fbatches->lock);
+ qpw_lock_init(&fbatches->lock_irq);
+ }
}
Index: slab/mm/internal.h
===================================================================
--- slab.orig/mm/internal.h
+++ slab/mm/internal.h
@@ -1061,10 +1061,12 @@ static inline void munlock_vma_folio(str
munlock_folio(folio);
}
+int __init mlock_init(void);
void mlock_new_folio(struct folio *folio);
bool need_mlock_drain(int cpu);
void mlock_drain_local(void);
-void mlock_drain_remote(int cpu);
+void mlock_drain_cpu(int cpu);
+void mlock_drain_offline(int cpu);
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
Index: slab/mm/page_alloc.c
===================================================================
--- slab.orig/mm/page_alloc.c
+++ slab/mm/page_alloc.c
@@ -6251,7 +6251,7 @@ static int page_alloc_cpu_dead(unsigned
struct zone *zone;
lru_add_drain_cpu(cpu);
- mlock_drain_remote(cpu);
+ mlock_drain_offline(cpu);
drain_pages(cpu);
/*
On Fri, Feb 06, 2026 at 11:34:33AM -0300, Marcelo Tosatti wrote:
> Make use of the new qpw_{un,}lock*() and queue_percpu_work_on()
> interface to improve performance & latency on PREEMPT_RT kernels.
>
> For functions that may be scheduled in a different cpu, replace
> local_{un,}lock*() by qpw_{un,}lock*(), and replace schedule_work_on() by
> queue_percpu_work_on(). The same happens for flush_work() and
> flush_percpu_work().
>
> The change requires allocation of qpw_structs instead of a work_structs,
> and changing parameters of a few functions to include the cpu parameter.
>
> This should bring no relevant performance impact on non-RT kernels:
I think this is still referencing the previous version, as there may be an
impact on PREEMPT_RT=n kernels if QPW=y and qpw=1 is on the kernel cmdline.
I would go with:
This should bring no relevant performance impact on non-QPW kernels
> For functions that may be scheduled in a different cpu, the local_*lock's
> this_cpu_ptr() becomes a per_cpu_ptr(smp_processor_id()).
>
> Signed-off-by: Leonardo Bras <leobras@redhat.com>
> Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
>
> ---
> mm/internal.h | 4 +-
> mm/mlock.c | 71 ++++++++++++++++++++++++++++++++------------
> mm/page_alloc.c | 2 -
> mm/swap.c | 90 +++++++++++++++++++++++++++++++-------------------------
> 4 files changed, 108 insertions(+), 59 deletions(-)
>
> Index: slab/mm/mlock.c
> ===================================================================
> --- slab.orig/mm/mlock.c
> +++ slab/mm/mlock.c
> @@ -25,17 +25,16 @@
> #include <linux/memcontrol.h>
> #include <linux/mm_inline.h>
> #include <linux/secretmem.h>
> +#include <linux/qpw.h>
>
> #include "internal.h"
>
> struct mlock_fbatch {
> - local_lock_t lock;
> + qpw_lock_t lock;
> struct folio_batch fbatch;
> };
>
> -static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch) = {
> - .lock = INIT_LOCAL_LOCK(lock),
> -};
> +static DEFINE_PER_CPU(struct mlock_fbatch, mlock_fbatch);
>
> bool can_do_mlock(void)
> {
> @@ -209,18 +208,25 @@ static void mlock_folio_batch(struct fol
> folios_put(fbatch);
> }
>
> -void mlock_drain_local(void)
> +void mlock_drain_cpu(int cpu)
> {
> struct folio_batch *fbatch;
>
> - local_lock(&mlock_fbatch.lock);
> - fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
> + qpw_lock(&mlock_fbatch.lock, cpu);
> + fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
> if (folio_batch_count(fbatch))
> mlock_folio_batch(fbatch);
> - local_unlock(&mlock_fbatch.lock);
> + qpw_unlock(&mlock_fbatch.lock, cpu);
> }
>
> -void mlock_drain_remote(int cpu)
> +void mlock_drain_local(void)
> +{
> + migrate_disable();
> + mlock_drain_cpu(smp_processor_id());
> + migrate_enable();
> +}
> +
> +void mlock_drain_offline(int cpu)
> {
> struct folio_batch *fbatch;
>
> @@ -242,9 +248,12 @@ bool need_mlock_drain(int cpu)
> void mlock_folio(struct folio *folio)
> {
> struct folio_batch *fbatch;
> + int cpu;
>
> - local_lock(&mlock_fbatch.lock);
> - fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
> + migrate_disable();
> + cpu = smp_processor_id();
Wondering if for these cases it would make sense to have something like:
qpw_get_local_cpu() and
qpw_put_local_cpu()
so we could encapsulate these migrate_{en,dis}able()
and the smp_processor_id().
Or even,
int qpw_local_lock() {
migrate_disable();
cpu = smp_processor_id();
qpw_lock(..., cpu);
return cpu;
}
and
qpw_local_unlock(cpu){
qpw_unlock(...,cpu);
migrate_enable();
}
so it's more direct to convert the local-only cases.
What do you think?
> + qpw_lock(&mlock_fbatch.lock, cpu);
> + fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
>
> if (!folio_test_set_mlocked(folio)) {
> int nr_pages = folio_nr_pages(folio);
> @@ -257,7 +266,8 @@ void mlock_folio(struct folio *folio)
> if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
> !folio_may_be_lru_cached(folio) || lru_cache_disabled())
> mlock_folio_batch(fbatch);
> - local_unlock(&mlock_fbatch.lock);
> + qpw_unlock(&mlock_fbatch.lock, cpu);
> + migrate_enable();
> }
>
> /**
> @@ -268,9 +278,13 @@ void mlock_new_folio(struct folio *folio
> {
> struct folio_batch *fbatch;
> int nr_pages = folio_nr_pages(folio);
> + int cpu;
> +
> + migrate_disable();
> + cpu = smp_processor_id();
> + qpw_lock(&mlock_fbatch.lock, cpu);
>
> - local_lock(&mlock_fbatch.lock);
> - fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
> + fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
> folio_set_mlocked(folio);
>
> zone_stat_mod_folio(folio, NR_MLOCK, nr_pages);
> @@ -280,7 +294,8 @@ void mlock_new_folio(struct folio *folio
> if (!folio_batch_add(fbatch, mlock_new(folio)) ||
> !folio_may_be_lru_cached(folio) || lru_cache_disabled())
> mlock_folio_batch(fbatch);
> - local_unlock(&mlock_fbatch.lock);
> + migrate_enable();
> + qpw_unlock(&mlock_fbatch.lock, cpu);
In the conversion above, migrate_enable() happens after qpw_unlock(),
and in this one it is the opposite. Any particular reason?
> }
>
> /**
> @@ -290,9 +305,13 @@ void mlock_new_folio(struct folio *folio
> void munlock_folio(struct folio *folio)
> {
> struct folio_batch *fbatch;
> + int cpu;
>
> - local_lock(&mlock_fbatch.lock);
> - fbatch = this_cpu_ptr(&mlock_fbatch.fbatch);
> + migrate_disable();
> + cpu = smp_processor_id();
> + qpw_lock(&mlock_fbatch.lock, cpu);
> +
> + fbatch = per_cpu_ptr(&mlock_fbatch.fbatch, cpu);
> /*
> * folio_test_clear_mlocked(folio) must be left to __munlock_folio(),
> * which will check whether the folio is multiply mlocked.
> @@ -301,7 +320,8 @@ void munlock_folio(struct folio *folio)
> if (!folio_batch_add(fbatch, folio) ||
> !folio_may_be_lru_cached(folio) || lru_cache_disabled())
> mlock_folio_batch(fbatch);
> - local_unlock(&mlock_fbatch.lock);
> + qpw_unlock(&mlock_fbatch.lock, cpu);
> + migrate_enable();
> }
>
> static inline unsigned int folio_mlock_step(struct folio *folio,
> @@ -823,3 +843,18 @@ void user_shm_unlock(size_t size, struct
> spin_unlock(&shmlock_user_lock);
> put_ucounts(ucounts);
> }
> +
> +int __init mlock_init(void)
> +{
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu) {
> + struct mlock_fbatch *fbatch = &per_cpu(mlock_fbatch, cpu);
> +
> + qpw_lock_init(&fbatch->lock);
> + }
> +
> + return 0;
> +}
> +
> +module_init(mlock_init);
> Index: slab/mm/swap.c
> ===================================================================
> --- slab.orig/mm/swap.c
> +++ slab/mm/swap.c
> @@ -35,7 +35,7 @@
> #include <linux/uio.h>
> #include <linux/hugetlb.h>
> #include <linux/page_idle.h>
> -#include <linux/local_lock.h>
> +#include <linux/qpw.h>
> #include <linux/buffer_head.h>
>
> #include "internal.h"
> @@ -52,7 +52,7 @@ struct cpu_fbatches {
> * The following folio batches are grouped together because they are protected
> * by disabling preemption (and interrupts remain enabled).
> */
> - local_lock_t lock;
> + qpw_lock_t lock;
> struct folio_batch lru_add;
> struct folio_batch lru_deactivate_file;
> struct folio_batch lru_deactivate;
> @@ -61,14 +61,11 @@ struct cpu_fbatches {
> struct folio_batch lru_activate;
> #endif
> /* Protecting the following batches which require disabling interrupts */
> - local_lock_t lock_irq;
> + qpw_lock_t lock_irq;
> struct folio_batch lru_move_tail;
> };
>
> -static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
> - .lock = INIT_LOCAL_LOCK(lock),
> - .lock_irq = INIT_LOCAL_LOCK(lock_irq),
> -};
> +static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches);
>
> static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
> unsigned long *flagsp)
> @@ -183,22 +180,24 @@ static void __folio_batch_add_and_move(s
> struct folio *folio, move_fn_t move_fn, bool disable_irq)
> {
> unsigned long flags;
> + int cpu;
>
> folio_get(folio);
don't we need the migrate_disable() here?
>
> + cpu = smp_processor_id();
> if (disable_irq)
> - local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
> + qpw_lock_irqsave(&cpu_fbatches.lock_irq, flags, cpu);
> else
> - local_lock(&cpu_fbatches.lock);
> + qpw_lock(&cpu_fbatches.lock, cpu);
>
> - if (!folio_batch_add(this_cpu_ptr(fbatch), folio) ||
> + if (!folio_batch_add(per_cpu_ptr(fbatch, cpu), folio) ||
> !folio_may_be_lru_cached(folio) || lru_cache_disabled())
> - folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
> + folio_batch_move_lru(per_cpu_ptr(fbatch, cpu), move_fn);
>
> if (disable_irq)
> - local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
> + qpw_unlock_irqrestore(&cpu_fbatches.lock_irq, flags, cpu);
> else
> - local_unlock(&cpu_fbatches.lock);
> + qpw_unlock(&cpu_fbatches.lock, cpu);
> }
>
> #define folio_batch_add_and_move(folio, op) \
> @@ -358,9 +357,10 @@ static void __lru_cache_activate_folio(s
> {
> struct folio_batch *fbatch;
> int i;
and here?
> + int cpu = smp_processor_id();
>
> - local_lock(&cpu_fbatches.lock);
> - fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
> + qpw_lock(&cpu_fbatches.lock, cpu);
> + fbatch = per_cpu_ptr(&cpu_fbatches.lru_add, cpu);
>
> /*
> * Search backwards on the optimistic assumption that the folio being
> @@ -381,7 +381,7 @@ static void __lru_cache_activate_folio(s
> }
> }
>
> - local_unlock(&cpu_fbatches.lock);
> + qpw_unlock(&cpu_fbatches.lock, cpu);
> }
>
> #ifdef CONFIG_LRU_GEN
> @@ -653,9 +653,9 @@ void lru_add_drain_cpu(int cpu)
> unsigned long flags;
>
> /* No harm done if a racing interrupt already did this */
> - local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
> + qpw_lock_irqsave(&cpu_fbatches.lock_irq, flags, cpu);
> folio_batch_move_lru(fbatch, lru_move_tail);
> - local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
> + qpw_unlock_irqrestore(&cpu_fbatches.lock_irq, flags, cpu);
> }
>
> fbatch = &fbatches->lru_deactivate_file;
> @@ -733,10 +733,12 @@ void folio_mark_lazyfree(struct folio *f
>
> void lru_add_drain(void)
> {
> - local_lock(&cpu_fbatches.lock);
> - lru_add_drain_cpu(smp_processor_id());
> - local_unlock(&cpu_fbatches.lock);
> - mlock_drain_local();
and here?
> + int cpu = smp_processor_id();
> +
> + qpw_lock(&cpu_fbatches.lock, cpu);
> + lru_add_drain_cpu(cpu);
> + qpw_unlock(&cpu_fbatches.lock, cpu);
> + mlock_drain_cpu(cpu);
> }
>
> /*
> @@ -745,30 +747,32 @@ void lru_add_drain(void)
> * the same cpu. It shouldn't be a problem in !SMP case since
> * the core is only one and the locks will disable preemption.
> */
> -static void lru_add_mm_drain(void)
> +static void lru_add_mm_drain(int cpu)
> {
> - local_lock(&cpu_fbatches.lock);
> - lru_add_drain_cpu(smp_processor_id());
> - local_unlock(&cpu_fbatches.lock);
> - mlock_drain_local();
> + qpw_lock(&cpu_fbatches.lock, cpu);
> + lru_add_drain_cpu(cpu);
> + qpw_unlock(&cpu_fbatches.lock, cpu);
> + mlock_drain_cpu(cpu);
> }
>
> void lru_add_drain_cpu_zone(struct zone *zone)
> {
> - local_lock(&cpu_fbatches.lock);
> - lru_add_drain_cpu(smp_processor_id());
and here ?
> + int cpu = smp_processor_id();
> +
> + qpw_lock(&cpu_fbatches.lock, cpu);
> + lru_add_drain_cpu(cpu);
> drain_local_pages(zone);
> - local_unlock(&cpu_fbatches.lock);
> - mlock_drain_local();
> + qpw_unlock(&cpu_fbatches.lock, cpu);
> + mlock_drain_cpu(cpu);
> }
>
> #ifdef CONFIG_SMP
>
> -static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
> +static DEFINE_PER_CPU(struct qpw_struct, lru_add_drain_qpw);
>
> -static void lru_add_drain_per_cpu(struct work_struct *dummy)
> +static void lru_add_drain_per_cpu(struct work_struct *w)
> {
> - lru_add_mm_drain();
> + lru_add_mm_drain(qpw_get_cpu(w));
> }
>
> static DEFINE_PER_CPU(struct work_struct, bh_add_drain_work);
> @@ -883,12 +887,12 @@ static inline void __lru_add_drain_all(b
> cpumask_clear(&has_mm_work);
> cpumask_clear(&has_bh_work);
> for_each_online_cpu(cpu) {
> - struct work_struct *mm_work = &per_cpu(lru_add_drain_work, cpu);
> + struct qpw_struct *mm_qpw = &per_cpu(lru_add_drain_qpw, cpu);
> struct work_struct *bh_work = &per_cpu(bh_add_drain_work, cpu);
>
> if (cpu_needs_mm_drain(cpu)) {
> - INIT_WORK(mm_work, lru_add_drain_per_cpu);
> - queue_work_on(cpu, mm_percpu_wq, mm_work);
> + INIT_QPW(mm_qpw, lru_add_drain_per_cpu, cpu);
> + queue_percpu_work_on(cpu, mm_percpu_wq, mm_qpw);
> __cpumask_set_cpu(cpu, &has_mm_work);
> }
>
> @@ -900,7 +904,7 @@ static inline void __lru_add_drain_all(b
> }
>
> for_each_cpu(cpu, &has_mm_work)
> - flush_work(&per_cpu(lru_add_drain_work, cpu));
> + flush_percpu_work(&per_cpu(lru_add_drain_qpw, cpu));
>
> for_each_cpu(cpu, &has_bh_work)
> flush_work(&per_cpu(bh_add_drain_work, cpu));
> @@ -950,7 +954,7 @@ void lru_cache_disable(void)
> #ifdef CONFIG_SMP
> __lru_add_drain_all(true);
> #else
> - lru_add_mm_drain();
and here, I wonder
> + lru_add_mm_drain(smp_processor_id());
> invalidate_bh_lrus_cpu();
> #endif
> }
> @@ -1124,6 +1128,7 @@ static const struct ctl_table swap_sysct
> void __init swap_setup(void)
> {
> unsigned long megs = PAGES_TO_MB(totalram_pages());
> + unsigned int cpu;
>
> /* Use a smaller cluster for small-memory machines */
> if (megs < 16)
> @@ -1136,4 +1141,11 @@ void __init swap_setup(void)
> */
>
> register_sysctl_init("vm", swap_sysctl_table);
> +
> + for_each_possible_cpu(cpu) {
> + struct cpu_fbatches *fbatches = &per_cpu(cpu_fbatches, cpu);
> +
> + qpw_lock_init(&fbatches->lock);
> + qpw_lock_init(&fbatches->lock_irq);
> + }
> }
> Index: slab/mm/internal.h
> ===================================================================
> --- slab.orig/mm/internal.h
> +++ slab/mm/internal.h
> @@ -1061,10 +1061,12 @@ static inline void munlock_vma_folio(str
> munlock_folio(folio);
> }
>
> +int __init mlock_init(void);
> void mlock_new_folio(struct folio *folio);
> bool need_mlock_drain(int cpu);
> void mlock_drain_local(void);
> -void mlock_drain_remote(int cpu);
> +void mlock_drain_cpu(int cpu);
> +void mlock_drain_offline(int cpu);
>
> extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
>
> Index: slab/mm/page_alloc.c
> ===================================================================
> --- slab.orig/mm/page_alloc.c
> +++ slab/mm/page_alloc.c
> @@ -6251,7 +6251,7 @@ static int page_alloc_cpu_dead(unsigned
> struct zone *zone;
>
> lru_add_drain_cpu(cpu);
> - mlock_drain_remote(cpu);
> + mlock_drain_offline(cpu);
> drain_pages(cpu);
>
> /*
>
>
TBH, I am still trying to understand if we need the migrate_{en,dis}able():
- There is a data dependency between cpu being filled and being used.
- If we get the cpu, and then migrate to a different cpu, the operation
will still be executed with the data from that starting cpu.
- But maybe the compiler tries to optimize this because the processor number
can be kept in a register and is cheap to access, which would break this.
Maybe a READ_ONCE() on smp_processor_id() should suffice?
Other than that, all the conversions done look correct.
That being said, I understand very little about mm code, so let's hope we
get proper feedback from those who do :)
Thanks!
Leo
© 2016 - 2026 Red Hat, Inc.