LRU batching can be a source of disturbance for isolated workloads
running in userspace, because draining the batches requires a kernel
worker, and that worker preempts the isolated task. The primary source
of such disruption is __lru_add_drain_all(), which can be triggered
from non-isolated CPUs.
Why would an isolated CPU have anything on the pcp cache in the first
place? Many syscalls allocate pages that might end up there. A typical
and unavoidable one is fork/exec, which leaves pages behind on the
cache, just waiting for somebody to drain them.
Address the problem by noting when a batch has been added to the cache
and scheduling its draining upon return to userspace, so the work is
done while the syscall is still executing and there are no surprises
while the task runs in userspace, where it doesn't want to be
preempted.
Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
---
include/linux/pagevec.h | 18 ++----------------
include/linux/swap.h | 1 +
kernel/sched/isolation.c | 3 +++
mm/swap.c | 30 +++++++++++++++++++++++++++++-
4 files changed, 35 insertions(+), 17 deletions(-)
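
For reference, here is a minimal sketch of how the drain-on-return path is
expected to hang together. It is illustrative only: __isolated_task_work_queue()
is provided by the surrounding series and its body is not shown in this patch,
so the version below, and the task_struct::isolated_work field it uses, are
assumptions made for the example rather than the actual implementation.

#include <linux/sched.h>
#include <linux/sched/isolation.h>
#include <linux/swap.h>
#include <linux/task_work.h>

/* Runs from task_work, in the task's own context, on the way back to userspace. */
static void isolated_task_work(struct callback_head *head)
{
	lru_add_and_bh_lrus_drain();
}

int __isolated_task_work_queue(void)
{
	/* Housekeeping CPUs keep relying on the workqueue-based drain. */
	if (housekeeping_cpu(raw_smp_processor_id(), HK_TYPE_KERNEL_NOISE))
		return 0;

	/*
	 * Hypothetical per-task callback_head; a real implementation also
	 * has to make sure the same callback_head is not queued twice.
	 */
	init_task_work(&current->isolated_work, isolated_task_work);
	return task_work_add(current, &current->isolated_work, TWA_RESUME);
}

The point is that the drain runs in the context of the isolated task itself,
just before it re-enters userspace, rather than from a per-CPU workqueue item
that would preempt the task at an arbitrary later time.
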
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 5d3a0cccc6bf..7e647b8df4c7 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -61,22 +61,8 @@ static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
return PAGEVEC_SIZE - fbatch->nr;
}
-/**
- * folio_batch_add() - Add a folio to a batch.
- * @fbatch: The folio batch.
- * @folio: The folio to add.
- *
- * The folio is added to the end of the batch.
- * The batch must have previously been initialised using folio_batch_init().
- *
- * Return: The number of slots still available.
- */
-static inline unsigned folio_batch_add(struct folio_batch *fbatch,
- struct folio *folio)
-{
- fbatch->folios[fbatch->nr++] = folio;
- return folio_batch_space(fbatch);
-}
+unsigned int folio_batch_add(struct folio_batch *fbatch,
+ struct folio *folio);
/**
* folio_batch_next - Return the next folio to process.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index bc0e1c275fc0..d74ad6c893a1 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -401,6 +401,7 @@ extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
+extern void lru_add_and_bh_lrus_drain(void);
void folio_deactivate(struct folio *folio);
void folio_mark_lazyfree(struct folio *folio);
extern void swap_setup(void);
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index d74c4ef91ce2..06882916c24f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -8,6 +8,8 @@
*
*/
+#include <linux/swap.h>
+
enum hk_flags {
HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN),
HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ),
@@ -253,6 +255,7 @@ __setup("isolcpus=", housekeeping_isolcpus_setup);
#ifdef CONFIG_NO_HZ_FULL_WORK
static void isolated_task_work(struct callback_head *head)
{
+ lru_add_and_bh_lrus_drain();
}
int __isolated_task_work_queue(void)
diff --git a/mm/swap.c b/mm/swap.c
index 4fc322f7111a..da08c918cef4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -37,6 +37,7 @@
#include <linux/page_idle.h>
#include <linux/local_lock.h>
#include <linux/buffer_head.h>
+#include <linux/sched/isolation.h>
#include "internal.h"
@@ -155,6 +156,29 @@ static void lru_add(struct lruvec *lruvec, struct folio *folio)
trace_mm_lru_insertion(folio);
}
+/**
+ * folio_batch_add() - Add a folio to a batch.
+ * @fbatch: The folio batch.
+ * @folio: The folio to add.
+ *
+ * The folio is added to the end of the batch.
+ * The batch must have previously been initialised using folio_batch_init().
+ *
+ * Return: The number of slots still available.
+ */
+unsigned int folio_batch_add(struct folio_batch *fbatch,
+ struct folio *folio)
+{
+ unsigned int ret;
+
+ fbatch->folios[fbatch->nr++] = folio;
+ ret = folio_batch_space(fbatch);
+ isolated_task_work_queue();
+
+ return ret;
+}
+EXPORT_SYMBOL(folio_batch_add);
+
static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
{
int i;
@@ -738,7 +762,7 @@ void lru_add_drain(void)
* the same cpu. It shouldn't be a problem in !SMP case since
* the core is only one and the locks will disable preemption.
*/
-static void lru_add_and_bh_lrus_drain(void)
+void lru_add_and_bh_lrus_drain(void)
{
local_lock(&cpu_fbatches.lock);
lru_add_drain_cpu(smp_processor_id());
@@ -864,6 +888,10 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
+ /* Isolated CPUs handle their cache upon return to userspace */
+ if (IS_ENABLED(CONFIG_NO_HZ_FULL_WORK) && !housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
+ continue;
+
if (cpu_needs_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
--
2.48.1
On Thu, Jul 03, 2025 at 04:07:17PM +0200, Frederic Weisbecker wrote:
> +unsigned int folio_batch_add(struct folio_batch *fbatch,
> +			     struct folio *folio)
> +{
> +	unsigned int ret;
> +
> +	fbatch->folios[fbatch->nr++] = folio;
> +	ret = folio_batch_space(fbatch);
> +	isolated_task_work_queue();

Umm. LRUs use folio_batches, but they are definitely not the only user
of folio_batches. Maybe you want to add a new lru_batch_add()
abstraction, because this call is definitely being done at the wrong
level.
On Thu 03-07-25 15:28:23, Matthew Wilcox wrote:
> On Thu, Jul 03, 2025 at 04:07:17PM +0200, Frederic Weisbecker wrote:
> > +unsigned int folio_batch_add(struct folio_batch *fbatch,
> > +			     struct folio *folio)
> > +{
> > +	unsigned int ret;
> > +
> > +	fbatch->folios[fbatch->nr++] = folio;
> > +	ret = folio_batch_space(fbatch);
> > +	isolated_task_work_queue();
> 
> Umm. LRUs use folio_batches, but they are definitely not the only user
> of folio_batches. Maybe you want to add a new lru_batch_add()
> abstraction, because this call is definitely being done at the wrong
> level.

You have answered one of my questions in the other response. My initial
thought was that __lru_add_drain_all seems to be a better fit. But then
we have a problem that draining will become an unbounded operation, which
will become a problem for lru_cache_disable, which will never converge
until the isolated workload does the draining. So it indeed seems like we
need to queue the draining when a page is added. Are there other places
where we put folios into the folio_batch than folio_batch_add? I cannot
seem to see any...
-- 
Michal Hocko
SUSE Labs
On 7/3/25 18:12, Michal Hocko wrote:
> On Thu 03-07-25 15:28:23, Matthew Wilcox wrote:
>> On Thu, Jul 03, 2025 at 04:07:17PM +0200, Frederic Weisbecker wrote:
>> > +unsigned int folio_batch_add(struct folio_batch *fbatch,
>> > +			     struct folio *folio)
>> > +{
>> > +	unsigned int ret;
>> > +
>> > +	fbatch->folios[fbatch->nr++] = folio;
>> > +	ret = folio_batch_space(fbatch);
>> > +	isolated_task_work_queue();
>> 
>> Umm. LRUs use folio_batches, but they are definitely not the only user
>> of folio_batches. Maybe you want to add a new lru_batch_add()
>> abstraction, because this call is definitely being done at the wrong
>> level.
> 
> You have answered one of my questions in the other response. My initial
> thought was that __lru_add_drain_all seems to be a better fit. But then

__lru_add_drain_all is the part where we queue the drain work to other cpus.
In order to not disrupt isolated cpus, the isolated cpus have to self-schedule
the work to be done on resume, which indeed means at the moment they are
filling the batches to be drained, as this patch does.

> we have a problem that draining will become an unbounded operation, which
> will become a problem for lru_cache_disable, which will never converge
> until the isolated workload does the draining. So it indeed seems like we
> need to queue the draining when a page is added. Are there other places
> where we put folios into the folio_batch than folio_batch_add? I cannot
> seem to see any...

The problem Matthew points out isn't that there would be other places that
add folios to a fbatch, other than folio_batch_add(). The problem is that
many of the folio_batch_add() calls add something to a temporary batch on
the stack, which is not subject to lru draining, so it's wasteful to queue
the draining for those.

The below diff should address this by creating folio_batch_add_lru(), which
is only used in the right places that do fill a lru-drainable folio batch.

It also makes the function static inline again, in mm/internal.h, which means
adding some includes to it. For that I had to fix up some wrong include
placements of internal.h in various mm files - these includes should not
appear below "#define CREATE_TRACE_POINTS".

Frederic, feel free to fold this in your patch with my Co-developed-by:

I also noted that since this now handles invalidate_bh_lrus_cpu(), maybe we
can drop the cpu_is_isolated() check from bh_lru_install(). I'm not even sure
if the check is equivalent to or stronger than the check used in
isolated_task_work_queue().

I have a bit of remaining worry about __lru_add_drain_all() simply skipping
the isolated cpus, because then it doesn't wait for the flushing to be
finished via flush_work(). It can thus return before the isolated cpu does
its drain-on-resume, which might violate some expectations of
lru_cache_disable(). Is there a possibility to make it wait for the
drain-on-resume, or determine there isn't any, in a reliable way?
----8<----
From eb6fb0b60a2ede567539e4be071cb92ff5ec221a Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 17 Jul 2025 21:05:48 +0200
Subject: [PATCH] mm: introduce folio_batch_add_lru

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
---
 include/linux/pagevec.h | 18 ++++++++++++++++--
 include/linux/swap.h    |  2 +-
 mm/internal.h           | 19 +++++++++++++++++--
 mm/mlock.c              |  6 +++---
 mm/mmap.c               |  4 ++--
 mm/percpu-vm.c          |  2 --
 mm/percpu.c             |  5 +++--
 mm/rmap.c               |  4 ++--
 mm/swap.c               | 25 +------------------------
 mm/vmalloc.c            |  6 +++---
 10 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 7e647b8df4c7..5d3a0cccc6bf 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -61,8 +61,22 @@ static inline unsigned int folio_batch_space(struct folio_batch *fbatch)
 	return PAGEVEC_SIZE - fbatch->nr;
 }
 
-unsigned int folio_batch_add(struct folio_batch *fbatch,
-			     struct folio *folio);
+/**
+ * folio_batch_add() - Add a folio to a batch.
+ * @fbatch: The folio batch.
+ * @folio: The folio to add.
+ *
+ * The folio is added to the end of the batch.
+ * The batch must have previously been initialised using folio_batch_init().
+ *
+ * Return: The number of slots still available.
+ */
+static inline unsigned folio_batch_add(struct folio_batch *fbatch,
+		struct folio *folio)
+{
+	fbatch->folios[fbatch->nr++] = folio;
+	return folio_batch_space(fbatch);
+}
 
 /**
  * folio_batch_next - Return the next folio to process.
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d74ad6c893a1..0bede5cd4f61 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -401,7 +401,7 @@ extern void lru_add_drain(void);
 extern void lru_add_drain_cpu(int cpu);
 extern void lru_add_drain_cpu_zone(struct zone *zone);
 extern void lru_add_drain_all(void);
-extern void lru_add_and_bh_lrus_drain(void);
+void lru_add_and_bh_lrus_drain(void);
 void folio_deactivate(struct folio *folio);
 void folio_mark_lazyfree(struct folio *folio);
 extern void swap_setup(void);
diff --git a/mm/internal.h b/mm/internal.h
index 6b8ed2017743..c2848819ce5f 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -18,12 +18,12 @@
 #include <linux/swapops.h>
 #include <linux/swap_cgroup.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/pagevec.h>
+#include <linux/sched/isolation.h>
 
 /* Internal core VMA manipulation functions. */
 #include "vma.h"
 
-struct folio_batch;
-
 /*
  * Maintains state across a page table move. The operation assumes both source
  * and destination VMAs already exist and are specified by the user.
@@ -414,6 +414,21 @@ static inline vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 	return ret;
 }
 
+/*
+ * A version of folio_batch_add() to use with batches that are drained from
+ * lru_add_drain() and checked in cpu_needs_drain()
+ */
+static inline unsigned int
+folio_batch_add_lru(struct folio_batch *fbatch, struct folio *folio)
+{
+	/*
+	 * We could perhaps not queue when returning 0 which means the caller
+	 * should flush the batch immediately, but that's rare anyway.
+	 */
+	isolated_task_work_queue();
+	return folio_batch_add(fbatch, folio);
+}
+
 vm_fault_t do_swap_page(struct vm_fault *vmf);
 void folio_rotate_reclaimable(struct folio *folio);
 bool __folio_end_writeback(struct folio *folio);
diff --git a/mm/mlock.c b/mm/mlock.c
index 3cb72b579ffd..a95f248efdf2 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -254,7 +254,7 @@ void mlock_folio(struct folio *folio)
 	}
 
 	folio_get(folio);
-	if (!folio_batch_add(fbatch, mlock_lru(folio)) ||
+	if (!folio_batch_add_lru(fbatch, mlock_lru(folio)) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
@@ -277,7 +277,7 @@ void mlock_new_folio(struct folio *folio)
 	__count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
 
 	folio_get(folio);
-	if (!folio_batch_add(fbatch, mlock_new(folio)) ||
+	if (!folio_batch_add_lru(fbatch, mlock_new(folio)) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
@@ -298,7 +298,7 @@ void munlock_folio(struct folio *folio)
 	 * which will check whether the folio is multiply mlocked.
 	 */
 	folio_get(folio);
-	if (!folio_batch_add(fbatch, folio) ||
+	if (!folio_batch_add_lru(fbatch, folio) ||
 	    folio_test_large(folio) || lru_cache_disabled())
 		mlock_folio_batch(fbatch);
 	local_unlock(&mlock_fbatch.lock);
diff --git a/mm/mmap.c b/mm/mmap.c
index 09c563c95112..4d9a5d8616c3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -54,11 +54,11 @@
 #include <asm/tlb.h>
 #include <asm/mmu_context.h>
 
+#include "internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/mmap.h>
 
-#include "internal.h"
-
 #ifndef arch_mmap_check
 #define arch_mmap_check(addr, len, flags)	(0)
 #endif
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index cd69caf6aa8d..cd3aa1610294 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -8,8 +8,6 @@
  * Chunks are mapped into vmalloc areas and populated page by page.
  * This is the default chunk allocator.
  */
-#include "internal.h"
-
 static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
 				    unsigned int cpu, int page_idx)
 {
diff --git a/mm/percpu.c b/mm/percpu.c
index b35494c8ede2..b8c34a931205 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -93,11 +93,12 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
+#include "internal.h"
+#include "percpu-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/percpu.h>
 
-#include "percpu-internal.h"
-
 /*
  * The slots are sorted by the size of the biggest continuous free area.
  * 1-31 bytes share the same slot.
diff --git a/mm/rmap.c b/mm/rmap.c
index fb63d9256f09..a4eab1664e25 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -78,12 +78,12 @@
 
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/tlb.h>
 #include <trace/events/migrate.h>
 
-#include "internal.h"
-
 static struct kmem_cache *anon_vma_cachep;
 static struct kmem_cache *anon_vma_chain_cachep;
 
diff --git a/mm/swap.c b/mm/swap.c
index da08c918cef4..98897171adaf 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -156,29 +156,6 @@ static void lru_add(struct lruvec *lruvec, struct folio *folio)
 	trace_mm_lru_insertion(folio);
 }
 
-/**
- * folio_batch_add() - Add a folio to a batch.
- * @fbatch: The folio batch.
- * @folio: The folio to add.
- *
- * The folio is added to the end of the batch.
- * The batch must have previously been initialised using folio_batch_init().
- *
- * Return: The number of slots still available.
- */
-unsigned int folio_batch_add(struct folio_batch *fbatch,
-		struct folio *folio)
-{
-	unsigned int ret;
-
-	fbatch->folios[fbatch->nr++] = folio;
-	ret = folio_batch_space(fbatch);
-	isolated_task_work_queue();
-
-	return ret;
-}
-EXPORT_SYMBOL(folio_batch_add);
-
 static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
 {
 	int i;
@@ -215,7 +192,7 @@ static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
 	else
 		local_lock(&cpu_fbatches.lock);
 
-	if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
+	if (!folio_batch_add_lru(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
 	    lru_cache_disabled())
 		folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
 
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ab986dd09b6a..b3a28e353b7e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -44,12 +44,12 @@
 #include <asm/shmparam.h>
 #include <linux/page_owner.h>
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/vmalloc.h>
-
 #include "internal.h"
 #include "pgalloc-track.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/vmalloc.h>
+
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
 static unsigned int __ro_after_init ioremap_max_page_shift = BITS_PER_LONG - 1;
 
-- 
2.50.1
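
To make the lru_cache_disable() expectation mentioned above concrete, here is a
simplified sketch of its structure (an approximation of current mainline
mm/swap.c, not code from this series): callers rely on every CPU's LRU batches
being drained by the time it returns.

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>

atomic_t lru_disable_count = ATOMIC_INIT(0);

void lru_cache_disable(void)
{
	atomic_inc(&lru_disable_count);

	/*
	 * Make sure every CPU observes lru_disable_count != 0 before the
	 * drain below, so nothing new gets batched afterwards.
	 */
	synchronize_rcu_expedited();

	/*
	 * Flush whatever is already sitting in the per-CPU batches. If
	 * __lru_add_drain_all() skips isolated CPUs without waiting for
	 * their drain-on-resume, this guarantee is only restored once the
	 * isolated task next returns to userspace.
	 */
	__lru_add_drain_all(true);
}

This is why skipping isolated CPUs in __lru_add_drain_all() without any form of
completion, as discussed above, may need extra care.
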
On Thu 03-07-25 16:07:17, Frederic Weisbecker wrote:
[...]
> @@ -864,6 +888,10 @@ static inline void __lru_add_drain_all(bool force_all_cpus)
>  	for_each_online_cpu(cpu) {
>  		struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
>  
> +		/* Isolated CPUs handle their cache upon return to userspace */
> +		if (IS_ENABLED(CONFIG_NO_HZ_FULL_WORK) && !housekeeping_cpu(cpu, HK_TYPE_KERNEL_NOISE))
> +			continue;
> +

Two questions. Where do you actually queue the work to be executed on
return to userspace? And why don't you do that only if cpu_needs_drain?

>  		if (cpu_needs_drain(cpu)) {
>  			INIT_WORK(work, lru_add_drain_per_cpu);
>  			queue_work_on(cpu, mm_percpu_wq, work);
> -- 
> 2.48.1

-- 
Michal Hocko
SUSE Labs