[PATCH v3 3/3] mm/page_alloc: Batch page freeing in free_frozen_page_commit

Posted by Joshua Hahn 4 months, 1 week ago
Before returning, free_frozen_page_commit calls free_pcppages_bulk using
nr_pcp_free to determine how many pages can appropriately be freed,
based on the tunable parameters stored in pcp. While this number is an
accurate representation of how many pages should be freed in total, it
is not an appropriate number of pages to free in a single call to
free_pcppages_bulk: we have seen the value consistently go above 2000
in the Meta fleet on larger machines.

As such, perform batched page freeing in free_frozen_page_commit,
calling free_pcppages_bulk with at most pcp->batch pages at a time. To
ensure that other processes are not starved of the zone lock, release
both the zone lock and the pcp lock between batches to yield to other
threads.

Note that because free_frozen_page_commit now retakes the pcp spinlock
inside the function (and the trylock can fail), it may return with the
pcp unlocked. To handle this, return true if the pcp is locked on exit
and false otherwise.

In addition, since free_frozen_page_commit must now know which UP flags
were saved when the pcp lock was taken, and must be able to report new
UP flags back to its callers, add a new unsigned long *UP_flags
parameter to keep track of this.

Suggested-by: Chris Mason <clm@fb.com>
Co-developed-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Joshua Hahn <joshua.hahnjy@gmail.com>
---
 mm/page_alloc.c | 66 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f525f197c5fd..9b9f5a44496c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2818,12 +2818,21 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
 	return high;
 }
 
-static void free_frozen_page_commit(struct zone *zone,
+/*
+ * Tune the pcp alloc factor and adjust count & free_count. Free pages to
+ * bring pcp->count below the high watermark.
+ *
+ * May return with the pcp unlocked, if the pcp spinlock cannot be reacquired
+ * during page freeing. Return true if the pcp is locked, false otherwise.
+ */
+static bool free_frozen_page_commit(struct zone *zone,
 		struct per_cpu_pages *pcp, struct page *page, int migratetype,
-		unsigned int order, fpi_t fpi_flags)
+		unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
 {
 	int high, batch;
+	int to_free, to_free_batched;
 	int pindex;
+	int cpu = smp_processor_id();
 	bool free_high = false;
 
 	/*
@@ -2861,15 +2870,20 @@ static void free_frozen_page_commit(struct zone *zone,
 		 * Do not attempt to take a zone lock. Let pcp->count get
 		 * over high mark temporarily.
 		 */
-		return;
+		return true;
 	}
 
 	high = nr_pcp_high(pcp, zone, batch, free_high);
 	if (pcp->count < high)
-		return;
+		return true;
+
+	to_free = nr_pcp_free(pcp, batch, high, free_high);
+	if (to_free == 0)
+		return true;
 
-	free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
-			   pcp, pindex);
+free_batch:
+	to_free_batched = min(to_free, batch);
+	free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
 	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&
 	    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
 			      ZONE_MOVABLE, 0)) {
@@ -2887,6 +2901,35 @@ static void free_frozen_page_commit(struct zone *zone,
 		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
 			atomic_set(&pgdat->kswapd_failures, 0);
 	}
+	high = nr_pcp_high(pcp, zone, batch, free_high);
+	to_free -= to_free_batched;
+	if (pcp->count >= high) {
+		pcp_spin_unlock(pcp);
+		pcp_trylock_finish(*UP_flags);
+
+		pcp_trylock_prepare(*UP_flags);
+		pcp = pcp_spin_trylock(zone->per_cpu_pageset);
+		if (!pcp) {
+			pcp_trylock_finish(*UP_flags);
+			return false;
+		}
+
+		/*
+		 * Check if this thread has been migrated to a different
+		 * CPU. If that is the case, give up and indicate that
+		 * the pcp is returned in an unlocked state.
+		 */
+		if (smp_processor_id() != cpu) {
+			pcp_spin_unlock(pcp);
+			pcp_trylock_finish(*UP_flags);
+			return false;
+		}
+	}
+
+	if (to_free > 0 && pcp->count >= high)
+		goto free_batch;
+
+	return true;
 }
 
 /*
@@ -2934,7 +2977,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
 	pcp_trylock_prepare(UP_flags);
 	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
 	if (pcp) {
-		free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
+		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
+						order, fpi_flags, &UP_flags))
+			return;
 		pcp_spin_unlock(pcp);
 	} else {
 		free_one_page(zone, page, pfn, order, fpi_flags);
@@ -3034,8 +3079,11 @@ void free_unref_folios(struct folio_batch *folios)
 			migratetype = MIGRATE_MOVABLE;
 
 		trace_mm_page_free_batched(&folio->page);
-		free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
-					order, FPI_NONE);
+		if (!free_frozen_page_commit(zone, pcp, &folio->page,
+				migratetype, order, FPI_NONE, &UP_flags)) {
+			pcp = NULL;
+			locked_zone = NULL;
+		}
 	}
 
 	if (pcp) {
-- 
2.47.3
Re: [PATCH v3 3/3] mm/page_alloc: Batch page freeing in free_frozen_page_commit
Posted by Vlastimil Babka 4 months ago
On 10/2/25 22:46, Joshua Hahn wrote:

[...snip...]

> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index f525f197c5fd..9b9f5a44496c 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -2818,12 +2818,21 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone,
>  	return high;
>  }
>  
> -static void free_frozen_page_commit(struct zone *zone,
> +/*
> + * Tune the pcp alloc factor and adjust count & free_count. Free pages to
> + * bring pcp->count below the high watermark.
> + *
> + * May return with the pcp unlocked, if the pcp spinlock cannot be reacquired
> + * during page freeing. Return true if the pcp is locked, false otherwise.
> + */
> +static bool free_frozen_page_commit(struct zone *zone,
>  		struct per_cpu_pages *pcp, struct page *page, int migratetype,
> -		unsigned int order, fpi_t fpi_flags)
> +		unsigned int order, fpi_t fpi_flags, unsigned long *UP_flags)
>  {
>  	int high, batch;
> +	int to_free, to_free_batched;
>  	int pindex;
> +	int cpu = smp_processor_id();
>  	bool free_high = false;
>  
>  	/*
> @@ -2861,15 +2870,20 @@ static void free_frozen_page_commit(struct zone *zone,
>  		 * Do not attempt to take a zone lock. Let pcp->count get
>  		 * over high mark temporarily.
>  		 */
> -		return;
> +		return true;
>  	}
>  
>  	high = nr_pcp_high(pcp, zone, batch, free_high);
>  	if (pcp->count < high)
> -		return;
> +		return true;
> +
> +	to_free = nr_pcp_free(pcp, batch, high, free_high);
> +	if (to_free == 0)
> +		return true;
>  
> -	free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
> -			   pcp, pindex);
> +free_batch:
> +	to_free_batched = min(to_free, batch);
> +	free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
>  	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&

We could do this handling once after all batches. But maybe it's better to
act as soon as it becomes true, and checking once per batch isn't measurably
slower, dunno.

>  	    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
>  			      ZONE_MOVABLE, 0)) {
> @@ -2887,6 +2901,35 @@ static void free_frozen_page_commit(struct zone *zone,
>  		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
>  			atomic_set(&pgdat->kswapd_failures, 0);
>  	}
> +	high = nr_pcp_high(pcp, zone, batch, free_high);

It's not clear why we recalculate this. Ah I see, the calculation involves a
ZONE_BELOW_HIGH check which we might have changed above. So as a result this
patch isn't a straightforward "we free the same amount of pages but in
smaller batches" but something different (and I'm not immediately sure what
exactly).

I think it's an argument for doing the ZONE_BELOW_HIGH test above just once
in the end, and not recalculating "high" here. I'd just stick to the
"to_free" calculated uprofront and decreasing it by "to_free_batched" each
round.
We should maybe also check that pcp->count went to 0 and bail out so we
don't make useless iterations in rare cases when someone else drains the pcp
between our batches (free_pcppages_bulk() protects itself from passing too
high "count" so it would be just some wasted cpu otherwise, not a disaster).

> +	to_free -= to_free_batched;
> +	if (pcp->count >= high) {
> +		pcp_spin_unlock(pcp);
> +		pcp_trylock_finish(*UP_flags);
> +
> +		pcp_trylock_prepare(*UP_flags);
> +		pcp = pcp_spin_trylock(zone->per_cpu_pageset);
> +		if (!pcp) {
> +			pcp_trylock_finish(*UP_flags);
> +			return false;
> +		}
> +
> +		/*
> +		 * Check if this thread has been migrated to a different
> +		 * CPU. If that is the case, give up and indicate that
> +		 * the pcp is returned in an unlocked state.
> +		 */
> +		if (smp_processor_id() != cpu) {

We could have remembered the old pcp pointer and compared that instead of
using smp_processor_id(), although that should also work.
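E.g. something like this (untested, just to illustrate; the old_pcp local is
made up):

	struct per_cpu_pages *old_pcp = pcp;

	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
	if (pcp && pcp != old_pcp) {
		/* we were migrated to a different cpu, give up */
		pcp_spin_unlock(pcp);
		pcp = NULL;
	}
	if (!pcp) {
		pcp_trylock_finish(*UP_flags);
		return false;
	}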

> +			pcp_spin_unlock(pcp);
> +			pcp_trylock_finish(*UP_flags);
> +			return false;
> +		}
> +	}
> +
> +	if (to_free > 0 && pcp->count >= high)
> +		goto free_batch;
> +
> +	return true;
>  }
>  
>  /*
> @@ -2934,7 +2977,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
>  	pcp_trylock_prepare(UP_flags);
>  	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
>  	if (pcp) {
> -		free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
> +		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
> +						order, fpi_flags, &UP_flags))
> +			return;
>  		pcp_spin_unlock(pcp);
>  	} else {
>  		free_one_page(zone, page, pfn, order, fpi_flags);
> @@ -3034,8 +3079,11 @@ void free_unref_folios(struct folio_batch *folios)
>  			migratetype = MIGRATE_MOVABLE;
>  
>  		trace_mm_page_free_batched(&folio->page);
> -		free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
> -					order, FPI_NONE);
> +		if (!free_frozen_page_commit(zone, pcp, &folio->page,
> +				migratetype, order, FPI_NONE, &UP_flags)) {
> +			pcp = NULL;
> +			locked_zone = NULL;
> +		}
>  	}
>  
>  	if (pcp) {
Re: [PATCH v3 3/3] mm/page_alloc: Batch page freeing in free_frozen_page_commit
Posted by Joshua Hahn 4 months ago
On Fri, 10 Oct 2025 15:37:23 +0200 Vlastimil Babka <vbabka@suse.cz> wrote:

[...snip...]

> >  	high = nr_pcp_high(pcp, zone, batch, free_high);
> >  	if (pcp->count < high)
> > -		return;
> > +		return true;
> > +
> > +	to_free = nr_pcp_free(pcp, batch, high, free_high);
> > +	if (to_free == 0)
> > +		return true;
> >  
> > -	free_pcppages_bulk(zone, nr_pcp_free(pcp, batch, high, free_high),
> > -			   pcp, pindex);
> > +free_batch:
> > +	to_free_batched = min(to_free, batch);
> > +	free_pcppages_bulk(zone, to_free_batched, pcp, pindex);
> >  	if (test_bit(ZONE_BELOW_HIGH, &zone->flags) &&

Hello Vlastimil, thank you for your review!

> We could do this handling once after all batches. But maybe it's better to
> act as soon as it becomes true, and checking once per batch isn't measurably
> slower, dunno.

My thinking was that if we release the pcp and zone locks and another task
remotely inspects this zone in the middle of the loop, I would want the
ZONE_BELOW_HIGH bit to reflect the current state of the zone, as opposed to
only updating it once at the end.

> >  	    zone_watermark_ok(zone, 0, high_wmark_pages(zone),
> >  			      ZONE_MOVABLE, 0)) {
> > @@ -2887,6 +2901,35 @@ static void free_frozen_page_commit(struct zone *zone,
> >  		    next_memory_node(pgdat->node_id) < MAX_NUMNODES)
> >  			atomic_set(&pgdat->kswapd_failures, 0);
> >  	}
> > +	high = nr_pcp_high(pcp, zone, batch, free_high);
> 
> It's not clear why we recalculate this. Ah I see, the calculation involves a
> ZONE_BELOW_HIGH check which we might have changed above. So as a result this
> patch isn't a straightforward "we free the same amount of pages but in
> smaller batches" but something different (and I'm not immediately sure what
> exactly).

Yes! nr_pcp_high will recompute what the new high should be, and from that
we can see whether we need to free some more pages. My goal was to avoid
having to call free_frozen_page_commit again later once we realize that
there are still pages to be freed, and instead handle that case within this
loop as if it were part of what we originally decided to free.

> I think it's an argument for doing the ZONE_BELOW_HIGH test above just once
> in the end, and not recalculating "high" here. I'd just stick to the
> "to_free" calculated uprofront and decreasing it by "to_free_batched" each
> round.

Yes, this makes sense to me. The last sentence in my paragraph above seems
a little sneaky in doing more work than the function was originally called
to do, so I agree that we should do the ZONE_BELOW_HIGH check once, and
also calculate how many pages to free just once, up front.

> We should maybe also check that pcp->count went to 0 and bail out so we
> don't make useless iterations in rare cases when someone else drains the pcp
> between our batches (free_pcppages_bulk() protects itself from passing too
> high "count" so it would be just some wasted cpu otherwise, not a disaster).

Hopefully the check below is enough to handle this race. When we decide
whether to iterate through the loop again, we check that
to_free > 0 && pcp->count >= high, so if someone else were to drain the
pcp, pcp->count would go under high (I don't think it's possible for high
to become negative here), which would break the check and prevent another
iteration.

> > +	to_free -= to_free_batched;
> > +	if (pcp->count >= high) {
> > +		pcp_spin_unlock(pcp);
> > +		pcp_trylock_finish(*UP_flags);
> > +
> > +		pcp_trylock_prepare(*UP_flags);
> > +		pcp = pcp_spin_trylock(zone->per_cpu_pageset);
> > +		if (!pcp) {
> > +			pcp_trylock_finish(*UP_flags);
> > +			return false;
> > +		}
> > +
> > +		/*
> > +		 * Check if this thread has been migrated to a different
> > +		 * CPU. If that is the case, give up and indicate that
> > +		 * the pcp is returned in an unlocked state.
> > +		 */
> > +		if (smp_processor_id() != cpu) {
> 
> We could have remembered the old pcp pointer and compared that instead of
> using smp_processor_id(), although that should also work.

I think that would have also worked. To be completely honest, I wasn't
sure if there was some pcp stuff inside that would change the pointer,
so I just went with using the CPU id instead.

> > +			pcp_spin_unlock(pcp);
> > +			pcp_trylock_finish(*UP_flags);
> > +			return false;
> > +		}
> > +	}
> > +
> > +	if (to_free > 0 && pcp->count >= high)

           Check is here:  ^^^^^^^^^^^^^^^^^^

> > +		goto free_batch;
> > +
> > +	return true;
> >  }
> >  
> >  /*
> > @@ -2934,7 +2977,9 @@ static void __free_frozen_pages(struct page *page, unsigned int order,
> >  	pcp_trylock_prepare(UP_flags);
> >  	pcp = pcp_spin_trylock(zone->per_cpu_pageset);
> >  	if (pcp) {
> > -		free_frozen_page_commit(zone, pcp, page, migratetype, order, fpi_flags);
> > +		if (!free_frozen_page_commit(zone, pcp, page, migratetype,
> > +						order, fpi_flags, &UP_flags))
> > +			return;
> >  		pcp_spin_unlock(pcp);
> >  	} else {
> >  		free_one_page(zone, page, pfn, order, fpi_flags);
> > @@ -3034,8 +3079,11 @@ void free_unref_folios(struct folio_batch *folios)
> >  			migratetype = MIGRATE_MOVABLE;
> >  
> >  		trace_mm_page_free_batched(&folio->page);
> > -		free_frozen_page_commit(zone, pcp, &folio->page, migratetype,
> > -					order, FPI_NONE);
> > +		if (!free_frozen_page_commit(zone, pcp, &folio->page,
> > +				migratetype, order, FPI_NONE, &UP_flags)) {
> > +			pcp = NULL;
> > +			locked_zone = NULL;
> > +		}
> >  	}
> >  
> >  	if (pcp) {

Thank you again for your review, Vlastimil. I agree with all of your
feedback, so I will make those changes (as well as the commit message
change for patch 2/3) and submit a new version. I hope you have a great day!
Joshua