From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 2501BC433F5
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 09:59:37 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377428AbiDTKCU (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:02:20 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:33990 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1377430AbiDTKCP (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:02:15 -0400
Received: from outbound-smtp07.blacknight.com (outbound-smtp07.blacknight.com
 [46.22.139.12])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 170253C727
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 02:59:29 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp07.blacknight.com (Postfix) with ESMTPS id B5CC41C418C
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 10:59:28 +0100 (IST)
Received: (qmail 10287 invoked from network); 20 Apr 2022 09:59:28 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 09:59:28 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 1/6] mm/page_alloc: Add page->buddy_list and page->pcp_list
Date: Wed, 20 Apr 2022 10:59:01 +0100
Message-Id: <20220420095906.27349-2-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The page allocator uses page->lru for storing pages on either buddy or
PCP lists. Create page->buddy_list and page->pcp_list as a union with
page->lru. This is simply to clarify what type of list a page is on
in the page allocator.

No functional change intended.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mm_types.h |  5 +++++
 mm/page_alloc.c          | 18 +++++++++---------
 2 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8834e38c06a4..a2782e8af307 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -87,6 +87,7 @@ struct page {
 			 */
 			union {
 				struct list_head lru;
+
 				/* Or, for the Unevictable "LRU list" slot */
 				struct {
 					/* Always even, to negate PageTail */
@@ -94,6 +95,10 @@ struct page {
 					/* Count page's or folio's mlocks */
 					unsigned int mlock_count;
 				};
+
+				/* Or, free page */
+				struct list_head buddy_list;
+				struct list_head pcp_list;
 			};
 			/* See page-flags.h for PAGE_MAPPING_FLAGS */
 			struct address_space *mapping;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2db95780e003..63976ad4b7f1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -780,7 +780,7 @@ static inline bool set_page_guard(struct zone *zone, st=
ruct page *page,
 		return false;
=20
 	__SetPageGuard(page);
-	INIT_LIST_HEAD(&page->lru);
+	INIT_LIST_HEAD(&page->buddy_list);
 	set_page_private(page, order);
 	/* Guard pages are not available for any usage */
 	__mod_zone_freepage_state(zone, -(1 << order), migratetype);
@@ -957,7 +957,7 @@ static inline void add_to_free_list(struct page *page, =
struct zone *zone,
 {
 	struct free_area *area =3D &zone->free_area[order];
=20
-	list_add(&page->lru, &area->free_list[migratetype]);
+	list_add(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
=20
@@ -967,7 +967,7 @@ static inline void add_to_free_list_tail(struct page *p=
age, struct zone *zone,
 {
 	struct free_area *area =3D &zone->free_area[order];
=20
-	list_add_tail(&page->lru, &area->free_list[migratetype]);
+	list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
 	area->nr_free++;
 }
=20
@@ -981,7 +981,7 @@ static inline void move_to_free_list(struct page *page,=
 struct zone *zone,
 {
 	struct free_area *area =3D &zone->free_area[order];
=20
-	list_move_tail(&page->lru, &area->free_list[migratetype]);
+	list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
 }
=20
 static inline void del_page_from_free_list(struct page *page, struct zone =
*zone,
@@ -991,7 +991,7 @@ static inline void del_page_from_free_list(struct page =
*page, struct zone *zone,
 	if (page_reported(page))
 		__ClearPageReported(page);
=20
-	list_del(&page->lru);
+	list_del(&page->buddy_list);
 	__ClearPageBuddy(page);
 	set_page_private(page, 0);
 	zone->free_area[order].nr_free--;
@@ -1493,7 +1493,7 @@ static void free_pcppages_bulk(struct zone *zone, int=
 count,
 			mt =3D get_pcppage_migratetype(page);
=20
 			/* must delete to avoid corrupting pcp list */
-			list_del(&page->lru);
+			list_del(&page->pcp_list);
 			count -=3D nr_pages;
 			pcp->count -=3D nr_pages;
=20
@@ -3053,7 +3053,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned i=
nt order,
 		 * for IO devices that can merge IO requests if the physical
 		 * pages are ordered properly.
 		 */
-		list_add_tail(&page->lru, list);
+		list_add_tail(&page->pcp_list, list);
 		allocated++;
 		if (is_migrate_cma(get_pcppage_migratetype(page)))
 			__mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
@@ -3392,7 +3392,7 @@ static void free_unref_page_commit(struct page *page,=
 int migratetype,
 	__count_vm_event(PGFREE);
 	pcp =3D this_cpu_ptr(zone->per_cpu_pageset);
 	pindex =3D order_to_pindex(migratetype, order);
-	list_add(&page->lru, &pcp->lists[pindex]);
+	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count +=3D 1 << order;
=20
 	/*
@@ -3656,7 +3656,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, uns=
igned int order,
 		}
=20
 		page =3D list_first_entry(list, struct page, lru);
-		list_del(&page->lru);
+		list_del(&page->pcp_list);
 		pcp->count -=3D 1 << order;
 	} while (check_new_pcp(page, order));
=20
--=20
2.34.1
From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 3C51AC433EF
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 10:07:09 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377501AbiDTKJw (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:09:52 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41916 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1377492AbiDTKJs (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:09:48 -0400
X-Greylist: delayed 440 seconds by postgrey-1.37 at lindbergh.monkeyblade.net;
 Wed, 20 Apr 2022 03:07:00 PDT
Received: from outbound-smtp06.blacknight.com (outbound-smtp06.blacknight.com
 [81.17.249.39])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B74F61D0F5
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 03:07:00 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp06.blacknight.com (Postfix) with ESMTPS id 35628C2B9E
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 10:59:39 +0100 (IST)
Received: (qmail 10849 invoked from network); 20 Apr 2022 09:59:39 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 09:59:38 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 2/6] mm/page_alloc: Use only one PCP list for THP-sized
 allocations
Date: Wed, 20 Apr 2022 10:59:02 +0100
Message-Id: <20220420095906.27349-3-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The per_cpu_pages is cache-aligned on a standard x86-64 distribution
configuration but a later patch will add a new field which would push
the structure into the next cache line. Use only one list to store
THP-sized pages on the per-cpu list. This assumes that the vast majority
of THP-sized allocations are GFP_MOVABLE but even if it was another type,
it would not contribute to serious fragmentation that potentially causes
a later THP allocation failure. Align per_cpu_pages on the cacheline
boundary to ensure there is no false cache sharing.

After this patch, the structure sizing is:

struct per_cpu_pages {
        int                        count;                /*     0     4 */
        int                        high;                 /*     4     4 */
        int                        batch;                /*     8     4 */
        short int                  free_factor;          /*    12     2 */
        short int                  expire;               /*    14     2 */
        struct list_head           lists[13];            /*    16   208 */

        /* size: 256, cachelines: 4, members: 6 */
        /* padding: 32 */
} __attribute__((__aligned__(64)));

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h | 11 +++++++----
 mm/page_alloc.c        |  4 ++--
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 962b14d403e8..abe530748de6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -358,15 +358,18 @@ enum zone_watermarks {
 };
=20
 /*
- * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER plus one additional
- * for pageblock size for THP if configured.
+ * One per migratetype for each PAGE_ALLOC_COSTLY_ORDER. One additional li=
st
+ * for THP which will usually be GFP_MOVABLE. Even if it is another type,
+ * it should not contribute to serious fragmentation causing THP allocation
+ * failures.
  */
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 #define NR_PCP_THP 1
 #else
 #define NR_PCP_THP 0
 #endif
-#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR=
_PCP_THP))
+#define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER=
 + 1))
+#define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP)
=20
 /*
  * Shift to encode migratetype and order in the same integer, with order
@@ -392,7 +395,7 @@ struct per_cpu_pages {
=20
 	/* Lists of pages, one per migrate type stored on the pcp-lists */
 	struct list_head lists[NR_PCP_LISTS];
-};
+} ____cacheline_aligned_in_smp;
=20
 struct per_cpu_zonestat {
 #ifdef CONFIG_SMP
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63976ad4b7f1..ed2deb93a758 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -648,7 +648,7 @@ static inline unsigned int order_to_pindex(int migratet=
ype, int order)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	if (order > PAGE_ALLOC_COSTLY_ORDER) {
 		VM_BUG_ON(order !=3D pageblock_order);
-		base =3D PAGE_ALLOC_COSTLY_ORDER + 1;
+		return NR_LOWORDER_PCP_LISTS;
 	}
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
@@ -662,7 +662,7 @@ static inline int pindex_to_order(unsigned int pindex)
 	int order =3D pindex / MIGRATE_PCPTYPES;
=20
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (order > PAGE_ALLOC_COSTLY_ORDER)
+	if (pindex =3D=3D NR_LOWORDER_PCP_LISTS)
 		order =3D pageblock_order;
 #else
 	VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
--=20
2.34.1
From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 309AEC433EF
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 10:08:55 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377514AbiDTKLi (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:11:38 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:43602 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S242267AbiDTKLg (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:11:36 -0400
X-Greylist: delayed 540 seconds by postgrey-1.37 at lindbergh.monkeyblade.net;
 Wed, 20 Apr 2022 03:08:51 PDT
Received: from outbound-smtp55.blacknight.com (outbound-smtp55.blacknight.com
 [46.22.136.239])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 6CA7E1D321
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 03:08:51 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp55.blacknight.com (Postfix) with ESMTPS id 23FA7FAE49
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 10:59:50 +0100 (IST)
Received: (qmail 11477 invoked from network); 20 Apr 2022 09:59:49 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 09:59:49 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 3/6] mm/page_alloc: Split out buddy removal code from rmqueue
 into separate helper
Date: Wed, 20 Apr 2022 10:59:03 +0100
Message-Id: <20220420095906.27349-4-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

This is a preparation patch to allow the buddy removal code to be reused
in a later patch.

No functional change.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 87 ++++++++++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 37 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed2deb93a758..4c1acf666056 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3622,6 +3622,46 @@ static inline void zone_statistics(struct zone *pref=
erred_zone, struct zone *z,
 #endif
 }
=20
+static __always_inline
+struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
+			   unsigned int order, unsigned int alloc_flags,
+			   int migratetype)
+{
+	struct page *page;
+	unsigned long flags;
+
+	do {
+		page =3D NULL;
+		spin_lock_irqsave(&zone->lock, flags);
+		/*
+		 * order-0 request can reach here when the pcplist is skipped
+		 * due to non-CMA allocation context. HIGHATOMIC area is
+		 * reserved for high-order atomic allocation, so order-0
+		 * request should skip it.
+		 */
+		if (order > 0 && alloc_flags & ALLOC_HARDER) {
+			page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
+			if (page)
+				trace_mm_page_alloc_zone_locked(page, order, migratetype);
+		}
+		if (!page) {
+			page =3D __rmqueue(zone, order, migratetype, alloc_flags);
+			if (!page) {
+				spin_unlock_irqrestore(&zone->lock, flags);
+				return NULL;
+			}
+		}
+		__mod_zone_freepage_state(zone, -(1 << order),
+					  get_pcppage_migratetype(page));
+		spin_unlock_irqrestore(&zone->lock, flags);
+	} while (check_new_pages(page, order));
+
+	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+	zone_statistics(preferred_zone, zone, 1);
+
+	return page;
+}
+
 /* Remove page from the per-cpu list, caller must protect the list */
 static inline
 struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
@@ -3702,9 +3742,14 @@ struct page *rmqueue(struct zone *preferred_zone,
 			gfp_t gfp_flags, unsigned int alloc_flags,
 			int migratetype)
 {
-	unsigned long flags;
 	struct page *page;
=20
+	/*
+	 * We most definitely don't want callers attempting to
+	 * allocate greater than order-1 page units with __GFP_NOFAIL.
+	 */
+	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
+
 	if (likely(pcp_allowed_order(order))) {
 		/*
 		 * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
@@ -3718,38 +3763,10 @@ struct page *rmqueue(struct zone *preferred_zone,
 		}
 	}
=20
-	/*
-	 * We most definitely don't want callers attempting to
-	 * allocate greater than order-1 page units with __GFP_NOFAIL.
-	 */
-	WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
-	do {
-		page =3D NULL;
-		spin_lock_irqsave(&zone->lock, flags);
-		/*
-		 * order-0 request can reach here when the pcplist is skipped
-		 * due to non-CMA allocation context. HIGHATOMIC area is
-		 * reserved for high-order atomic allocation, so order-0
-		 * request should skip it.
-		 */
-		if (order > 0 && alloc_flags & ALLOC_HARDER) {
-			page =3D __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
-			if (page)
-				trace_mm_page_alloc_zone_locked(page, order, migratetype);
-		}
-		if (!page) {
-			page =3D __rmqueue(zone, order, migratetype, alloc_flags);
-			if (!page)
-				goto failed;
-		}
-		__mod_zone_freepage_state(zone, -(1 << order),
-					  get_pcppage_migratetype(page));
-		spin_unlock_irqrestore(&zone->lock, flags);
-	} while (check_new_pages(page, order));
-
-	__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
-	zone_statistics(preferred_zone, zone, 1);
+	page =3D rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
+							migratetype);
+	if (unlikely(!page))
+		return NULL;
=20
 out:
 	/* Separate test+clear to avoid unnecessary atomics */
@@ -3760,10 +3777,6 @@ struct page *rmqueue(struct zone *preferred_zone,
=20
 	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
 	return page;
-
-failed:
-	spin_unlock_irqrestore(&zone->lock, flags);
-	return NULL;
 }
=20
 #ifdef CONFIG_FAIL_PAGE_ALLOC
--=20
2.34.1
From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 0CB14C433F5
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 10:07:19 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377517AbiDTKKB (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:10:01 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:41920 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1347063AbiDTKJs (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:09:48 -0400
Received: from outbound-smtp31.blacknight.com (outbound-smtp31.blacknight.com
 [81.17.249.62])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id B51163ED09
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 03:07:01 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp31.blacknight.com (Postfix) with ESMTPS id 20194C0E27
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 11:00:01 +0100 (IST)
Received: (qmail 12208 invoked from network); 20 Apr 2022 10:00:01 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 10:00:00 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 4/6] mm/page_alloc: Remove unnecessary page == NULL check in
 rmqueue
Date: Wed, 20 Apr 2022 10:59:04 +0100
Message-Id: <20220420095906.27349-5-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

The VM_BUG_ON check for a valid page can be avoided with a simple
change in the flow. The ZONE_BOOSTED_WATERMARK is unlikely in general
and even more unlikely if the page allocation failed so mark the
branch unlikely.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4c1acf666056..dc0fdeb3795c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3765,17 +3765,18 @@ struct page *rmqueue(struct zone *preferred_zone,
=20
 	page =3D rmqueue_buddy(preferred_zone, zone, order, alloc_flags,
 							migratetype);
-	if (unlikely(!page))
-		return NULL;
=20
 out:
 	/* Separate test+clear to avoid unnecessary atomics */
-	if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+	if (unlikely(test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags))) {
 		clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
 		wakeup_kswapd(zone, 0, 0, zone_idx(zone));
 	}
=20
-	VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
+	if (unlikely(!page))
+		return NULL;
+
+	VM_BUG_ON_PAGE(bad_range(zone, page), page);
 	return page;
 }
=20
--=20
2.34.1
From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 10350C433FE
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 10:00:21 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377430AbiDTKDD (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:03:03 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:35800 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S242487AbiDTKC6 (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:02:58 -0400
Received: from outbound-smtp43.blacknight.com (outbound-smtp43.blacknight.com
 [46.22.139.229])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id F0DE41ADB8
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 03:00:12 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp43.blacknight.com (Postfix) with ESMTPS id 5A5B2211D
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 11:00:11 +0100 (IST)
Received: (qmail 13237 invoked from network); 20 Apr 2022 10:00:11 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 10:00:11 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 5/6] mm/page_alloc: Protect PCP lists with a spinlock
Date: Wed, 20 Apr 2022 10:59:05 +0100
Message-Id: <20220420095906.27349-6-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

Currently the PCP lists are protected by using local_lock_irqsave to
prevent migration and IRQ reentrancy but this is inconvenient. Remote
draining of the lists is impossible, so a workqueue is required, and
every task allocation/free must disable then enable interrupts, which is
expensive.

As preparation for dealing with both of those problems, protect the
lists with a spinlock. The IRQ-unsafe version of the lock is used
because IRQs are already disabled by local_lock_irqsave. spin_trylock
is used in preparation for a time when local_lock could be used instead
of local_lock_irqsave.

The per_cpu_pages still fits within the same number of cache lines after
this patch relative to before the series.

struct per_cpu_pages {
        spinlock_t                 lock;                 /*     0     4 */
        int                        count;                /*     4     4 */
        int                        high;                 /*     8     4 */
        int                        batch;                /*    12     4 */
        short int                  free_factor;          /*    16     2 */
        short int                  expire;               /*    18     2 */

        /* XXX 4 bytes hole, try to pack */

        struct list_head           lists[13];            /*    24   208 */

        /* size: 256, cachelines: 4, members: 7 */
        /* sum members: 228, holes: 1, sum holes: 4 */
        /* padding: 24 */
} __attribute__((__aligned__(64)));

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 include/linux/mmzone.h |   1 +
 mm/page_alloc.c        | 155 +++++++++++++++++++++++++++++++++++------
 2 files changed, 136 insertions(+), 20 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index abe530748de6..8b5757735428 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -385,6 +385,7 @@ enum zone_watermarks {
=20
 /* Fields and list protected by pagesets local_lock in page_alloc.c */
 struct per_cpu_pages {
+	spinlock_t lock;	/* Protects lists field */
 	int count;		/* number of pages in the list */
 	int high;		/* high watermark, emptying needed */
 	int batch;		/* chunk size for buddy add/remove */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index dc0fdeb3795c..813c84b67c65 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -132,6 +132,17 @@ static DEFINE_PER_CPU(struct pagesets, pagesets) __may=
be_unused =3D {
 	.lock =3D INIT_LOCAL_LOCK(lock),
 };
=20
+#ifdef CONFIG_SMP
+/* On SMP, spin_trylock is sufficient protection */
+#define pcp_trylock_prepare(flags)	do { } while (0)
+#define pcp_trylock_finish(flag)	do { } while (0)
+#else
+
+/* UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy.=
 */
+#define pcp_trylock_prepare(flags)	local_irq_save(flags)
+#define pcp_trylock_finish(flags)	local_irq_restore(flags)
+#endif
+
 #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
 DEFINE_PER_CPU(int, numa_node);
 EXPORT_PER_CPU_SYMBOL(numa_node);
@@ -3082,15 +3093,22 @@ static int rmqueue_bulk(struct zone *zone, unsigned=
 int order,
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
-	unsigned long flags;
 	int to_drain, batch;
=20
-	local_lock_irqsave(&pagesets.lock, flags);
 	batch =3D READ_ONCE(pcp->batch);
 	to_drain =3D min(pcp->count, batch);
-	if (to_drain > 0)
+	if (to_drain > 0) {
+		unsigned long flags;
+
+		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
+		local_irq_save(flags);
+
+		spin_lock(&pcp->lock);
 		free_pcppages_bulk(zone, to_drain, pcp, 0);
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		spin_unlock(&pcp->lock);
+
+		local_irq_restore(flags);
+	}
 }
 #endif
=20
@@ -3103,16 +3121,21 @@ void drain_zone_pages(struct zone *zone, struct per=
_cpu_pages *pcp)
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
-	unsigned long flags;
 	struct per_cpu_pages *pcp;
=20
-	local_lock_irqsave(&pagesets.lock, flags);
-
 	pcp =3D per_cpu_ptr(zone->per_cpu_pageset, cpu);
-	if (pcp->count)
+	if (pcp->count) {
+		unsigned long flags;
+
+		/* free_pcppages_bulk expects IRQs disabled for zone->lock */
+		local_irq_save(flags);
+
+		spin_lock(&pcp->lock);
 		free_pcppages_bulk(zone, pcp->count, pcp, 0);
+		spin_unlock(&pcp->lock);
=20
-	local_unlock_irqrestore(&pagesets.lock, flags);
+		local_irq_restore(flags);
+	}
 }
=20
 /*
@@ -3380,18 +3403,30 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, s=
truct zone *zone,
 	return min(READ_ONCE(pcp->batch) << 2, high);
 }
=20
-static void free_unref_page_commit(struct page *page, int migratetype,
-				   unsigned int order)
+/* Returns true if the page was committed to the per-cpu list. */
+static bool free_unref_page_commit(struct page *page, int migratetype,
+				   unsigned int order, bool locked)
 {
 	struct zone *zone =3D page_zone(page);
 	struct per_cpu_pages *pcp;
 	int high;
 	int pindex;
 	bool free_high;
+	unsigned long __maybe_unused UP_flags;
=20
 	__count_vm_event(PGFREE);
 	pcp =3D this_cpu_ptr(zone->per_cpu_pageset);
 	pindex =3D order_to_pindex(migratetype, order);
+
+	if (!locked) {
+		/* Protect against a parallel drain. */
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return false;
+		}
+	}
+
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->count +=3D 1 << order;
=20
@@ -3409,6 +3444,13 @@ static void free_unref_page_commit(struct page *page=
, int migratetype,
=20
 		free_pcppages_bulk(zone, nr_pcp_free(pcp, high, batch, free_high), pcp, =
pindex);
 	}
+
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
+	return true;
 }
=20
 /*
@@ -3419,6 +3461,7 @@ void free_unref_page(struct page *page, unsigned int =
order)
 	unsigned long flags;
 	unsigned long pfn =3D page_to_pfn(page);
 	int migratetype;
+	bool freed_pcp =3D false;
=20
 	if (!free_unref_page_prepare(page, pfn, order))
 		return;
@@ -3440,8 +3483,11 @@ void free_unref_page(struct page *page, unsigned int=
 order)
 	}
=20
 	local_lock_irqsave(&pagesets.lock, flags);
-	free_unref_page_commit(page, migratetype, order);
+	freed_pcp =3D free_unref_page_commit(page, migratetype, order, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
+
+	if (unlikely(!freed_pcp))
+		free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
 }
=20
 /*
@@ -3450,10 +3496,19 @@ void free_unref_page(struct page *page, unsigned in=
t order)
 void free_unref_page_list(struct list_head *list)
 {
 	struct page *page, *next;
+	struct per_cpu_pages *pcp;
+	struct zone *locked_zone;
 	unsigned long flags;
 	int batch_count =3D 0;
 	int migratetype;
=20
+	/*
+	 * An empty list is possible. Check early so that the later
+	 * lru_to_page() does not potentially read garbage.
+	 */
+	if (list_empty(list))
+		return;
+
 	/* Prepare pages for freeing */
 	list_for_each_entry_safe(page, next, list, lru) {
 		unsigned long pfn =3D page_to_pfn(page);
@@ -3474,8 +3529,26 @@ void free_unref_page_list(struct list_head *list)
 		}
 	}
=20
+	VM_BUG_ON(in_hardirq());
+
 	local_lock_irqsave(&pagesets.lock, flags);
+
+	page =3D lru_to_page(list);
+	locked_zone =3D page_zone(page);
+	pcp =3D this_cpu_ptr(locked_zone->per_cpu_pageset);
+	spin_lock(&pcp->lock);
+
 	list_for_each_entry_safe(page, next, list, lru) {
+		struct zone *zone =3D page_zone(page);
+
+		/* Different zone, different pcp lock. */
+		if (zone !=3D locked_zone) {
+			spin_unlock(&pcp->lock);
+			locked_zone =3D zone;
+			pcp =3D this_cpu_ptr(zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
+		}
+
 		/*
 		 * Non-isolated types over MIGRATE_PCPTYPES get added
 		 * to the MIGRATE_MOVABLE pcp list.
@@ -3485,18 +3558,25 @@ void free_unref_page_list(struct list_head *list)
 			migratetype =3D MIGRATE_MOVABLE;
=20
 		trace_mm_page_free_batched(page);
-		free_unref_page_commit(page, migratetype, 0);
+
+		/* True is dead code at the moment due to local_lock_irqsave. */
+		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
+			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype,=
 FPI_NONE);
=20
 		/*
 		 * Guard against excessive IRQ disabled times when we get
 		 * a large list of pages to free.
 		 */
 		if (++batch_count =3D=3D SWAP_CLUSTER_MAX) {
+			spin_unlock(&pcp->lock);
 			local_unlock_irqrestore(&pagesets.lock, flags);
 			batch_count =3D 0;
 			local_lock_irqsave(&pagesets.lock, flags);
+			pcp =3D this_cpu_ptr(locked_zone->per_cpu_pageset);
+			spin_lock(&pcp->lock);
 		}
 	}
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 }
=20
@@ -3668,9 +3748,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, un=
signed int order,
 			int migratetype,
 			unsigned int alloc_flags,
 			struct per_cpu_pages *pcp,
-			struct list_head *list)
+			struct list_head *list,
+			bool locked)
 {
 	struct page *page;
+	unsigned long __maybe_unused UP_flags;
+
+	/*
+	 * spin_trylock is not necessary right now due to
+	 * local_lock_irqsave and is a preparation step for
+	 * a conversion to local_lock using the trylock to prevent
+	 * IRQ re-entrancy. If pcp->lock cannot be acquired, the caller
+	 * uses rmqueue_buddy.
+	 *
+	 * TODO: Convert local_lock_irqsave to local_lock. Care
+	 * 	 is needed as the type of local_lock would need a
+	 * 	 PREEMPT_RT version due to threaded IRQs.
+	 */
+	if (unlikely(!locked)) {
+		pcp_trylock_prepare(UP_flags);
+		if (!spin_trylock(&pcp->lock)) {
+			pcp_trylock_finish(UP_flags);
+			return NULL;
+		}
+	}
=20
 	do {
 		if (list_empty(list)) {
@@ -3691,8 +3792,10 @@ struct page *__rmqueue_pcplist(struct zone *zone, un=
signed int order,
 					migratetype, alloc_flags);
=20
 			pcp->count +=3D alloced << order;
-			if (unlikely(list_empty(list)))
-				return NULL;
+			if (unlikely(list_empty(list))) {
+				page =3D NULL;
+				goto out;
+			}
 		}
=20
 		page =3D list_first_entry(list, struct page, lru);
@@ -3700,6 +3803,12 @@ struct page *__rmqueue_pcplist(struct zone *zone, un=
signed int order,
 		pcp->count -=3D 1 << order;
 	} while (check_new_pcp(page, order));
=20
+out:
+	if (!locked) {
+		spin_unlock(&pcp->lock);
+		pcp_trylock_finish(UP_flags);
+	}
+
 	return page;
 }
=20
@@ -3724,7 +3833,7 @@ static struct page *rmqueue_pcplist(struct zone *pref=
erred_zone,
 	pcp =3D this_cpu_ptr(zone->per_cpu_pageset);
 	pcp->free_factor >>=3D 1;
 	list =3D &pcp->lists[order_to_pindex(migratetype, order)];
-	page =3D __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, li=
st);
+	page =3D __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, li=
st, false);
 	local_unlock_irqrestore(&pagesets.lock, flags);
 	if (page) {
 		__count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3759,7 +3868,8 @@ struct page *rmqueue(struct zone *preferred_zone,
 				migratetype !=3D MIGRATE_MOVABLE) {
 			page =3D rmqueue_pcplist(preferred_zone, zone, order,
 					gfp_flags, migratetype, alloc_flags);
-			goto out;
+			if (likely(page))
+				goto out;
 		}
 	}
=20
@@ -5326,6 +5436,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int prefe=
rred_nid,
 	local_lock_irqsave(&pagesets.lock, flags);
 	pcp =3D this_cpu_ptr(zone->per_cpu_pageset);
 	pcp_list =3D &pcp->lists[order_to_pindex(ac.migratetype, 0)];
+	spin_lock(&pcp->lock);
=20
 	while (nr_populated < nr_pages) {
=20
@@ -5336,11 +5447,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int pre=
ferred_nid,
 		}
=20
 		page =3D __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
-								pcp, pcp_list);
+							pcp, pcp_list, true);
 		if (unlikely(!page)) {
 			/* Try and get at least one page */
-			if (!nr_populated)
+			if (!nr_populated) {
+				spin_unlock(&pcp->lock);
 				goto failed_irq;
+			}
 			break;
 		}
 		nr_account++;
@@ -5353,6 +5466,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int prefe=
rred_nid,
 		nr_populated++;
 	}
=20
+	spin_unlock(&pcp->lock);
 	local_unlock_irqrestore(&pagesets.lock, flags);
=20
 	__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
@@ -6992,6 +7106,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *=
pcp, struct per_cpu_zonesta
 	memset(pcp, 0, sizeof(*pcp));
 	memset(pzstats, 0, sizeof(*pzstats));
=20
+	spin_lock_init(&pcp->lock);
 	for (pindex =3D 0; pindex < NR_PCP_LISTS; pindex++)
 		INIT_LIST_HEAD(&pcp->lists[pindex]);
=20
--=20
2.34.1
From nobody Thu Jun 18 08:58:40 2026
Return-Path: <linux-kernel-owner@kernel.org>
X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on
	aws-us-west-2-korg-lkml-1.web.codeaurora.org
Received: from vger.kernel.org (vger.kernel.org [23.128.96.18])
	by smtp.lore.kernel.org (Postfix) with ESMTP id 8FFA2C433F5
	for <linux-kernel@archiver.kernel.org>; Wed, 20 Apr 2022 10:00:46 +0000 (UTC)
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
        id S1377478AbiDTKD3 (ORCPT
        <rfc822;linux-kernel@archiver.kernel.org>);
        Wed, 20 Apr 2022 06:03:29 -0400
Received: from lindbergh.monkeyblade.net ([23.128.96.19]:37156 "EHLO
        lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
        with ESMTP id S1377460AbiDTKDX (ORCPT
        <rfc822;linux-kernel@vger.kernel.org>);
        Wed, 20 Apr 2022 06:03:23 -0400
Received: from outbound-smtp08.blacknight.com (outbound-smtp08.blacknight.com
 [46.22.139.13])
        by lindbergh.monkeyblade.net (Postfix) with ESMTPS id CDC9C3ED00
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 03:00:22 -0700 (PDT)
Received: from mail.blacknight.com (pemlinmail04.blacknight.ie [81.17.254.17])
        by outbound-smtp08.blacknight.com (Postfix) with ESMTPS id 87D351C41E5
        for <linux-kernel@vger.kernel.org>;
 Wed, 20 Apr 2022 11:00:21 +0100 (IST)
Received: (qmail 14142 invoked from network); 20 Apr 2022 10:00:21 -0000
Received: from unknown (HELO morpheus.112glenside.lan)
 (mgorman@techsingularity.net@[84.203.198.246])
  by 81.17.254.9 with ESMTPA; 20 Apr 2022 10:00:21 -0000
From: Mel Gorman <mgorman@techsingularity.net>
To: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>,
        Vlastimil Babka <vbabka@suse.cz>,
        Michal Hocko <mhocko@kernel.org>,
        LKML <linux-kernel@vger.kernel.org>,
        Linux-MM <linux-mm@kvack.org>,
        Mel Gorman <mgorman@techsingularity.net>
Subject: [PATCH 6/6] mm/page_alloc: Remotely drain per-cpu lists
Date: Wed, 20 Apr 2022 10:59:06 +0100
Message-Id: <20220420095906.27349-7-mgorman@techsingularity.net>
X-Mailer: git-send-email 2.34.1
In-Reply-To: <20220420095906.27349-1-mgorman@techsingularity.net>
References: <20220420095906.27349-1-mgorman@techsingularity.net>
MIME-Version: 1.0
Content-Transfer-Encoding: quoted-printable
Precedence: bulk
List-ID: <linux-kernel.vger.kernel.org>
X-Mailing-List: linux-kernel@vger.kernel.org
Content-Type: text/plain; charset="utf-8"

From: Nicolas Saenz Julienne <nsaenzju@redhat.com>

Some setups, notably NOHZ_FULL CPUs, are too busy to handle the per-cpu
drain work queued by __drain_all_pages(). So introduce a new mechanism
to remotely drain the per-cpu lists. It is made possible by remotely
locking the new per-cpu spinlocks in 'struct per_cpu_pages'. A benefit
of this new scheme is that drain operations are now migration safe.

There was no observed performance degradation vs. the previous scheme.
Both netperf and hackbench were run in parallel with triggering the
__drain_all_pages(NULL, true) code path about 100 times per second.
The new scheme performs a bit better (~5%), although the important point
here is there are no performance regressions vs. the previous mechanism.
Per-cpu lists draining happens only in slow paths.

Link: https://lore.kernel.org/r/20211103170512.2745765-4-nsaenzju@redhat.com
Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
---
 mm/page_alloc.c | 66 +++++++++----------------------------------------
 1 file changed, 11 insertions(+), 55 deletions(-)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 813c84b67c65..17d11eb0413e 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -161,13 +161,7 @@ DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local mem=
ory" node */
 EXPORT_PER_CPU_SYMBOL(_numa_mem_);
 #endif
=20
-/* work_structs for global per-cpu drains */
-struct pcpu_drain {
-	struct zone *zone;
-	struct work_struct work;
-};
 static DEFINE_MUTEX(pcpu_drain_mutex);
-static DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
=20
 #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 volatile unsigned long latent_entropy __latent_entropy;
@@ -3087,9 +3081,6 @@ static int rmqueue_bulk(struct zone *zone, unsigned i=
nt order,
  * Called from the vmstat counter updater to drain pagesets of this
  * currently executing processor on remote nodes after they have
  * expired.
- *
- * Note that this function must be called with the thread pinned to
- * a single processor.
  */
 void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
 {
@@ -3114,10 +3105,6 @@ void drain_zone_pages(struct zone *zone, struct per_=
cpu_pages *pcp)
=20
 /*
  * Drain pcplists of the indicated processor and zone.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages_zone(unsigned int cpu, struct zone *zone)
 {
@@ -3140,10 +3127,6 @@ static void drain_pages_zone(unsigned int cpu, struc=
t zone *zone)
=20
 /*
  * Drain pcplists of all zones on the indicated processor.
- *
- * The processor must either be the current processor and the
- * thread pinned to the current processor or a processor that
- * is not online.
  */
 static void drain_pages(unsigned int cpu)
 {
@@ -3156,9 +3139,6 @@ static void drain_pages(unsigned int cpu)
=20
 /*
  * Spill all of this CPU's per-cpu pages back into the buddy allocator.
- *
- * The CPU has to be pinned. When zone parameter is non-NULL, spill just
- * the single zone's pages.
  */
 void drain_local_pages(struct zone *zone)
 {
@@ -3170,24 +3150,6 @@ void drain_local_pages(struct zone *zone)
 		drain_pages(cpu);
 }
=20
-static void drain_local_pages_wq(struct work_struct *work)
-{
-	struct pcpu_drain *drain;
-
-	drain =3D container_of(work, struct pcpu_drain, work);
-
-	/*
-	 * drain_all_pages doesn't use proper cpu hotplug protection so
-	 * we can race with cpu offline when the WQ can move this from
-	 * a cpu pinned worker to an unbound one. We can operate on a different
-	 * cpu which is alright but we also have to make sure to not move to
-	 * a different one.
-	 */
-	migrate_disable();
-	drain_local_pages(drain->zone);
-	migrate_enable();
-}
-
 /*
  * The implementation of drain_all_pages(), exposing an extra parameter to
  * drain on all cpus.
@@ -3208,13 +3170,6 @@ static void __drain_all_pages(struct zone *zone, boo=
l force_all_cpus)
 	 */
 	static cpumask_t cpus_with_pcps;
=20
-	/*
-	 * Make sure nobody triggers this path before mm_percpu_wq is fully
-	 * initialized.
-	 */
-	if (WARN_ON_ONCE(!mm_percpu_wq))
-		return;
-
 	/*
 	 * Do not drain if one is already in progress unless it's specific to
 	 * a zone. Such callers are primarily CMA and memory hotplug and need
@@ -3264,14 +3219,12 @@ static void __drain_all_pages(struct zone *zone, bo=
ol force_all_cpus)
 	}
=20
 	for_each_cpu(cpu, &cpus_with_pcps) {
-		struct pcpu_drain *drain =3D per_cpu_ptr(&pcpu_drain, cpu);
-
-		drain->zone =3D zone;
-		INIT_WORK(&drain->work, drain_local_pages_wq);
-		queue_work_on(cpu, mm_percpu_wq, &drain->work);
+		if (zone) {
+			drain_pages_zone(cpu, zone);
+		} else {
+			drain_pages(cpu);
+		}
 	}
-	for_each_cpu(cpu, &cpus_with_pcps)
-		flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
=20
 	mutex_unlock(&pcpu_drain_mutex);
 }
@@ -3280,8 +3233,6 @@ static void __drain_all_pages(struct zone *zone, bool=
 force_all_cpus)
  * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
  *
  * When zone parameter is non-NULL, spill just the single zone's pages.
- *
- * Note that this can be extremely slow as the draining happens in a workq=
ueue.
  */
 void drain_all_pages(struct zone *zone)
 {
@@ -3559,7 +3510,12 @@ void free_unref_page_list(struct list_head *list)
=20
 		trace_mm_page_free_batched(page);
=20
-		/* True is dead code at the moment due to local_lock_irqsave. */
+		/*
+		 * If there is a parallel drain in progress, free to the buddy
+		 * allocator directly. This is expensive as the zone lock will
+		 * be acquired multiple times but if a drain is in progress
+		 * then an expensive operation is already taking place.
+		 */
 		if (unlikely(!free_unref_page_commit(page, migratetype, 0, true)))
 			free_one_page(page_zone(page), page, page_to_pfn(page), 0, migratetype,=
 FPI_NONE);
=20
--=20
2.34.1