Free page accounting currently happens a bit too high up the call
stack, where it has to deal with guard pages, compaction capturing,
block stealing and even page isolation. This is subtle and fragile,
and makes it difficult to hack on the code.
Now that type violations on the freelists have been fixed, push the
accounting down to where pages enter and leave the freelist.
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
include/linux/mm.h | 18 ++--
include/linux/vmstat.h | 8 --
mm/debug_page_alloc.c | 12 +--
mm/internal.h | 5 --
mm/page_alloc.c | 194 +++++++++++++++++++++++------------------
mm/page_isolation.c | 3 +-
6 files changed, 120 insertions(+), 120 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8147b1302413..bd2e94391c7e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3781,24 +3781,22 @@ static inline bool page_is_guard(struct page *page)
return PageGuard(page);
}
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return false;
- return __set_page_guard(zone, page, order, migratetype);
+ return __set_page_guard(zone, page, order);
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype);
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order);
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype)
+ unsigned int order)
{
if (!debug_guardpage_enabled())
return;
- __clear_page_guard(zone, page, order, migratetype);
+ __clear_page_guard(zone, page, order);
}
#else /* CONFIG_DEBUG_PAGEALLOC */
@@ -3808,9 +3806,9 @@ static inline unsigned int debug_guardpage_minorder(void) { return 0; }
static inline bool debug_guardpage_enabled(void) { return false; }
static inline bool page_is_guard(struct page *page) { return false; }
static inline bool set_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) { return false; }
+ unsigned int order) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
- unsigned int order, int migratetype) {}
+ unsigned int order) {}
#endif /* CONFIG_DEBUG_PAGEALLOC */
#ifdef __HAVE_ARCH_GATE_AREA
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 343906a98d6e..735eae6e272c 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -487,14 +487,6 @@ static inline void node_stat_sub_folio(struct folio *folio,
mod_node_page_state(folio_pgdat(folio), item, -folio_nr_pages(folio));
}
-static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
- int migratetype)
-{
- __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
- if (is_migrate_cma(migratetype))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
-}
-
extern const char * const vmstat_text[];
static inline const char *zone_stat_name(enum zone_stat_item item)
diff --git a/mm/debug_page_alloc.c b/mm/debug_page_alloc.c
index 6755f0c9d4a3..d46acf989dde 100644
--- a/mm/debug_page_alloc.c
+++ b/mm/debug_page_alloc.c
@@ -32,8 +32,7 @@ static int __init debug_guardpage_minorder_setup(char *buf)
}
early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
-bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype)
+bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order)
{
if (order >= debug_guardpage_minorder())
return false;
@@ -41,19 +40,12 @@ bool __set_page_guard(struct zone *zone, struct page *page, unsigned int order,
__SetPageGuard(page);
INIT_LIST_HEAD(&page->buddy_list);
set_page_private(page, order);
- /* Guard pages are not available for any usage */
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -(1 << order), migratetype);
return true;
}
-void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order,
- int migratetype)
+void __clear_page_guard(struct zone *zone, struct page *page, unsigned int order)
{
__ClearPageGuard(page);
-
set_page_private(page, 0);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, (1 << order), migratetype);
}
diff --git a/mm/internal.h b/mm/internal.h
index d6e6c7d9f04e..0a4007b03d0d 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1036,11 +1036,6 @@ static inline bool is_migrate_highatomic(enum migratetype migratetype)
return migratetype == MIGRATE_HIGHATOMIC;
}
-static inline bool is_migrate_highatomic_page(struct page *page)
-{
- return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC;
-}
-
void setup_zone_pageset(struct zone *zone);
struct migration_target_control {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index efb2581ac142..c46491f83ac2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -642,42 +642,72 @@ compaction_capture(struct capture_control *capc, struct page *page,
}
#endif /* CONFIG_COMPACTION */
-/* Used for pages not on another list */
-static inline void add_to_free_list(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+static inline void account_freepages(struct page *page, struct zone *zone,
+ int nr_pages, int migratetype)
{
- struct free_area *area = &zone->free_area[order];
+ if (is_migrate_isolate(migratetype))
+ return;
- list_add(&page->buddy_list, &area->free_list[migratetype]);
- area->nr_free++;
+ __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
+
+ if (is_migrate_cma(migratetype))
+ __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}
/* Used for pages not on another list */
-static inline void add_to_free_list_tail(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+static inline void __add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype,
+ bool tail)
{
struct free_area *area = &zone->free_area[order];
- list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, 1 << order);
+
+ if (tail)
+ list_add_tail(&page->buddy_list, &area->free_list[migratetype]);
+ else
+ list_add(&page->buddy_list, &area->free_list[migratetype]);
area->nr_free++;
}
+static inline void add_to_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype,
+ bool tail)
+{
+ __add_to_free_list(page, zone, order, migratetype, tail);
+ account_freepages(page, zone, 1 << order, migratetype);
+}
+
/*
* Used for pages which are on another list. Move the pages to the tail
* of the list - so the moved pages won't immediately be considered for
* allocation again (e.g., optimization for memory onlining).
*/
static inline void move_to_free_list(struct page *page, struct zone *zone,
- unsigned int order, int migratetype)
+ unsigned int order, int old_mt, int new_mt)
{
struct free_area *area = &zone->free_area[order];
- list_move_tail(&page->buddy_list, &area->free_list[migratetype]);
+ /* Free page moving can fail, so it happens before the type update */
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != old_mt,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), old_mt, 1 << order);
+
+ list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
+
+ account_freepages(page, zone, -(1 << order), old_mt);
+ account_freepages(page, zone, 1 << order, new_mt);
}
-static inline void del_page_from_free_list(struct page *page, struct zone *zone,
- unsigned int order)
+static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
{
+ VM_WARN_ONCE(get_pageblock_migratetype(page) != migratetype,
+ "page type is %lu, passed migratetype is %d (nr=%d)\n",
+ get_pageblock_migratetype(page), migratetype, 1 << order);
+
/* clear reported state and update reported page count */
if (page_reported(page))
__ClearPageReported(page);
@@ -688,6 +718,13 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
zone->free_area[order].nr_free--;
}
+static inline void del_page_from_free_list(struct page *page, struct zone *zone,
+ unsigned int order, int migratetype)
+{
+ __del_page_from_free_list(page, zone, order, migratetype);
+ account_freepages(page, zone, -(1 << order), migratetype);
+}
+
static inline struct page *get_page_from_free_area(struct free_area *area,
int migratetype)
{
@@ -759,18 +796,16 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
- if (likely(!is_migrate_isolate(migratetype)))
- __mod_zone_freepage_state(zone, 1 << order, migratetype);
-
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
+ account_freepages(page, zone, 1 << order, migratetype);
+
while (order < MAX_PAGE_ORDER) {
- if (compaction_capture(capc, page, order, migratetype)) {
- __mod_zone_freepage_state(zone, -(1 << order),
- migratetype);
+ int buddy_mt = migratetype;
+
+ if (compaction_capture(capc, page, order, migratetype))
return;
- }
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
@@ -783,19 +818,12 @@ static inline void __free_one_page(struct page *page,
* pageblock isolation could cause incorrect freepage or CMA
* accounting or HIGHATOMIC accounting.
*/
- int buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
+ buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
- if (migratetype != buddy_mt) {
- if (!migratetype_is_mergeable(migratetype) ||
- !migratetype_is_mergeable(buddy_mt))
- goto done_merging;
- /*
- * Match buddy type. This ensures that
- * an expand() down the line puts the
- * sub-blocks on the right freelists.
- */
- set_pageblock_migratetype(buddy, migratetype);
- }
+ if (migratetype != buddy_mt &&
+ (!migratetype_is_mergeable(migratetype) ||
+ !migratetype_is_mergeable(buddy_mt)))
+ goto done_merging;
}
/*
@@ -803,9 +831,19 @@ static inline void __free_one_page(struct page *page,
* merge with it and move up one order.
*/
if (page_is_guard(buddy))
- clear_page_guard(zone, buddy, order, migratetype);
+ clear_page_guard(zone, buddy, order);
else
- del_page_from_free_list(buddy, zone, order);
+ __del_page_from_free_list(buddy, zone, order, buddy_mt);
+
+ if (unlikely(buddy_mt != migratetype)) {
+ /*
+ * Match buddy type. This ensures that an
+ * expand() down the line puts the sub-blocks
+ * on the right freelists.
+ */
+ set_pageblock_migratetype(buddy, migratetype);
+ }
+
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -822,10 +860,7 @@ static inline void __free_one_page(struct page *page,
else
to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
- if (to_tail)
- add_to_free_list_tail(page, zone, order, migratetype);
- else
- add_to_free_list(page, zone, order, migratetype);
+ __add_to_free_list(page, zone, order, migratetype, to_tail);
/* Notify page reporting subsystem of freed page */
if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
@@ -1314,10 +1349,10 @@ static inline void expand(struct zone *zone, struct page *page,
* Corresponding page table entries will not be touched,
* pages will stay not present in virtual address space
*/
- if (set_page_guard(zone, &page[size], high, migratetype))
+ if (set_page_guard(zone, &page[size], high))
continue;
- add_to_free_list(&page[size], zone, high, migratetype);
+ add_to_free_list(&page[size], zone, high, migratetype, false);
set_buddy_order(&page[size], high);
}
}
@@ -1487,7 +1522,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_list(page, zone, current_order);
+ del_page_from_free_list(page, zone, current_order, migratetype);
expand(zone, page, order, current_order, migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
@@ -1527,7 +1562,7 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
* type's freelist.
*/
static int move_freepages(struct zone *zone, unsigned long start_pfn,
- unsigned long end_pfn, int migratetype)
+ unsigned long end_pfn, int old_mt, int new_mt)
{
struct page *page;
unsigned long pfn;
@@ -1549,12 +1584,14 @@ static int move_freepages(struct zone *zone, unsigned long start_pfn,
VM_BUG_ON_PAGE(page_zone(page) != zone, page);
order = buddy_order(page);
- move_to_free_list(page, zone, order, migratetype);
+
+ move_to_free_list(page, zone, order, old_mt, new_mt);
+
pfn += 1 << order;
pages_moved += 1 << order;
}
- set_pageblock_migratetype(pfn_to_page(start_pfn), migratetype);
+ set_pageblock_migratetype(pfn_to_page(start_pfn), new_mt);
return pages_moved;
}
@@ -1612,7 +1649,7 @@ static bool prep_move_freepages_block(struct zone *zone, struct page *page,
}
static int move_freepages_block(struct zone *zone, struct page *page,
- int migratetype)
+ int old_mt, int new_mt)
{
unsigned long start_pfn, end_pfn;
@@ -1620,7 +1657,7 @@ static int move_freepages_block(struct zone *zone, struct page *page,
NULL, NULL))
return -1;
- return move_freepages(zone, start_pfn, end_pfn, migratetype);
+ return move_freepages(zone, start_pfn, end_pfn, old_mt, new_mt);
}
#ifdef CONFIG_MEMORY_ISOLATION
@@ -1692,7 +1729,6 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
int migratetype)
{
unsigned long start_pfn, end_pfn, pfn;
- int nr_moved, mt;
if (!prep_move_freepages_block(zone, page, &start_pfn, &end_pfn,
NULL, NULL))
@@ -1703,11 +1739,9 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
if (pfn != start_pfn) {
struct page *buddy = pfn_to_page(pfn);
int order = buddy_order(buddy);
- int mt = get_pfnblock_migratetype(buddy, pfn);
- if (!is_migrate_isolate(mt))
- __mod_zone_freepage_state(zone, -(1UL << order), mt);
- del_page_from_free_list(buddy, zone, order);
+ del_page_from_free_list(buddy, zone, order,
+ get_pfnblock_migratetype(buddy, pfn));
set_pageblock_migratetype(page, migratetype);
split_large_buddy(zone, buddy, pfn, order);
return true;
@@ -1715,23 +1749,17 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
/* We're the starting block of a larger buddy */
if (PageBuddy(page) && buddy_order(page) > pageblock_order) {
- int mt = get_pfnblock_migratetype(page, pfn);
int order = buddy_order(page);
- if (!is_migrate_isolate(mt))
- __mod_zone_freepage_state(zone, -(1UL << order), mt);
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order,
+ get_pfnblock_migratetype(page, pfn));
set_pageblock_migratetype(page, migratetype);
split_large_buddy(zone, page, pfn, order);
return true;
}
- mt = get_pfnblock_migratetype(page, start_pfn);
- nr_moved = move_freepages(zone, start_pfn, end_pfn, migratetype);
- if (!is_migrate_isolate(mt))
- __mod_zone_freepage_state(zone, -nr_moved, mt);
- else if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, nr_moved, migratetype);
+ move_freepages(zone, start_pfn, end_pfn,
+ get_pfnblock_migratetype(page, start_pfn), migratetype);
return true;
}
#endif /* CONFIG_MEMORY_ISOLATION */
@@ -1845,7 +1873,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
- del_page_from_free_list(page, zone, current_order);
+ del_page_from_free_list(page, zone, current_order, block_type);
change_pageblock_range(page, current_order, start_type);
expand(zone, page, order, current_order, start_type);
return page;
@@ -1895,12 +1923,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
*/
if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
page_group_by_mobility_disabled) {
- move_freepages(zone, start_pfn, end_pfn, start_type);
+ move_freepages(zone, start_pfn, end_pfn, block_type, start_type);
return __rmqueue_smallest(zone, order, start_type);
}
single_page:
- del_page_from_free_list(page, zone, current_order);
+ del_page_from_free_list(page, zone, current_order, block_type);
expand(zone, page, order, current_order, block_type);
return page;
}
@@ -1970,7 +1998,7 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone)
mt = get_pageblock_migratetype(page);
/* Only reserve normal pageblocks (i.e., they can merge with others) */
if (migratetype_is_mergeable(mt))
- if (move_freepages_block(zone, page,
+ if (move_freepages_block(zone, page, mt,
MIGRATE_HIGHATOMIC) != -1)
zone->nr_reserved_highatomic += pageblock_nr_pages;
@@ -2011,11 +2039,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < NR_PAGE_ORDERS; order++) {
struct free_area *area = &(zone->free_area[order]);
+ int mt;
page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
+ mt = get_pageblock_migratetype(page);
/*
* In page freeing path, migratetype change is racy so
* we can counter several free pages in a pageblock
@@ -2023,7 +2053,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* from highatomic to ac->migratetype. So we should
* adjust the count once.
*/
- if (is_migrate_highatomic_page(page)) {
+ if (is_migrate_highatomic(mt)) {
/*
* It should never happen but changes to
* locking could inadvertently allow a per-cpu
@@ -2045,7 +2075,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* of pageblocks that cannot be completely freed
* may increase.
*/
- ret = move_freepages_block(zone, page, ac->migratetype);
+ ret = move_freepages_block(zone, page, mt,
+ ac->migratetype);
/*
* Reserving this block already succeeded, so this should
* not fail on zone boundaries.
@@ -2251,12 +2282,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
* pages are ordered properly.
*/
list_add_tail(&page->pcp_list, list);
- if (is_migrate_cma(get_pageblock_migratetype(page)))
- __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
- -(1 << order));
}
-
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
spin_unlock_irqrestore(&zone->lock, flags);
return i;
@@ -2748,11 +2774,9 @@ int __isolate_free_page(struct page *page, unsigned int order)
watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
-
- __mod_zone_freepage_state(zone, -(1UL << order), mt);
}
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order, mt);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -2767,7 +2791,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
* with others)
*/
if (migratetype_is_mergeable(mt))
- move_freepages_block(zone, page,
+ move_freepages_block(zone, page, mt,
MIGRATE_MOVABLE);
}
}
@@ -2852,8 +2876,6 @@ struct page *rmqueue_buddy(struct zone *preferred_zone, struct zone *zone,
return NULL;
}
}
- __mod_zone_freepage_state(zone, -(1 << order),
- get_pageblock_migratetype(page));
spin_unlock_irqrestore(&zone->lock, flags);
} while (check_new_pages(page, order));
@@ -6737,8 +6759,9 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
+ VM_WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE);
order = buddy_order(page);
- del_page_from_free_list(page, zone, order);
+ del_page_from_free_list(page, zone, order, MIGRATE_ISOLATE);
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6788,10 +6811,10 @@ static void break_down_buddy_pages(struct zone *zone, struct page *page,
current_buddy = page + size;
}
- if (set_page_guard(zone, current_buddy, high, migratetype))
+ if (set_page_guard(zone, current_buddy, high))
continue;
- add_to_free_list(current_buddy, zone, high, migratetype);
+ add_to_free_list(current_buddy, zone, high, migratetype, false);
set_buddy_order(current_buddy, high);
}
}
@@ -6817,12 +6840,11 @@ bool take_page_off_buddy(struct page *page)
int migratetype = get_pfnblock_migratetype(page_head,
pfn_head);
- del_page_from_free_list(page_head, zone, page_order);
+ del_page_from_free_list(page_head, zone, page_order,
+ migratetype);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
- if (!is_migrate_isolate(migratetype))
- __mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
break;
}
@@ -6930,7 +6952,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
list_del(&page->lru);
last = list_empty(&zone->unaccepted_pages);
- __mod_zone_freepage_state(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(page, zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6982,7 +7004,7 @@ static bool __free_unaccepted(struct page *page)
spin_lock_irqsave(&zone->lock, flags);
first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
- __mod_zone_freepage_state(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(page, zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 042937d5abe4..914a71c580d8 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -252,7 +252,8 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
+ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page,
+ migratetype));
} else {
set_pageblock_migratetype(page, migratetype);
__putback_isolated_page(page, order, migratetype);
--
2.44.0
On 2024/3/21 02:02, Johannes Weiner wrote:
> [...]
> + account_freepages(page, zone, 1 << order, migratetype);
> +
> while (order < MAX_PAGE_ORDER) {
> - if (compaction_capture(capc, page, order, migratetype)) {
> - __mod_zone_freepage_state(zone, -(1 << order),
> - migratetype);
> + int buddy_mt = migratetype;
> +
> + if (compaction_capture(capc, page, order, migratetype))
> return;
> - }
IIUC, if the released page is captured by compaction, then the
statistics for free pages should be correspondingly decreased,
otherwise, there will be a slight regression for my thpcompact benchmark.
thpcompact Percentage Faults Huge
k6.9-rc2-base base + patch10 + 2 fixes
Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
I add below fix based on your fix 2, then the thpcompact Percentage
looks good. How do you think for the fix?
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8330c5c2de6b..2facf844ef84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
while (order < MAX_PAGE_ORDER) {
int buddy_mt = migratetype;
- if (compaction_capture(capc, page, order, migratetype))
+ if (compaction_capture(capc, page, order, migratetype)) {
+ account_freepages(zone, -(1 << order), migratetype);
return;
+ }
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
With my fix, the THP percentage looks better:
k6.9-rc2-base base + patch10 + 2 fixes +
my fix
Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
On 4/7/24 12:19 PM, Baolin Wang wrote:
> On 2024/3/21 02:02, Johannes Weiner wrote:
>>
>> + account_freepages(page, zone, 1 << order, migratetype);
>> +
>> while (order < MAX_PAGE_ORDER) {
>> - if (compaction_capture(capc, page, order, migratetype)) {
>> - __mod_zone_freepage_state(zone, -(1 << order),
>> - migratetype);
>> + int buddy_mt = migratetype;
>> +
>> + if (compaction_capture(capc, page, order, migratetype))
>> return;
>> - }
>
> IIUC, if the released page is captured by compaction, then the
> statistics for free pages should be correspondingly decreased,
> otherwise, there will be a slight regression for my thpcompact benchmark.
>
> thpcompact Percentage Faults Huge
> k6.9-rc2-base base + patch10 + 2 fixes
> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>
> I add below fix based on your fix 2, then the thpcompact Percentage
> looks good. How do you think for the fix?
Yeah another well spotted, thanks. "slight regression" is an understatement,
this affects not just a "statistics" but very important counter
NR_FREE_PAGES which IIUC would eventually become larger than reality, make
the watermark checks false positive and result in depleted reserves etc etc.
Actually wondering why we're not seeing -next failures already (or maybe I
just haven't noticed).
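To make the failure mode concrete, here is a throwaway userspace toy (plain C, none of it is kernel code; the order, loop count and watermark number are invented for the example). It models the imbalance being discussed: the counter is credited when a page enters the free path, a captured page never reaches a freelist, and without an explicit debit a watermark-style check against the counter keeps passing even though nothing is actually free.

#include <stdio.h>

static long nr_free_pages;	/* stands in for the zone's NR_FREE_PAGES */
static long really_free;	/* ground truth for the toy */

static void toy_free_one_page(int order, int captured, int with_fix)
{
	nr_free_pages += 1L << order;		/* credit on entry to the free path */

	if (captured) {
		/* page goes straight to the requestor, never onto a freelist */
		if (with_fix)
			nr_free_pages -= 1L << order;	/* the explicit debit from the fix */
		return;
	}

	really_free += 1L << order;		/* page really lands on a freelist */
}

static int toy_watermark_ok(long mark)
{
	return nr_free_pages > mark;		/* stands in for a watermark check */
}

int main(void)
{
	/* 1000 order-9 frees, all captured, no debit */
	for (int i = 0; i < 1000; i++)
		toy_free_one_page(9, 1, 0);

	printf("counter=%ld really_free=%ld watermark_ok=%d\n",
	       nr_free_pages, really_free, toy_watermark_ok(100000));
	return 0;
}

Running it prints counter=512000 really_free=0 watermark_ok=1; flipping with_fix to 1 keeps the counter at zero and the check fails as it should.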
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8330c5c2de6b..2facf844ef84 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
> while (order < MAX_PAGE_ORDER) {
> int buddy_mt = migratetype;
>
> - if (compaction_capture(capc, page, order, migratetype))
> + if (compaction_capture(capc, page, order, migratetype)) {
> + account_freepages(zone, -(1 << order), migratetype);
> return;
> + }
>
> buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
> if (!buddy)
>
> With my fix, the THP percentage looks better:
> k6.9-rc2-base base + patch10 + 2 fixes +
> my fix
> Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
> Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
> Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
> Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
> Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
> Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
> Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
> Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
> Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
> On 4/7/24 12:19 PM, Baolin Wang wrote:
> > On 2024/3/21 02:02, Johannes Weiner wrote:
> >>
> >> + account_freepages(page, zone, 1 << order, migratetype);
> >> +
> >> while (order < MAX_PAGE_ORDER) {
> >> - if (compaction_capture(capc, page, order, migratetype)) {
> >> - __mod_zone_freepage_state(zone, -(1 << order),
> >> - migratetype);
> >> + int buddy_mt = migratetype;
> >> +
> >> + if (compaction_capture(capc, page, order, migratetype))
> >> return;
> >> - }
> >
> > IIUC, if the released page is captured by compaction, then the
> > statistics for free pages should be correspondingly decreased,
> > otherwise, there will be a slight regression for my thpcompact benchmark.
> >
> > thpcompact Percentage Faults Huge
> > k6.9-rc2-base base + patch10 + 2 fixes
> > Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
> > Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
> > Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
> > Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
> > Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
> > Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
> > Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
> > Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
> > Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
> >
> > I add below fix based on your fix 2, then the thpcompact Percentage
> > looks good. How do you think for the fix?
>
> Yeah another well spotted, thanks. "slight regression" is an understatement,
> this affects not just a "statistics" but very important counter
> NR_FREE_PAGES which IIUC would eventually become larger than reality, make
> the watermark checks false positive and result in depleted reserves etc etc.
> Actually wondering why we're not seeing -next failures already (or maybe I
> just haven't noticed).
Good catch indeed.
Trying to understand why I didn't notice this during testing, and I
think it's because I had order-10 pageblocks in my config. There is
this in compaction_capture():
if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
return false;
Most compaction is for order-9 THPs on movable blocks, so I didn't get
much capturing in practice in order for that leak to be noticeable.
In earlier versions of the patches I had more aggressive capturing,
but also did the accounting in add_page_to/del_page_from_freelist(),
so capturing only steals what's already accounted to be off list.
With the __add/__del and the consolidated account_freepages()
optimization, compaction_capture() needs explicit accounting again.
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 8330c5c2de6b..2facf844ef84 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
> > while (order < MAX_PAGE_ORDER) {
> > int buddy_mt = migratetype;
> >
> > - if (compaction_capture(capc, page, order, migratetype))
> > + if (compaction_capture(capc, page, order, migratetype)) {
> > + account_freepages(zone, -(1 << order), migratetype);
> > return;
> > + }
> >
> > buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
> > if (!buddy)
> >
> > With my fix, the THP percentage looks better:
> > k6.9-rc2-base base + patch10 + 2 fixes +
> > my fix
> > Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
> > Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
> > Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
> > Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
> > Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
> > Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
> > Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
> > Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
> > Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
With fixed indentation:
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 70f82635f650..96815e3c22f2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
while (order < MAX_PAGE_ORDER) {
int buddy_mt = migratetype;
- if (compaction_capture(capc, page, order, migratetype))
+ if (compaction_capture(capc, page, order, migratetype)) {
+ account_freepages(zone, -(1 << order), migratetype);
return;
+ }
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
On 2024/4/8 22:23, Johannes Weiner wrote:
> On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
>> On 4/7/24 12:19 PM, Baolin Wang wrote:
>>> On 2024/3/21 02:02, Johannes Weiner wrote:
>>>>
>>>> + account_freepages(page, zone, 1 << order, migratetype);
>>>> +
>>>> while (order < MAX_PAGE_ORDER) {
>>>> - if (compaction_capture(capc, page, order, migratetype)) {
>>>> - __mod_zone_freepage_state(zone, -(1 << order),
>>>> - migratetype);
>>>> + int buddy_mt = migratetype;
>>>> +
>>>> + if (compaction_capture(capc, page, order, migratetype))
>>>> return;
>>>> - }
>>>
>>> IIUC, if the released page is captured by compaction, then the
>>> statistics for free pages should be correspondingly decreased,
>>> otherwise, there will be a slight regression for my thpcompact benchmark.
>>>
>>> thpcompact Percentage Faults Huge
>>> k6.9-rc2-base base + patch10 + 2 fixes
>>> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>>> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>>> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>>> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>>> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>>> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>>> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>>> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>>> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>>>
>>> I add below fix based on your fix 2, then the thpcompact Percentage
>>> looks good. How do you think for the fix?
>>
>> Yeah another well spotted, thanks. "slight regression" is an understatement,
>> this affects not just a "statistics" but very important counter
>> NR_FREE_PAGES which IIUC would eventually become larger than reality, make
>> the watermark checks false positive and result in depleted reserves etc etc.
>> Actually wondering why we're not seeing -next failures already (or maybe I
>> just haven't noticed).
>
> Good catch indeed.
>
> Trying to understand why I didn't notice this during testing, and I
> think it's because I had order-10 pageblocks in my config. There is
> this in compaction_capture():
>
> if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
> return false;
>
> Most compaction is for order-9 THPs on movable blocks, so I didn't get
> much capturing in practice in order for that leak to be noticeable.
This makes me wonder why not use 'cc->migratetype' for migratetype
comparison, so that low-order (like mTHP) compaction can directly get
the released pages, which could avoid some compaction scans without
mixing the migratetype?
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 2facf844ef84..7a64020f8222 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -622,7 +622,7 @@ compaction_capture(struct capture_control *capc,
struct page *page,
* and vice-versa but no more than normal fallback logic which can
* have trouble finding a high-order free page.
*/
- if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ if (order < pageblock_order && capc->cc->migratetype != migratetype)
return false;
capc->page = page;
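For what the proposed predicate changes in practice, a small standalone comparison (plain C; only this one check is modelled, and the pageblock order and example migratetypes are picked purely for the illustration, not taken from any config in the thread). The first case is the movable mTHP-sized capture the change enables; the second is the cross-type reuse it gives up.

#include <stdio.h>

enum { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE };

#define TOY_PAGEBLOCK_ORDER 9	/* assumed value for the illustration */

static const char *names[] = { "UNMOVABLE", "MOVABLE", "RECLAIMABLE" };

/* current check: never capture sub-pageblock MOVABLE pages */
static int captures_now(int order, int freed_mt, int request_mt)
{
	(void)request_mt;
	return !(order < TOY_PAGEBLOCK_ORDER && freed_mt == MIGRATE_MOVABLE);
}

/* proposed check: capture sub-pageblock pages only for a same-type request */
static int captures_proposed(int order, int freed_mt, int request_mt)
{
	return !(order < TOY_PAGEBLOCK_ORDER && request_mt != freed_mt);
}

int main(void)
{
	/* order-4, i.e. an mTHP-sized request well below the pageblock order */
	int cases[][2] = {
		{ MIGRATE_MOVABLE,     MIGRATE_MOVABLE },	/* movable mTHP fault */
		{ MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE },	/* cross-type reuse */
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++) {
		int freed = cases[i][0], request = cases[i][1];

		printf("freed=%-11s request=%-11s now=%d proposed=%d\n",
		       names[freed], names[request],
		       captures_now(4, freed, request),
		       captures_proposed(4, freed, request));
	}
	return 0;
}

The output shows now=0/proposed=1 for the movable-on-movable case and now=1/proposed=0 for the cross-type case, which is exactly the trade-off discussed in the follow-up below.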
On 9 Apr 2024, at 5:31, Baolin Wang wrote:
> On 2024/4/8 22:23, Johannes Weiner wrote:
>> On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
>>> On 4/7/24 12:19 PM, Baolin Wang wrote:
>>>> On 2024/3/21 02:02, Johannes Weiner wrote:
>>>>> + account_freepages(page, zone, 1 << order, migratetype);
>>>>> +
>>>>> while (order < MAX_PAGE_ORDER) {
>>>>> - if (compaction_capture(capc, page, order, migratetype)) {
>>>>> - __mod_zone_freepage_state(zone, -(1 << order),
>>>>> - migratetype);
>>>>> + int buddy_mt = migratetype;
>>>>> +
>>>>> + if (compaction_capture(capc, page, order, migratetype))
>>>>> return;
>>>>> - }
>>>>
>>>> IIUC, if the released page is captured by compaction, then the
>>>> statistics for free pages should be correspondingly decreased,
>>>> otherwise, there will be a slight regression for my thpcompact benchmark.
>>>>
>>>> thpcompact Percentage Faults Huge
>>>> k6.9-rc2-base base + patch10 + 2 fixes
>>>> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>>>> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>>>> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>>>> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>>>> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>>>> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>>>> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>>>> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>>>> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>>>>
>>>> I add below fix based on your fix 2, then the thpcompact Percentage
>>>> looks good. How do you think for the fix?
>>>
>>> Yeah another well spotted, thanks. "slight regression" is an understatement,
>>> this affects not just a "statistics" but very important counter
>>> NR_FREE_PAGES which IIUC would eventually become larger than reality, make
>>> the watermark checks false positive and result in depleted reserves etc etc.
>>> Actually wondering why we're not seeing -next failures already (or maybe I
>>> just haven't noticed).
>>
>> Good catch indeed.
>>
>> Trying to understand why I didn't notice this during testing, and I
>> think it's because I had order-10 pageblocks in my config. There is
>> this in compaction_capture():
>>
>> if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
>> return false;
>>
>> Most compaction is for order-9 THPs on movable blocks, so I didn't get
>> much capturing in practice in order for that leak to be noticeable.
>
> This makes me wonder why not use 'cc->migratetype' for migratetype comparison, so that low-order (like mTHP) compaction can directly get the released pages, which could avoid some compaction scans without mixing the migratetype?
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2facf844ef84..7a64020f8222 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -622,7 +622,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
> * and vice-versa but no more than normal fallback logic which can
> * have trouble finding a high-order free page.
> */
> - if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
> + if (order < pageblock_order && capc->cc->migratetype != migratetype)
> return false;
>
> capc->page = page;
It is worth trying, since at the original patch time mTHP was not present and
not capturing any MIGRATE_MOVABLE makes sense. But with your change, the capture
will lose the opportunity of letting an unmovable request use a reclaimable
pageblock and vice-versa, like the comment says. Please change the comment
as well and we should monitor potential unmovable and reclaimable regression.
--
Best Regards,
Yan, Zi
On 2024/4/9 22:46, Zi Yan wrote:
> On 9 Apr 2024, at 5:31, Baolin Wang wrote:
>
>> On 2024/4/8 22:23, Johannes Weiner wrote:
>>> On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
>>>> On 4/7/24 12:19 PM, Baolin Wang wrote:
>>>>> On 2024/3/21 02:02, Johannes Weiner wrote:
>>>>>> + account_freepages(page, zone, 1 << order, migratetype);
>>>>>> +
>>>>>> while (order < MAX_PAGE_ORDER) {
>>>>>> - if (compaction_capture(capc, page, order, migratetype)) {
>>>>>> - __mod_zone_freepage_state(zone, -(1 << order),
>>>>>> - migratetype);
>>>>>> + int buddy_mt = migratetype;
>>>>>> +
>>>>>> + if (compaction_capture(capc, page, order, migratetype))
>>>>>> return;
>>>>>> - }
>>>>>
>>>>> IIUC, if the released page is captured by compaction, then the
>>>>> statistics for free pages should be correspondingly decreased,
>>>>> otherwise, there will be a slight regression for my thpcompact benchmark.
>>>>>
>>>>> thpcompact Percentage Faults Huge
>>>>> k6.9-rc2-base base + patch10 + 2 fixes
>>>>> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>>>>> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>>>>> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>>>>> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>>>>> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>>>>> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>>>>> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>>>>> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>>>>> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>>>>>
>>>>> I add below fix based on your fix 2, then the thpcompact Percentage
>>>>> looks good. How do you think for the fix?
>>>>
>>>> Yeah another well spotted, thanks. "slight regression" is an understatement,
>>>> this affects not just a "statistics" but very important counter
>>>> NR_FREE_PAGES which IIUC would eventually become larger than reality, make
>>>> the watermark checks false positive and result in depleted reserves etc etc.
>>>> Actually wondering why we're not seeing -next failures already (or maybe I
>>>> just haven't noticed).
>>>
>>> Good catch indeed.
>>>
>>> Trying to understand why I didn't notice this during testing, and I
>>> think it's because I had order-10 pageblocks in my config. There is
>>> this in compaction_capture():
>>>
>>> if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
>>> return false;
>>>
>>> Most compaction is for order-9 THPs on movable blocks, so I didn't get
>>> much capturing in practice in order for that leak to be noticeable.
>>
>> This makes me wonder why not use 'cc->migratetype' for migratetype comparison, so that low-order (like mTHP) compaction can directly get the released pages, which could avoid some compaction scans without mixing the migratetype?
>>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 2facf844ef84..7a64020f8222 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -622,7 +622,7 @@ compaction_capture(struct capture_control *capc, struct page *page,
>> * and vice-versa but no more than normal fallback logic which can
>> * have trouble finding a high-order free page.
>> */
>> - if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
>> + if (order < pageblock_order && capc->cc->migratetype != migratetype)
>> return false;
>>
>> capc->page = page;
>
> It is worth trying, since at the original patch time mTHP was not present and
> not capturing any MIGRATE_MOVABLE makes sense. But with your change, the capture
> will lose the opportunity of letting an unmovable request use a reclaimable
> pageblock and vice-versa, like the comment says. Please change the comment
> as well and we should monitor potential unmovable and reclaimable regression.
Yes, but I think this case is easy to solve. Anyway, let me try to do
some measurements for mTHP.
On 4/8/24 4:23 PM, Johannes Weiner wrote:
> On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
>> On 4/7/24 12:19 PM, Baolin Wang wrote:
>> > On 2024/3/21 02:02, Johannes Weiner wrote:
>> >>
>> >> + account_freepages(page, zone, 1 << order, migratetype);
>> >> +
>> >> while (order < MAX_PAGE_ORDER) {
>> >> - if (compaction_capture(capc, page, order, migratetype)) {
>> >> - __mod_zone_freepage_state(zone, -(1 << order),
>> >> - migratetype);
>> >> + int buddy_mt = migratetype;
>> >> +
>> >> + if (compaction_capture(capc, page, order, migratetype))
>> >> return;
>> >> - }
>> >
>> > IIUC, if the released page is captured by compaction, then the
>> > statistics for free pages should be correspondingly decreased,
>> > otherwise, there will be a slight regression for my thpcompact benchmark.
>> >
>> > thpcompact Percentage Faults Huge
>> > k6.9-rc2-base base + patch10 + 2 fixes
>> > Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>> > Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>> > Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>> > Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>> > Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>> > Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>> > Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>> > Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>> > Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>> >
>> > I add below fix based on your fix 2, then the thpcompact Percentage
>> > looks good. How do you think for the fix?
>>
>> Yeah another well spotted, thanks. "slight regression" is an understatement,
>> this affects not just a "statistics" but very important counter
>> NR_FREE_PAGES which IIUC would eventually become larger than reality, make
>> the watermark checks false positive and result in depleted reserves etc etc.
>> Actually wondering why we're not seeing -next failures already (or maybe I
>> just haven't noticed).
>
> Good catch indeed.
>
> Trying to understand why I didn't notice this during testing, and I
> think it's because I had order-10 pageblocks in my config. There is
> this in compaction_capture():
>
> if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
> return false;
>
> Most compaction is for order-9 THPs on movable blocks, so I didn't get
> much capturing in practice in order for that leak to be noticeable.
>
> In earlier versions of the patches I had more aggressive capturing,
> but also did the accounting in add_page_to/del_page_from_freelist(),
> so capturing only steals what's already accounted to be off list.
>
> With the __add/__del and the consolidated account_freepages()
> optimization, compaction_capture() needs explicit accounting again.
>
>> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> > index 8330c5c2de6b..2facf844ef84 100644
>> > --- a/mm/page_alloc.c
>> > +++ b/mm/page_alloc.c
>> > @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
>> > while (order < MAX_PAGE_ORDER) {
>> > int buddy_mt = migratetype;
>> >
>> > - if (compaction_capture(capc, page, order, migratetype))
>> > + if (compaction_capture(capc, page, order, migratetype)) {
>> > + account_freepages(zone, -(1 << order), migratetype);
>> > return;
>> > + }
>> >
>> > buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
>> > if (!buddy)
>> >
>> > With my fix, the THP percentage looks better:
>> > k6.9-rc2-base base + patch10 + 2 fixes +
>> > my fix
>> > Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
>> > Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
>> > Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
>> > Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
>> > Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
>> > Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
>> > Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
>> > Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
>> > Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
>
> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
> With fixed indentation:
Maybe Baolin could resend the finalized 2 fixups in a more ready-to-pick form?
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 70f82635f650..96815e3c22f2 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
> while (order < MAX_PAGE_ORDER) {
> int buddy_mt = migratetype;
>
> - if (compaction_capture(capc, page, order, migratetype))
> + if (compaction_capture(capc, page, order, migratetype)) {
> + account_freepages(zone, -(1 << order), migratetype);
> return;
> + }
>
> buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
> if (!buddy)
On 2024/4/9 14:23, Vlastimil Babka wrote:
> On 4/8/24 4:23 PM, Johannes Weiner wrote:
>> On Mon, Apr 08, 2024 at 09:38:20AM +0200, Vlastimil Babka wrote:
>>> On 4/7/24 12:19 PM, Baolin Wang wrote:
>>>> On 2024/3/21 02:02, Johannes Weiner wrote:
>>>>>
>>>>> + account_freepages(page, zone, 1 << order, migratetype);
>>>>> +
>>>>> while (order < MAX_PAGE_ORDER) {
>>>>> - if (compaction_capture(capc, page, order, migratetype)) {
>>>>> - __mod_zone_freepage_state(zone, -(1 << order),
>>>>> - migratetype);
>>>>> + int buddy_mt = migratetype;
>>>>> +
>>>>> + if (compaction_capture(capc, page, order, migratetype))
>>>>> return;
>>>>> - }
>>>>
>>>> IIUC, if the released page is captured by compaction, then the
>>>> statistics for free pages should be correspondingly decreased,
>>>> otherwise, there will be a slight regression for my thpcompact benchmark.
>>>>
>>>> thpcompact Percentage Faults Huge
>>>> k6.9-rc2-base base + patch10 + 2 fixes
>>>> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>>>> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>>>> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>>>> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>>>> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>>>> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>>>> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>>>> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>>>> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>>>>
>>>> I add below fix based on your fix 2, then the thpcompact Percentage
>>>> looks good. How do you think for the fix?
>>>
>>> Yeah another well spotted, thanks. "slight regression" is an understatement,
>>> this affects not just "statistics" but the very important NR_FREE_PAGES
>>> counter, which IIUC would eventually become larger than reality, make
>>> the watermark checks false positive and result in depleted reserves etc etc.
>>> Actually wondering why we're not seeing -next failures already (or maybe I
>>> just haven't noticed).
>>
>> Good catch indeed.
>>
>> Trying to understand why I didn't notice this during testing, and I
>> think it's because I had order-10 pageblocks in my config. There is
>> this in compaction_capture():
>>
>> if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
>> return false;
>>
>> Most compaction is for order-9 THPs on movable blocks, so I didn't get
>> enough capturing in practice for that leak to be noticeable.
>>
>> In earlier versions of the patches I had more aggressive capturing,
>> but also did the accounting in add_page_to/del_page_from_freelist(),
>> so capturing only steals what's already accounted to be off list.
>>
>> With the __add/__del and the consolidated account_freepages()
>> optimization, compaction_capture() needs explicit accounting again.
>>
>>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>>> index 8330c5c2de6b..2facf844ef84 100644
>>>> --- a/mm/page_alloc.c
>>>> +++ b/mm/page_alloc.c
>>>> @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
>>>> while (order < MAX_PAGE_ORDER) {
>>>> int buddy_mt = migratetype;
>>>>
>>>> - if (compaction_capture(capc, page, order, migratetype))
>>>> + if (compaction_capture(capc, page, order, migratetype)) {
>>>> + account_freepages(zone, -(1 << order), migratetype);
>>>> return;
>>>> + }
>>>>
>>>> buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
>>>> if (!buddy)
>>>>
>>>> With my fix, the THP percentage looks better:
>>>> k6.9-rc2-base base + patch10 + 2 fixes +
>>>> my fix
>>>> Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
>>>> Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
>>>> Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
>>>> Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
>>>> Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
>>>> Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
>>>> Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
>>>> Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
>>>> Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
>>
>> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>
> Acked-by: Vlastimil Babka <vbabka@suse.cz>
Thanks.
>> With fixed indentation:
>
> Maybe Baolin could resend the finalized 2 fixups in a more ready-to-pick form?
Sure, I've sent it out.
And I see Andrew has already folded the first fix
("mm-page_alloc-fix-freelist-movement-during-block-conversion-fix") into
the patch set. If a formal fix patch is needed, Andrew, please let me know.
On 4/9/24 9:56 AM, Baolin Wang wrote:
>
>
>>>
>>> Acked-by: Johannes Weiner <hannes@cmpxchg.org>
>>
>> Acked-by: Vlastimil Babka <vbabka@suse.cz>
>
> Thanks.
>
>>> With fixed indentation:
>>
>> Maybe Baolin could resend the finalized 2 fixups in a more ready-to-pick form?
>
> Sure, I've sent it out.
Thanks.
> And I see Andrew has already folded the first fix
> ("mm-page_alloc-fix-freelist-movement-during-block-conversion-fix") into
> the patch set. If a formal fix patch is needed, Andrew, please let me know.
Oh didn't notice, in that case nothing more should be needed.
If the released page is captured by compaction, the free page accounting
is currently not decreased correspondingly, which can make the watermark
checks falsely pass and eventually deplete the reserves. The resulting
false-positive watermark checks show up in the thpcompact benchmark as a
regression:
thpcompact Percentage Faults Huge
k6.9-rc2-base base + patch10 + 2 fixes
Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
After the fix, the regression is gone.
Signed-off-by: Baolin Wang <baolin.wang@linux.alibaba.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
---
mm/page_alloc.c | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8330c5c2de6b..2facf844ef84 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
while (order < MAX_PAGE_ORDER) {
int buddy_mt = migratetype;
- if (compaction_capture(capc, page, order, migratetype))
+ if (compaction_capture(capc, page, order, migratetype)) {
+ account_freepages(zone, -(1 << order), migratetype);
return;
+ }
buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
if (!buddy)
--
2.39.3
Hi Baolin,
kernel test robot noticed the following build errors:
url: https://github.com/intel-lab-lkp/linux/commits/UPDATE-20240409-154935/Johannes-Weiner/mm-page_alloc-remove-pcppage-migratetype-caching/20240321-020814
base: the 10th patch of https://lore.kernel.org/r/20240320180429.678181-11-hannes%40cmpxchg.org
patch link: https://lore.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang%40linux.alibaba.com
patch subject: [PATCH] mm: page_alloc: consolidate free page accounting fix 3
config: x86_64-rhel-8.3 (https://download.01.org/0day-ci/archive/20240410/202404100551.aq0YQFuT-lkp@intel.com/config)
compiler: gcc-13 (Ubuntu 13.2.0-4ubuntu3) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240410/202404100551.aq0YQFuT-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404100551.aq0YQFuT-lkp@intel.com/
All error/warnings (new ones prefixed by >>):
mm/page_alloc.c: In function '__free_one_page':
>> mm/page_alloc.c:808:43: error: passing argument 1 of 'account_freepages' from incompatible pointer type [-Werror=incompatible-pointer-types]
808 | account_freepages(zone, -(1 << order), migratetype);
| ^~~~
| |
| struct zone *
mm/page_alloc.c:645:51: note: expected 'struct page *' but argument is of type 'struct zone *'
645 | static inline void account_freepages(struct page *page, struct zone *zone,
| ~~~~~~~~~~~~~^~~~
>> mm/page_alloc.c:808:49: warning: passing argument 2 of 'account_freepages' makes pointer from integer without a cast [-Wint-conversion]
808 | account_freepages(zone, -(1 << order), migratetype);
| ^~~~~~~~~~~~~
| |
| int
mm/page_alloc.c:645:70: note: expected 'struct zone *' but argument is of type 'int'
645 | static inline void account_freepages(struct page *page, struct zone *zone,
| ~~~~~~~~~~~~~^~~~
>> mm/page_alloc.c:808:25: error: too few arguments to function 'account_freepages'
808 | account_freepages(zone, -(1 << order), migratetype);
| ^~~~~~~~~~~~~~~~~
mm/page_alloc.c:645:20: note: declared here
645 | static inline void account_freepages(struct page *page, struct zone *zone,
| ^~~~~~~~~~~~~~~~~
cc1: some warnings being treated as errors
vim +/account_freepages +808 mm/page_alloc.c
759
760 /*
761 * Freeing function for a buddy system allocator.
762 *
763 * The concept of a buddy system is to maintain direct-mapped table
764 * (containing bit values) for memory blocks of various "orders".
765 * The bottom level table contains the map for the smallest allocatable
766 * units of memory (here, pages), and each level above it describes
767 * pairs of units from the levels below, hence, "buddies".
768 * At a high level, all that happens here is marking the table entry
769 * at the bottom level available, and propagating the changes upward
770 * as necessary, plus some accounting needed to play nicely with other
771 * parts of the VM system.
772 * At each level, we keep a list of pages, which are heads of continuous
773 * free pages of length of (1 << order) and marked with PageBuddy.
774 * Page's order is recorded in page_private(page) field.
775 * So when we are allocating or freeing one, we can derive the state of the
776 * other. That is, if we allocate a small block, and both were
777 * free, the remainder of the region must be split into blocks.
778 * If a block is freed, and its buddy is also free, then this
779 * triggers coalescing into a block of larger size.
780 *
781 * -- nyc
782 */
783
784 static inline void __free_one_page(struct page *page,
785 unsigned long pfn,
786 struct zone *zone, unsigned int order,
787 int migratetype, fpi_t fpi_flags)
788 {
789 struct capture_control *capc = task_capc(zone);
790 unsigned long buddy_pfn = 0;
791 unsigned long combined_pfn;
792 struct page *buddy;
793 bool to_tail;
794
795 VM_BUG_ON(!zone_is_initialized(zone));
796 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
797
798 VM_BUG_ON(migratetype == -1);
799 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
800 VM_BUG_ON_PAGE(bad_range(zone, page), page);
801
802 account_freepages(page, zone, 1 << order, migratetype);
803
804 while (order < MAX_PAGE_ORDER) {
805 int buddy_mt = migratetype;
806
807 if (compaction_capture(capc, page, order, migratetype)) {
> 808 account_freepages(zone, -(1 << order), migratetype);
809 return;
810 }
811
812 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
813 if (!buddy)
814 goto done_merging;
815
816 if (unlikely(order >= pageblock_order)) {
817 /*
818 * We want to prevent merge between freepages on pageblock
819 * without fallbacks and normal pageblock. Without this,
820 * pageblock isolation could cause incorrect freepage or CMA
821 * accounting or HIGHATOMIC accounting.
822 */
823 buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
824
825 if (migratetype != buddy_mt &&
826 (!migratetype_is_mergeable(migratetype) ||
827 !migratetype_is_mergeable(buddy_mt)))
828 goto done_merging;
829 }
830
831 /*
832 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
833 * merge with it and move up one order.
834 */
835 if (page_is_guard(buddy))
836 clear_page_guard(zone, buddy, order);
837 else
838 __del_page_from_free_list(buddy, zone, order, buddy_mt);
839
840 if (unlikely(buddy_mt != migratetype)) {
841 /*
842 * Match buddy type. This ensures that an
843 * expand() down the line puts the sub-blocks
844 * on the right freelists.
845 */
846 set_pageblock_migratetype(buddy, migratetype);
847 }
848
849 combined_pfn = buddy_pfn & pfn;
850 page = page + (combined_pfn - pfn);
851 pfn = combined_pfn;
852 order++;
853 }
854
855 done_merging:
856 set_buddy_order(page, order);
857
858 if (fpi_flags & FPI_TO_TAIL)
859 to_tail = true;
860 else if (is_shuffle_order(order))
861 to_tail = shuffle_pick_tail();
862 else
863 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
864
865 __add_to_free_list(page, zone, order, migratetype, to_tail);
866
867 /* Notify page reporting subsystem of freed page */
868 if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
869 page_reporting_notify_free(order);
870 }
871
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Baolin,
kernel test robot noticed the following build errors:
url: https://github.com/intel-lab-lkp/linux/commits/UPDATE-20240409-154935/Johannes-Weiner/mm-page_alloc-remove-pcppage-migratetype-caching/20240321-020814
base: the 10th patch of https://lore.kernel.org/r/20240320180429.678181-11-hannes%40cmpxchg.org
patch link: https://lore.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang%40linux.alibaba.com
patch subject: [PATCH] mm: page_alloc: consolidate free page accounting fix 3
config: x86_64-kexec (https://download.01.org/0day-ci/archive/20240410/202404100519.mVXXF6kV-lkp@intel.com/config)
compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240410/202404100519.mVXXF6kV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202404100519.mVXXF6kV-lkp@intel.com/
All errors (new ones prefixed by >>):
>> mm/page_alloc.c:808:54: error: too few arguments to function call, expected 4, have 3
808 | account_freepages(zone, -(1 << order), migratetype);
| ~~~~~~~~~~~~~~~~~ ^
mm/page_alloc.c:645:20: note: 'account_freepages' declared here
645 | static inline void account_freepages(struct page *page, struct zone *zone,
| ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
646 | int nr_pages, int migratetype)
| ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
vim +808 mm/page_alloc.c
759
760 /*
761 * Freeing function for a buddy system allocator.
762 *
763 * The concept of a buddy system is to maintain direct-mapped table
764 * (containing bit values) for memory blocks of various "orders".
765 * The bottom level table contains the map for the smallest allocatable
766 * units of memory (here, pages), and each level above it describes
767 * pairs of units from the levels below, hence, "buddies".
768 * At a high level, all that happens here is marking the table entry
769 * at the bottom level available, and propagating the changes upward
770 * as necessary, plus some accounting needed to play nicely with other
771 * parts of the VM system.
772 * At each level, we keep a list of pages, which are heads of continuous
773 * free pages of length of (1 << order) and marked with PageBuddy.
774 * Page's order is recorded in page_private(page) field.
775 * So when we are allocating or freeing one, we can derive the state of the
776 * other. That is, if we allocate a small block, and both were
777 * free, the remainder of the region must be split into blocks.
778 * If a block is freed, and its buddy is also free, then this
779 * triggers coalescing into a block of larger size.
780 *
781 * -- nyc
782 */
783
784 static inline void __free_one_page(struct page *page,
785 unsigned long pfn,
786 struct zone *zone, unsigned int order,
787 int migratetype, fpi_t fpi_flags)
788 {
789 struct capture_control *capc = task_capc(zone);
790 unsigned long buddy_pfn = 0;
791 unsigned long combined_pfn;
792 struct page *buddy;
793 bool to_tail;
794
795 VM_BUG_ON(!zone_is_initialized(zone));
796 VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
797
798 VM_BUG_ON(migratetype == -1);
799 VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
800 VM_BUG_ON_PAGE(bad_range(zone, page), page);
801
802 account_freepages(page, zone, 1 << order, migratetype);
803
804 while (order < MAX_PAGE_ORDER) {
805 int buddy_mt = migratetype;
806
807 if (compaction_capture(capc, page, order, migratetype)) {
> 808 account_freepages(zone, -(1 << order), migratetype);
809 return;
810 }
811
812 buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
813 if (!buddy)
814 goto done_merging;
815
816 if (unlikely(order >= pageblock_order)) {
817 /*
818 * We want to prevent merge between freepages on pageblock
819 * without fallbacks and normal pageblock. Without this,
820 * pageblock isolation could cause incorrect freepage or CMA
821 * accounting or HIGHATOMIC accounting.
822 */
823 buddy_mt = get_pfnblock_migratetype(buddy, buddy_pfn);
824
825 if (migratetype != buddy_mt &&
826 (!migratetype_is_mergeable(migratetype) ||
827 !migratetype_is_mergeable(buddy_mt)))
828 goto done_merging;
829 }
830
831 /*
832 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
833 * merge with it and move up one order.
834 */
835 if (page_is_guard(buddy))
836 clear_page_guard(zone, buddy, order);
837 else
838 __del_page_from_free_list(buddy, zone, order, buddy_mt);
839
840 if (unlikely(buddy_mt != migratetype)) {
841 /*
842 * Match buddy type. This ensures that an
843 * expand() down the line puts the sub-blocks
844 * on the right freelists.
845 */
846 set_pageblock_migratetype(buddy, migratetype);
847 }
848
849 combined_pfn = buddy_pfn & pfn;
850 page = page + (combined_pfn - pfn);
851 pfn = combined_pfn;
852 order++;
853 }
854
855 done_merging:
856 set_buddy_order(page, order);
857
858 if (fpi_flags & FPI_TO_TAIL)
859 to_tail = true;
860 else if (is_shuffle_order(order))
861 to_tail = shuffle_pick_tail();
862 else
863 to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order);
864
865 __add_to_free_list(page, zone, order, migratetype, to_tail);
866
867 /* Notify page reporting subsystem of freed page */
868 if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY))
869 page_reporting_notify_free(order);
870 }
871
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On Wed, Apr 10, 2024 at 05:15:01AM +0800, kernel test robot wrote:
> Hi Baolin,
>
> kernel test robot noticed the following build errors:
>
> url: https://github.com/intel-lab-lkp/linux/commits/UPDATE-20240409-154935/Johannes-Weiner/mm-page_alloc-remove-pcppage-migratetype-caching/20240321-020814
> base: the 10th patch of https://lore.kernel.org/r/20240320180429.678181-11-hannes%40cmpxchg.org
> patch link: https://lore.kernel.org/r/a2a48baca69f103aa431fd201f8a06e3b95e203d.1712648441.git.baolin.wang%40linux.alibaba.com
> patch subject: [PATCH] mm: page_alloc: consolidate free page accounting fix 3
> config: x86_64-kexec (https://download.01.org/0day-ci/archive/20240410/202404100519.mVXXF6kV-lkp@intel.com/config)
> compiler: clang version 17.0.6 (https://github.com/llvm/llvm-project 6009708b4367171ccdbf4b5905cb6a803753fe18)
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240410/202404100519.mVXXF6kV-lkp@intel.com/reproduce)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202404100519.mVXXF6kV-lkp@intel.com/
>
> All errors (new ones prefixed by >>):
>
> >> mm/page_alloc.c:808:54: error: too few arguments to function call, expected 4, have 3
> 808 | account_freepages(zone, -(1 << order), migratetype);
> | ~~~~~~~~~~~~~~~~~ ^
> mm/page_alloc.c:645:20: note: 'account_freepages' declared here
> 645 | static inline void account_freepages(struct page *page, struct zone *zone,
> | ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 646 | int nr_pages, int migratetype)
> | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 1 error generated.
Looks like a false alarm because the test bot didn't pick up the fixlet
to remove the page parameter from account_freepages():
https://lore.kernel.org/all/20240327185831.GB7597@cmpxchg.org/
It's right in Andrew's tree.
On 2024/4/8 15:38, Vlastimil Babka wrote:
> On 4/7/24 12:19 PM, Baolin Wang wrote:
>> On 2024/3/21 02:02, Johannes Weiner wrote:
>>>
>>> + account_freepages(page, zone, 1 << order, migratetype);
>>> +
>>> while (order < MAX_PAGE_ORDER) {
>>> - if (compaction_capture(capc, page, order, migratetype)) {
>>> - __mod_zone_freepage_state(zone, -(1 << order),
>>> - migratetype);
>>> + int buddy_mt = migratetype;
>>> +
>>> + if (compaction_capture(capc, page, order, migratetype))
>>> return;
>>> - }
>>
>> IIUC, if the released page is captured by compaction, then the
>> statistics for free pages should be correspondingly decreased,
>> otherwise, there will be a slight regression for my thpcompact benchmark.
>>
>> thpcompact Percentage Faults Huge
>> k6.9-rc2-base base + patch10 + 2 fixes
>> Percentage huge-1 78.18 ( 0.00%) 71.92 ( -8.01%)
>> Percentage huge-3 86.70 ( 0.00%) 86.07 ( -0.73%)
>> Percentage huge-5 90.26 ( 0.00%) 78.02 ( -13.57%)
>> Percentage huge-7 92.34 ( 0.00%) 78.67 ( -14.81%)
>> Percentage huge-12 91.18 ( 0.00%) 81.04 ( -11.12%)
>> Percentage huge-18 89.00 ( 0.00%) 79.57 ( -10.60%)
>> Percentage huge-24 90.52 ( 0.00%) 80.07 ( -11.54%)
>> Percentage huge-30 94.44 ( 0.00%) 96.28 ( 1.95%)
>> Percentage huge-32 93.09 ( 0.00%) 99.39 ( 6.77%)
>>
>> I add below fix based on your fix 2, then the thpcompact Percentage
>> looks good. How do you think for the fix?
>
> Yeah another well spotted, thanks. "slight regression" is an understatement,
Thanks for helping to confirm.
> this affects not just "statistics" but the very important NR_FREE_PAGES
> counter, which IIUC would eventually become larger than reality, make
> the watermark checks false positive and result in depleted reserves etc etc.
Right, agree.
> Actually wondering why we're not seeing -next failures already (or maybe I
> just haven't noticed).
>
>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>> index 8330c5c2de6b..2facf844ef84 100644
>> --- a/mm/page_alloc.c
>> +++ b/mm/page_alloc.c
>> @@ -805,8 +805,10 @@ static inline void __free_one_page(struct page *page,
>> while (order < MAX_PAGE_ORDER) {
>> int buddy_mt = migratetype;
>>
>> - if (compaction_capture(capc, page, order, migratetype))
>> + if (compaction_capture(capc, page, order, migratetype)) {
>> + account_freepages(zone, -(1 << order), migratetype);
>> return;
>> + }
>>
>> buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn);
>> if (!buddy)
>>
>> With my fix, the THP percentage looks better:
>> k6.9-rc2-base base + patch10 + 2 fixes +
>> my fix
>> Percentage huge-1 78.18 ( 0.00%) 82.83 ( 5.94%)
>> Percentage huge-3 86.70 ( 0.00%) 93.47 ( 7.81%)
>> Percentage huge-5 90.26 ( 0.00%) 94.73 ( 4.95%)
>> Percentage huge-7 92.34 ( 0.00%) 95.22 ( 3.12%)
>> Percentage huge-12 91.18 ( 0.00%) 92.40 ( 1.34%)
>> Percentage huge-18 89.00 ( 0.00%) 85.39 ( -4.06%)
>> Percentage huge-24 90.52 ( 0.00%) 94.70 ( 4.61%)
>> Percentage huge-30 94.44 ( 0.00%) 97.00 ( 2.71%)
>> Percentage huge-32 93.09 ( 0.00%) 92.87 ( -0.24%)
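To make the failure mode discussed above concrete, here is a small userspace
model of the counter drift -- purely illustrative, with made-up variable names
standing in for NR_FREE_PAGES and the freelists; nothing below is kernel code.
Every captured free page leaves the counter one chunk higher than reality, so
after enough captures a watermark comparison against the counter keeps passing
even though the freelists are empty:

#include <stdio.h>

int main(void)
{
        long nr_free = 0;               /* models the NR_FREE_PAGES counter */
        long on_freelists = 0;          /* what is actually on the freelists */
        const long chunk = 1 << 9;      /* one order-9 free that gets captured */
        const long watermark = 1024;

        for (int i = 0; i < 100; i++) {
                /* __free_one_page() accounts the page as free up front... */
                nr_free += chunk;
                on_freelists += chunk;

                /* ...but compaction capture takes it without the matching
                 * decrement (the leak Baolin's fix plugs). */
                on_freelists -= chunk;
                /* missing: nr_free -= chunk; */
        }

        printf("counter claims %ld free, freelists hold %ld\n",
               nr_free, on_freelists);
        printf("watermark check would pass: %d (reality: no free pages)\n",
               nr_free > watermark);
        return 0;
}

The fix in this thread removes the drift by subtracting the captured chunk at
the capture site.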
On 3/20/24 7:02 PM, Johannes Weiner wrote:
> Free page accounting currently happens a bit too high up the call
> stack, where it has to deal with guard pages, compaction capturing,
> block stealing and even page isolation. This is subtle and fragile,
> and makes it difficult to hack on the code.
>
> Now that type violations on the freelists have been fixed, push the
> accounting down to where pages enter and leave the freelist.
Awesome!
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Just some nits:
> @@ -1314,10 +1349,10 @@ static inline void expand(struct zone *zone, struct page *page,
> * Corresponding page table entries will not be touched,
> * pages will stay not present in virtual address space
> */
> - if (set_page_guard(zone, &page[size], high, migratetype))
> + if (set_page_guard(zone, &page[size], high))
> continue;
>
> - add_to_free_list(&page[size], zone, high, migratetype);
> + add_to_free_list(&page[size], zone, high, migratetype, false);
This is account_freepages() in the hot loop; what if we instead used
__add_to_free_list(), summed up nr_pages and called account_freepages() once
outside of the loop?
> set_buddy_order(&page[size], high);
> }
> }
<snip>
> diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> index 042937d5abe4..914a71c580d8 100644
> --- a/mm/page_isolation.c
> +++ b/mm/page_isolation.c
> @@ -252,7 +252,8 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
> * Isolating this block already succeeded, so this
> * should not fail on zone boundaries.
> */
> - WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
> + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page,
> + migratetype));
> } else {
> set_pageblock_migratetype(page, migratetype);
> __putback_isolated_page(page, order, migratetype);
Looks like a drive-by edit of an extra file just to adjust indentation.
On Wed, Mar 27, 2024 at 09:54:01AM +0100, Vlastimil Babka wrote:
> On 3/20/24 7:02 PM, Johannes Weiner wrote:
> > Free page accounting currently happens a bit too high up the call
> > stack, where it has to deal with guard pages, compaction capturing,
> > block stealing and even page isolation. This is subtle and fragile,
> > and makes it difficult to hack on the code.
> >
> > Now that type violations on the freelists have been fixed, push the
> > accounting down to where pages enter and leave the freelist.
>
> Awesome!
>
> > Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
>
> Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
>
> Just some nits:
>
> > @@ -1314,10 +1349,10 @@ static inline void expand(struct zone *zone, struct page *page,
> > * Corresponding page table entries will not be touched,
> > * pages will stay not present in virtual address space
> > */
> > - if (set_page_guard(zone, &page[size], high, migratetype))
> > + if (set_page_guard(zone, &page[size], high))
> > continue;
> >
> > - add_to_free_list(&page[size], zone, high, migratetype);
> > + add_to_free_list(&page[size], zone, high, migratetype, false);
>
> This is account_freepages() in the hot loop; what if we instead used
> __add_to_free_list(), summed up nr_pages and called account_freepages() once
> outside of the loop?
Good idea. I'll send a fixlet for that.
> > set_buddy_order(&page[size], high);
> > }
> > }
>
> <snip>
>
> > diff --git a/mm/page_isolation.c b/mm/page_isolation.c
> > index 042937d5abe4..914a71c580d8 100644
> > --- a/mm/page_isolation.c
> > +++ b/mm/page_isolation.c
> > @@ -252,7 +252,8 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
> > * Isolating this block already succeeded, so this
> > * should not fail on zone boundaries.
> > */
> > - WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
> > + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page,
> > + migratetype));
> > } else {
> > set_pageblock_migratetype(page, migratetype);
> > __putback_isolated_page(page, order, migratetype);
>
> Looks like a drive-by edit of an extra file just to adjust indentation.
Argh, yeah, I think an earlier version mucked with the signature and I
didn't undo that cleanly. I'll send a fixlet for that too.
Thanks for the review!
On Wed, Mar 27, 2024 at 09:54:01AM +0100, Vlastimil Babka wrote:
> > @@ -252,7 +252,8 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
> > * Isolating this block already succeeded, so this
> > * should not fail on zone boundaries.
> > */
> > - WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
> > + WARN_ON_ONCE(!move_freepages_block_isolate(zone, page,
> > + migratetype));
> > } else {
> > set_pageblock_migratetype(page, migratetype);
> > __putback_isolated_page(page, order, migratetype);
>
> Looks like a drive-by edit of an extra file just to adjust indentation.
Andrew, can you please fold the following fixlet?
From fa8dae0b39c5acfd4c0035c5a25b706b3a105497 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 27 Mar 2024 12:19:17 -0400
Subject: [PATCH 1/3] mm: page_alloc: consolidate free page accounting fix
undo unrelated drive-by line wrap
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/page_isolation.c | 3 +--
1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 914a71c580d8..042937d5abe4 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -252,8 +252,7 @@ static void unset_migratetype_isolate(struct page *page, int migratetype)
* Isolating this block already succeeded, so this
* should not fail on zone boundaries.
*/
- WARN_ON_ONCE(!move_freepages_block_isolate(zone, page,
- migratetype));
+ WARN_ON_ONCE(!move_freepages_block_isolate(zone, page, migratetype));
} else {
set_pageblock_migratetype(page, migratetype);
__putback_isolated_page(page, order, migratetype);
--
2.44.0
From d3a0b6847f8bf1ecfa6b08c758dfa5a86cfb18b8 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 27 Mar 2024 12:28:41 -0400
Subject: [PATCH 2/3] mm: page_alloc: consolidate free page accounting fix 2
remove unused page parameter from account_freepages()
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/page_alloc.c | 18 +++++++++---------
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 34c84ef16b66..8987e8869f6d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -643,8 +643,8 @@ compaction_capture(struct capture_control *capc, struct page *page,
}
#endif /* CONFIG_COMPACTION */
-static inline void account_freepages(struct page *page, struct zone *zone,
- int nr_pages, int migratetype)
+static inline void account_freepages(struct zone *zone, int nr_pages,
+ int migratetype)
{
if (is_migrate_isolate(migratetype))
return;
@@ -678,7 +678,7 @@ static inline void add_to_free_list(struct page *page, struct zone *zone,
bool tail)
{
__add_to_free_list(page, zone, order, migratetype, tail);
- account_freepages(page, zone, 1 << order, migratetype);
+ account_freepages(zone, 1 << order, migratetype);
}
/*
@@ -698,8 +698,8 @@ static inline void move_to_free_list(struct page *page, struct zone *zone,
list_move_tail(&page->buddy_list, &area->free_list[new_mt]);
- account_freepages(page, zone, -(1 << order), old_mt);
- account_freepages(page, zone, 1 << order, new_mt);
+ account_freepages(zone, -(1 << order), old_mt);
+ account_freepages(zone, 1 << order, new_mt);
}
static inline void __del_page_from_free_list(struct page *page, struct zone *zone,
@@ -723,7 +723,7 @@ static inline void del_page_from_free_list(struct page *page, struct zone *zone,
unsigned int order, int migratetype)
{
__del_page_from_free_list(page, zone, order, migratetype);
- account_freepages(page, zone, -(1 << order), migratetype);
+ account_freepages(zone, -(1 << order), migratetype);
}
static inline struct page *get_page_from_free_area(struct free_area *area,
@@ -800,7 +800,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- account_freepages(page, zone, 1 << order, migratetype);
+ account_freepages(zone, 1 << order, migratetype);
while (order < MAX_PAGE_ORDER) {
int buddy_mt = migratetype;
@@ -6930,7 +6930,7 @@ static bool try_to_accept_memory_one(struct zone *zone)
list_del(&page->lru);
last = list_empty(&zone->unaccepted_pages);
- account_freepages(page, zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
@@ -6982,7 +6982,7 @@ static bool __free_unaccepted(struct page *page)
spin_lock_irqsave(&zone->lock, flags);
first = list_empty(&zone->unaccepted_pages);
list_add_tail(&page->lru, &zone->unaccepted_pages);
- account_freepages(page, zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
spin_unlock_irqrestore(&zone->lock, flags);
--
2.44.0
On Wed, Mar 27, 2024 at 09:54:01AM +0100, Vlastimil Babka wrote:
> > @@ -1314,10 +1349,10 @@ static inline void expand(struct zone *zone, struct page *page,
> > * Corresponding page table entries will not be touched,
> > * pages will stay not present in virtual address space
> > */
> > - if (set_page_guard(zone, &page[size], high, migratetype))
> > + if (set_page_guard(zone, &page[size], high))
> > continue;
> >
> > - add_to_free_list(&page[size], zone, high, migratetype);
> > + add_to_free_list(&page[size], zone, high, migratetype, false);
>
> This is account_freepages() in the hot loop; what if we instead used
> __add_to_free_list(), summed up nr_pages and called account_freepages() once
> outside of the loop?
How about this on top of the series? Could be folded into
mm-page_alloc-consolidate-free-page-accounting, but for credit and
bisectability (just in case) I think stand-alone makes sense.
---
From 361f5df28183d85c7718fe0b579438d3d58777be Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 27 Mar 2024 12:29:25 -0400
Subject: [PATCH 3/3] mm: page_alloc: batch vmstat updates in expand()
expand() currently updates vmstat for every subpage. This is
unnecessary, since they're all of the same zone and migratetype.
Count added pages locally, then do a single vmstat update.
Suggested-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
---
mm/page_alloc.c | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8987e8869f6d..13fe5c612fbe 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1341,6 +1341,7 @@ static inline void expand(struct zone *zone, struct page *page,
int low, int high, int migratetype)
{
unsigned long size = 1 << high;
+ unsigned long nr_added = 0;
while (high > low) {
high--;
@@ -1356,9 +1357,11 @@ static inline void expand(struct zone *zone, struct page *page,
if (set_page_guard(zone, &page[size], high))
continue;
- add_to_free_list(&page[size], zone, high, migratetype, false);
+ __add_to_free_list(&page[size], zone, high, migratetype, false);
set_buddy_order(&page[size], high);
+ nr_added += size;
}
+ account_freepages(zone, nr_added, migratetype);
}
static void check_new_page_bad(struct page *page)
--
2.44.0
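As a quick sanity check of the arithmetic behind the single accounting call
(assuming no guard pages are set, so nothing is skipped): splitting one block
of order high down to order low adds 2^high - 2^low pages to the freelists,
which is exactly the nr_added sum the patch accounts once. A standalone,
illustrative userspace snippet:

#include <stdio.h>

int main(void)
{
        unsigned int low = 0, high = 9;
        unsigned long size = 1UL << high;
        unsigned long nr_added = 0;

        /* mirrors the loop structure in the patched expand() */
        while (high > low) {
                high--;
                size >>= 1;
                nr_added += size;
        }

        printf("nr_added = %lu, expected 2^9 - 2^0 = %lu\n",
               nr_added, (1UL << 9) - (1UL << 0));
        return 0;
}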
On 3/27/24 8:01 PM, Johannes Weiner wrote:
> On Wed, Mar 27, 2024 at 09:54:01AM +0100, Vlastimil Babka wrote:
>> > @@ -1314,10 +1349,10 @@ static inline void expand(struct zone *zone, struct page *page,
>> > * Corresponding page table entries will not be touched,
>> > * pages will stay not present in virtual address space
>> > */
>> > - if (set_page_guard(zone, &page[size], high, migratetype))
>> > + if (set_page_guard(zone, &page[size], high))
>> > continue;
>> >
>> > - add_to_free_list(&page[size], zone, high, migratetype);
>> > + add_to_free_list(&page[size], zone, high, migratetype, false);
>>
>> This is account_freepages() in the hot loop; what if we instead used
>> __add_to_free_list(), summed up nr_pages and called account_freepages() once
>> outside of the loop?
>
> How about this on top of the series? Could be folded into
> mm-page_alloc-consolidate-free-page-accounting, but for credit and
> bisectability (just in case) I think stand-alone makes sense.
>
> ---
>
> From 361f5df28183d85c7718fe0b579438d3d58777be Mon Sep 17 00:00:00 2001
> From: Johannes Weiner <hannes@cmpxchg.org>
> Date: Wed, 27 Mar 2024 12:29:25 -0400
> Subject: [PATCH 3/3] mm: page_alloc: batch vmstat updates in expand()
>
> expand() currently updates vmstat for every subpage. This is
> unnecessary, since they're all of the same zone and migratetype.
>
> Count added pages locally, then do a single vmstat update.
>
> Suggested-by: Vlastimil Babka <vbabka@suse.cz>
> Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
> ---
> mm/page_alloc.c | 5 ++++-
> 1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 8987e8869f6d..13fe5c612fbe 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1341,6 +1341,7 @@ static inline void expand(struct zone *zone, struct page *page,
> int low, int high, int migratetype)
> {
> unsigned long size = 1 << high;
> + unsigned long nr_added = 0;
>
> while (high > low) {
> high--;
> @@ -1356,9 +1357,11 @@ static inline void expand(struct zone *zone, struct page *page,
> if (set_page_guard(zone, &page[size], high))
> continue;
>
> - add_to_free_list(&page[size], zone, high, migratetype, false);
> + __add_to_free_list(&page[size], zone, high, migratetype, false);
> set_buddy_order(&page[size], high);
> + nr_added += size;
> }
> + account_freepages(zone, nr_added, migratetype);
> }
>
> static void check_new_page_bad(struct page *page)