[PATCH RFC v1 2/2] mm, pcp: add more detail info about high order page count

Vern Hao posted 2 patches 1 year, 11 months ago
[PATCH RFC v1 2/2] mm, pcp: add more detail info about high order page count
Posted by Vern Hao 1 year, 11 months ago
From: Xin Hao <vernhao@tencent.com>

With this patch, /proc/zoneinfo shows how many pages of each order are held
on each CPU's per-cpu page lists, for example:
	#cat /proc/zoneinfo
    ....
    cpu: 2
              total_count: 14286
                  order0 : 1260
                  order1 : 13
                  order2 : 42
                  order3 : 4
                  order4 : 0
                  order5 : 0
                  order6 : 0
                  order7 : 0
                  order8 : 0
                  order9 : 25
                  order10: 0
                  order11: 0
                  order12: 0
              high:  14541
              batch: 63

Signed-off-by: Xin Hao <vernhao@tencent.com>
---
 include/linux/mmzone.h |  1 +
 mm/page_alloc.c        |  4 ++++
 mm/vmstat.c            | 18 ++++++++++++------
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 883168776fea..55d25b4f51e5 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -684,6 +684,7 @@ enum zone_watermarks {
 struct per_cpu_pages {
 	spinlock_t lock;	/* Protects lists field */
 	int total_count;	/* total number of pages in the list */
+	int count[NR_PCP_LISTS]; /* per-order page counts */
 	int high;		/* high watermark, emptying needed */
 	int high_min;		/* min high watermark */
 	int high_max;		/* max high watermark */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 4e91e429b8d1..7ec2dc5c5ea5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1228,6 +1228,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			list_del(&page->pcp_list);
 			count -= nr_pages;
 			pcp->total_count -= nr_pages;
+			pcp->count[order] -= 1;
 
 			/* MIGRATE_ISOLATE page should not go to pcplists */
 			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
@@ -2478,6 +2479,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
 	pindex = order_to_pindex(migratetype, order);
 	list_add(&page->pcp_list, &pcp->lists[pindex]);
 	pcp->total_count += 1 << order;
+	pcp->count[order] += 1;
 
 	batch = READ_ONCE(pcp->batch);
 	/*
@@ -2858,6 +2860,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 					migratetype, alloc_flags);
 
 			pcp->total_count += alloced << order;
+			pcp->count[order] += alloced;
 			if (unlikely(list_empty(list)))
 				return NULL;
 		}
@@ -2865,6 +2868,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
 		page = list_first_entry(list, struct page, pcp_list);
 		list_del(&page->pcp_list);
 		pcp->total_count -= 1 << order;
+		pcp->count[order] -= 1;
 	} while (check_new_pages(page, order));
 
 	return page;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c1e8096ff0a6..e04300ec450f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1735,19 +1735,25 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 
 	seq_printf(m, "\n  pagesets");
 	for_each_online_cpu(i) {
+		int j;
 		struct per_cpu_pages *pcp;
 		struct per_cpu_zonestat __maybe_unused *pzstats;
 
 		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
 		seq_printf(m,
 			   "\n    cpu: %i"
-			   "\n              count: %i"
-			   "\n              high:  %i"
-			   "\n              batch: %i",
+			   "\n              total_count: %i",
 			   i,
-			   pcp->total_count,
-			   pcp->high,
-			   pcp->batch);
+			   pcp->total_count);
+		for (j = 0; j < NR_PCP_LISTS; j++)
+			seq_printf(m,
+				   "\n                  order%-2i: %-3i",
+				   j, pcp->count[j]);
+		seq_printf(m,
+                          "\n              high:  %i"
+                          "\n              batch: %i",
+                          pcp->high,
+                          pcp->batch);
 #ifdef CONFIG_SMP
 		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
 		seq_printf(m, "\n  vm stats threshold: %d",
-- 
2.31.1
Re: [PATCH RFC v1 2/2] mm, pcp: add more detail info about high order page count
Posted by Mel Gorman 1 year, 11 months ago
On Mon, Jan 15, 2024 at 05:34:36PM +0800, Vern Hao wrote:
> From: Xin Hao <vernhao@tencent.com>
> 
> With this patch, /proc/zoneinfo shows how many pages of each order are held
> on each CPU's per-cpu page lists, for example:
> 	#cat /proc/zoneinfo
>     ....
>     cpu: 2
>               total_count: 14286
>                   order0 : 1260
>                   order1 : 13
>                   order2 : 42
>                   order3 : 4
>                   order4 : 0
>                   order5 : 0
>                   order6 : 0
>                   order7 : 0
>                   order8 : 0
>                   order9 : 25
>                   order10: 0
>                   order11: 0
>                   order12: 0
>               high:  14541
>               batch: 63
> 
> Signed-off-by: Xin Hao <vernhao@tencent.com>

I am not a major fan because increasing the size of a per-cpu structure for
debugging purposes incurs a cost for everyone while only a tiny minority
may care. There is a mild risk it would break existing parsers of that file,
although maybe that's not a big deal. However, the same information could be
extracted by locking the pcp structures and counting the items per list. It
would increase the cost of reading zoneinfo, but it's unlikely the file is
read at high frequency. If that were a concern, a separate proc file could be
used. Finally, the same information can likely be extracted via a systemtap
script, a BPF script (if it can get to the right symbols and locking, I
didn't check) or via a kernel probe. Even with that information, it's not
clear what meaningful action a user can take, so this is really a
developer-only feature with a cost incurred by everybody.

-- 
Mel Gorman
SUSE Labs
Re: [PATCH RFC v1 2/2] mm, pcp: add more detail info about high order page count
Posted by David Rientjes 1 year, 11 months ago
On Mon, 15 Jan 2024, Vern Hao wrote:

> From: Xin Hao <vernhao@tencent.com>
> 
> With this patch, /proc/zoneinfo shows how many pages of each order are held
> on each CPU's per-cpu page lists, for example:
> 	#cat /proc/zoneinfo
>     ....
>     cpu: 2
>               total_count: 14286

I don't think we should be changing the naming of the field if there are 
existing users that parse /proc/zoneinfo.

>                   order0 : 1260
>                   order1 : 13
>                   order2 : 42
>                   order3 : 4
>                   order4 : 0
>                   order5 : 0
>                   order6 : 0
>                   order7 : 0
>                   order8 : 0
>                   order9 : 25
>                   order10: 0
>                   order11: 0
>                   order12: 0
>               high:  14541
>               batch: 63
> 
> Signed-off-by: Xin Hao <vernhao@tencent.com>
> ---
>  include/linux/mmzone.h |  1 +
>  mm/page_alloc.c        |  4 ++++
>  mm/vmstat.c            | 18 ++++++++++++------
>  3 files changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 883168776fea..55d25b4f51e5 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -684,6 +684,7 @@ enum zone_watermarks {
>  struct per_cpu_pages {
>  	spinlock_t lock;	/* Protects lists field */
>  	int total_count;	/* total number of pages in the list */
> +	int count[NR_PCP_LISTS]; /* per-order page counts */
>  	int high;		/* high watermark, emptying needed */
>  	int high_min;		/* min high watermark */
>  	int high_max;		/* max high watermark */
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 4e91e429b8d1..7ec2dc5c5ea5 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -1228,6 +1228,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
>  			list_del(&page->pcp_list);
>  			count -= nr_pages;
>  			pcp->total_count -= nr_pages;
> +			pcp->count[order] -= 1;
>  
>  			/* MIGRATE_ISOLATE page should not go to pcplists */
>  			VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
> @@ -2478,6 +2479,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp,
>  	pindex = order_to_pindex(migratetype, order);
>  	list_add(&page->pcp_list, &pcp->lists[pindex]);
>  	pcp->total_count += 1 << order;
> +	pcp->count[order] += 1;
>  
>  	batch = READ_ONCE(pcp->batch);
>  	/*
> @@ -2858,6 +2860,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
>  					migratetype, alloc_flags);
>  
>  			pcp->total_count += alloced << order;
> +			pcp->count[order] += alloced;
>  			if (unlikely(list_empty(list)))
>  				return NULL;
>  		}
> @@ -2865,6 +2868,7 @@ struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
>  		page = list_first_entry(list, struct page, pcp_list);
>  		list_del(&page->pcp_list);
>  		pcp->total_count -= 1 << order;
> +		pcp->count[order] -= 1;
>  	} while (check_new_pages(page, order));
>  
>  	return page;
> diff --git a/mm/vmstat.c b/mm/vmstat.c
> index c1e8096ff0a6..e04300ec450f 100644
> --- a/mm/vmstat.c
> +++ b/mm/vmstat.c
> @@ -1735,19 +1735,25 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
>  
>  	seq_printf(m, "\n  pagesets");
>  	for_each_online_cpu(i) {
> +		int j;
>  		struct per_cpu_pages *pcp;
>  		struct per_cpu_zonestat __maybe_unused *pzstats;
>  
>  		pcp = per_cpu_ptr(zone->per_cpu_pageset, i);
>  		seq_printf(m,
>  			   "\n    cpu: %i"
> -			   "\n              count: %i"
> -			   "\n              high:  %i"
> -			   "\n              batch: %i",
> +			   "\n              total_count: %i",
>  			   i,
> -			   pcp->total_count,
> -			   pcp->high,
> -			   pcp->batch);
> +			   pcp->total_count);
> +		for (j = 0; j < NR_PCP_LISTS; j++)
> +			seq_printf(m,
> +				   "\n                  order%-2i: %-3i",
> +				   j, pcp->count[j]);
> +		seq_printf(m,
> +                          "\n              high:  %i"
> +                          "\n              batch: %i",
> +                          pcp->high,
> +                          pcp->batch);
>  #ifdef CONFIG_SMP
>  		pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i);
>  		seq_printf(m, "\n  vm stats threshold: %d",
> -- 
> 2.31.1
> 
> 
>