[v1] khugepaged: Asynchronous mTHP collapse

[RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Dev Jain 1 year, 1 month ago

Pass order to alloc_charge_folio() and update mTHP statistics.

Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 include/linux/huge_mm.h |  2 ++
 mm/huge_memory.c        |  4 ++++
 mm/khugepaged.c         | 13 +++++++++----
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 93e509b6c00e..8b6d0fed99b3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -119,6 +119,8 @@ enum mthp_stat_item {
 	MTHP_STAT_ANON_FAULT_ALLOC,
 	MTHP_STAT_ANON_FAULT_FALLBACK,
 	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
+	MTHP_STAT_ANON_COLLAPSE_ALLOC,
+	MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED,
 	MTHP_STAT_ZSWPOUT,
 	MTHP_STAT_SWPIN,
 	MTHP_STAT_SWPIN_FALLBACK,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2da5520bfe24..2e582fad4c77 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -615,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
 DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
 DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc, MTHP_STAT_ANON_COLLAPSE_ALLOC);
+DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc_failed, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
 DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
 DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
 DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
@@ -636,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
 	&anon_fault_alloc_attr.attr,
 	&anon_fault_fallback_attr.attr,
 	&anon_fault_fallback_charge_attr.attr,
+	&anon_collapse_alloc_attr.attr,
+	&anon_collapse_alloc_failed_attr.attr,
 #ifndef CONFIG_SHMEM
 	&zswpout_attr.attr,
 	&swpin_attr.attr,
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 95643e6e5f31..02cd424b8e48 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1073,21 +1073,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
 }
 
 static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
-			      struct collapse_control *cc)
+			      int order, struct collapse_control *cc)
 {
 	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
 		     GFP_TRANSHUGE);
 	int node = hpage_collapse_find_target_node(cc);
 	struct folio *folio;
 
-	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
+	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
 	if (!folio) {
 		*foliop = NULL;
 		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
+		if (order != HPAGE_PMD_ORDER)
+			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
 		return SCAN_ALLOC_HUGE_PAGE_FAIL;
 	}
 
 	count_vm_event(THP_COLLAPSE_ALLOC);
+	if (order != HPAGE_PMD_ORDER)
+		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);
+
 	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
 		folio_put(folio);
 		*foliop = NULL;
@@ -1124,7 +1129,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	 */
 	mmap_read_unlock(mm);
 
-	result = alloc_charge_folio(&folio, mm, cc);
+	result = alloc_charge_folio(&folio, mm, order, cc);
 	if (result != SCAN_SUCCEED)
 		goto out_nolock;
 
@@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
 	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
 	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
 
-	result = alloc_charge_folio(&new_folio, mm, cc);
+	result = alloc_charge_folio(&new_folio, mm, HPAGE_PMD_ORDER, cc);
 	if (result != SCAN_SUCCEED)
 		goto out;
 
-- 
2.30.2

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Ryan Roberts 1 year, 1 month ago

On 16/12/2024 16:50, Dev Jain wrote:
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>  include/linux/huge_mm.h |  2 ++
>  mm/huge_memory.c        |  4 ++++
>  mm/khugepaged.c         | 13 +++++++++----
>  3 files changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 93e509b6c00e..8b6d0fed99b3 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -119,6 +119,8 @@ enum mthp_stat_item {
>  	MTHP_STAT_ANON_FAULT_ALLOC,
>  	MTHP_STAT_ANON_FAULT_FALLBACK,
>  	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> +	MTHP_STAT_ANON_COLLAPSE_ALLOC,
> +	MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED,
>  	MTHP_STAT_ZSWPOUT,
>  	MTHP_STAT_SWPIN,
>  	MTHP_STAT_SWPIN_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2da5520bfe24..2e582fad4c77 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -615,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>  DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>  DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc, MTHP_STAT_ANON_COLLAPSE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc_failed, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>  DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
>  DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
>  DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
> @@ -636,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
>  	&anon_fault_alloc_attr.attr,
>  	&anon_fault_fallback_attr.attr,
>  	&anon_fault_fallback_charge_attr.attr,
> +	&anon_collapse_alloc_attr.attr,
> +	&anon_collapse_alloc_failed_attr.attr,
>  #ifndef CONFIG_SHMEM
>  	&zswpout_attr.attr,
>  	&swpin_attr.attr,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 95643e6e5f31..02cd424b8e48 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1073,21 +1073,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>  }
>  
>  static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> -			      struct collapse_control *cc)
> +			      int order, struct collapse_control *cc)
>  {
>  	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
>  		     GFP_TRANSHUGE);
>  	int node = hpage_collapse_find_target_node(cc);
>  	struct folio *folio;
>  
> -	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
> +	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>  	if (!folio) {
>  		*foliop = NULL;
>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> +		if (order != HPAGE_PMD_ORDER)
> +			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);

Bug? We should be calling count_mthp_stat() for all orders, but only calling
count_vm_event(THP_*) for PMD_ORDER, as per pattern laid out by other mTHP stats.

The aim is for existing THP stats (which are implicitly only counting PMD-sized
THP) to continue only to count PMD-sized THP. It's a userspace ABI and we were
scared of the potential to break things if we changed the existing counters'
semantics.

>  		return SCAN_ALLOC_HUGE_PAGE_FAIL;
>  	}
>  
>  	count_vm_event(THP_COLLAPSE_ALLOC);
> +	if (order != HPAGE_PMD_ORDER)
> +		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);

Same problem.

Also, I agree with Baolin that we don't want "anon" in the title. This is a
generic path used for file-backed memory. So once you fix the bug, the new stats
will also be counting the file-backed memory too (although for now, only for
PMD_ORDER).

> +
>  	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>  		folio_put(folio);
>  		*foliop = NULL;
> @@ -1124,7 +1129,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>  	 */
>  	mmap_read_unlock(mm);
>  
> -	result = alloc_charge_folio(&folio, mm, cc);
> +	result = alloc_charge_folio(&folio, mm, order, cc);

Where is order coming from? I'm guessing that's added later, so this patch won't
compile on it's own? Perhaps HPAGE_PMD_ORDER for now?

>  	if (result != SCAN_SUCCEED)
>  		goto out_nolock;
>  
> @@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>  	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
>  	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>  
> -	result = alloc_charge_folio(&new_folio, mm, cc);
> +	result = alloc_charge_folio(&new_folio, mm, HPAGE_PMD_ORDER, cc);
>  	if (result != SCAN_SUCCEED)
>  		goto out;
>

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Dev Jain 1 year, 1 month ago

On 17/12/24 12:23 pm, Ryan Roberts wrote:
> On 16/12/2024 16:50, Dev Jain wrote:
>> Pass order to alloc_charge_folio() and update mTHP statistics.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  2 ++
>>   mm/huge_memory.c        |  4 ++++
>>   mm/khugepaged.c         | 13 +++++++++----
>>   3 files changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 93e509b6c00e..8b6d0fed99b3 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -119,6 +119,8 @@ enum mthp_stat_item {
>>   	MTHP_STAT_ANON_FAULT_ALLOC,
>>   	MTHP_STAT_ANON_FAULT_FALLBACK,
>>   	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>> +	MTHP_STAT_ANON_COLLAPSE_ALLOC,
>> +	MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED,
>>   	MTHP_STAT_ZSWPOUT,
>>   	MTHP_STAT_SWPIN,
>>   	MTHP_STAT_SWPIN_FALLBACK,
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2da5520bfe24..2e582fad4c77 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -615,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc, MTHP_STAT_ANON_COLLAPSE_ALLOC);
>> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc_failed, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>>   DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
>>   DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
>>   DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
>> @@ -636,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
>>   	&anon_fault_alloc_attr.attr,
>>   	&anon_fault_fallback_attr.attr,
>>   	&anon_fault_fallback_charge_attr.attr,
>> +	&anon_collapse_alloc_attr.attr,
>> +	&anon_collapse_alloc_failed_attr.attr,
>>   #ifndef CONFIG_SHMEM
>>   	&zswpout_attr.attr,
>>   	&swpin_attr.attr,
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 95643e6e5f31..02cd424b8e48 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1073,21 +1073,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>>   }
>>   
>>   static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>> -			      struct collapse_control *cc)
>> +			      int order, struct collapse_control *cc)
>>   {
>>   	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
>>   		     GFP_TRANSHUGE);
>>   	int node = hpage_collapse_find_target_node(cc);
>>   	struct folio *folio;
>>   
>> -	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
>> +	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>>   	if (!folio) {
>>   		*foliop = NULL;
>>   		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>> +		if (order != HPAGE_PMD_ORDER)
>> +			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
> Bug? We should be calling count_mthp_stat() for all orders, but only calling
> count_vm_event(THP_*) for PMD_ORDER, as per pattern laid out by other mTHP stats.

Ah okay.

>
> The aim is for existing THP stats (which are implicitly only counting PMD-sized
> THP) to continue only to count PMD-sized THP. It's a userspace ABI and we were
> scared of the potential to break things if we changed the existing counters'
> semantics.
>
>>   		return SCAN_ALLOC_HUGE_PAGE_FAIL;
>>   	}
>>   
>>   	count_vm_event(THP_COLLAPSE_ALLOC);
>> +	if (order != HPAGE_PMD_ORDER)
>> +		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);
> Same problem.
>
> Also, I agree with Baolin that we don't want "anon" in the title. This is a
> generic path used for file-backed memory. So once you fix the bug, the new stats
> will also be counting the file-backed memory too (although for now, only for
> PMD_ORDER).

Sure.

>> +
>>   	if (unlikely(mem_cgroup_charge(folio, mm, gfp))) {
>>   		folio_put(folio);
>>   		*foliop = NULL;
>> @@ -1124,7 +1129,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
>>   	 */
>>   	mmap_read_unlock(mm);
>>   
>> -	result = alloc_charge_folio(&folio, mm, cc);
>> +	result = alloc_charge_folio(&folio, mm, order, cc);
> Where is order coming from? I'm guessing that's added later, so this patch won't
> compile on it's own? Perhaps HPAGE_PMD_ORDER for now?

Okay yes, this won't compile on its own. I'll ensure sequential buildability next time.

>
>>   	if (result != SCAN_SUCCEED)
>>   		goto out_nolock;
>>   
>> @@ -1850,7 +1855,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
>>   	VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
>>   	VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
>>   
>> -	result = alloc_charge_folio(&new_folio, mm, cc);
>> +	result = alloc_charge_folio(&new_folio, mm, HPAGE_PMD_ORDER, cc);
>>   	if (result != SCAN_SUCCEED)
>>   		goto out;
>>   
>

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Matthew Wilcox 1 year, 1 month ago

On Mon, Dec 16, 2024 at 10:20:55PM +0530, Dev Jain wrote:
>  static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> -			      struct collapse_control *cc)
> +			      int order, struct collapse_control *cc)

unsigned, surely?

>  	if (!folio) {
>  		*foliop = NULL;
>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> +		if (order != HPAGE_PMD_ORDER)
> +			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);

i don't understand why we need new statistics here.  we already have a
signal that memory allocation failures are preventing collapse from
being successful, why do we care if it's mthp or actual thp?

>  	count_vm_event(THP_COLLAPSE_ALLOC);
> +	if (order != HPAGE_PMD_ORDER)
> +		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);

similar question

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Ryan Roberts 1 year, 1 month ago

On 17/12/2024 04:17, Matthew Wilcox wrote:
> On Mon, Dec 16, 2024 at 10:20:55PM +0530, Dev Jain wrote:
>>  static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>> -			      struct collapse_control *cc)
>> +			      int order, struct collapse_control *cc)
> 
> unsigned, surely?

I'm obviously feeling argumentative this morning...

There are plenty of examples of order being signed and unsigned in the code
base... it's a mess. Certainly the mTHP changes up to now have opted for
(signed) int. And get_order(), which I would assume to the authority, returns
(signed) int.

Personally I prefer int for small positive integers; it's more compact. But if
we're trying to establish a pattern to use unsigned int for all new uses of
order, that fine too, let's just document it somewhere?

> 
>>  	if (!folio) {
>>  		*foliop = NULL;
>>  		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>> +		if (order != HPAGE_PMD_ORDER)
>> +			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
> 
> i don't understand why we need new statistics here.  we already have a
> signal that memory allocation failures are preventing collapse from
> being successful, why do we care if it's mthp or actual thp?

We previously decided that all existing THP stats would continue to only count
PMD-sized THP for fear of breaking userspace in subtle ways, and instead would
introduce new mTHP stats that can count for each order. We already have
MTHP_STAT_ANON_FAULT_ALLOC and MTHP_STAT_ANON_FAULT_FALLBACK (amongst others) so
these new stats fit the pattern well, IMHO.

(except for the bug that I called out in the other mail; we should call
count_mthp_stat() for all orders and call count_vm_event() only for PMD_ORDER).

> 
>>  	count_vm_event(THP_COLLAPSE_ALLOC);
>> +	if (order != HPAGE_PMD_ORDER)
>> +		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);
> 
> similar question
>

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Christoph Lameter (Ampere) 1 year, 1 month ago

On Tue, 17 Dec 2024, Ryan Roberts wrote:

> We previously decided that all existing THP stats would continue to only count
> PMD-sized THP for fear of breaking userspace in subtle ways, and instead would
> introduce new mTHP stats that can count for each order. We already have
> MTHP_STAT_ANON_FAULT_ALLOC and MTHP_STAT_ANON_FAULT_FALLBACK (amongst others) so
> these new stats fit the pattern well, IMHO.

Could we move all the stats somewhere into sysfs where we can get them by
page order? /proc/vmstat keeps getting new counters.

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Ryan Roberts 1 year, 1 month ago

On 20/12/2024 17:41, Christoph Lameter (Ampere) wrote:
> On Tue, 17 Dec 2024, Ryan Roberts wrote:
> 
>> We previously decided that all existing THP stats would continue to only count
>> PMD-sized THP for fear of breaking userspace in subtle ways, and instead would
>> introduce new mTHP stats that can count for each order. We already have
>> MTHP_STAT_ANON_FAULT_ALLOC and MTHP_STAT_ANON_FAULT_FALLBACK (amongst others) so
>> these new stats fit the pattern well, IMHO.
> 
> Could we move all the stats somewhere into sysfs where we can get them by
> page order? /proc/vmstat keeps getting new counters.

This is exactly what has been done already for mthp stats. They live at:

/sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/.

So there is a directory per size, and there is a file per stat.

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Christoph Lameter (Ampere) 1 year, 1 month ago

On Fri, 20 Dec 2024, Ryan Roberts wrote:

> > Could we move all the stats somewhere into sysfs where we can get them by
> > page order? /proc/vmstat keeps getting new counters.
>
> This is exactly what has been done already for mthp stats. They live at:
>
> /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/.
>
> So there is a directory per size, and there is a file per stat.

Then lets drop all THP and huge page counters from vmstat.

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Ryan Roberts 1 year, 1 month ago

Sorry I missed this before heading off for Christmas...

On 20/12/2024 18:47, Christoph Lameter (Ampere) wrote:
> On Fri, 20 Dec 2024, Ryan Roberts wrote:
> 
>>> Could we move all the stats somewhere into sysfs where we can get them by
>>> page order? /proc/vmstat keeps getting new counters.
>>
>> This is exactly what has been done already for mthp stats. They live at:
>>
>> /sys/kernel/mm/transparent_hugepage/hugepages-<size>kB/stats/.
>>
>> So there is a directory per size, and there is a file per stat.
> 
> Then lets drop all THP and huge page counters from vmstat.

Previous discussion concluded that we can't remove counters from vmstat for fear
of breaking user space. So the policy is that vmstat THP entries should remain,
but they continue to only count PMD-sized THP.

The PMD-sized THP counters are effectively duplicated at:

/sys/kernel/mm/transparent_hugepage/hugepages-2048kB/stats/

(or whatever your PMD size happens to be).

Thanks,
Ryan

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Zi Yan 1 year, 1 month ago

On 17 Dec 2024, at 2:09, Ryan Roberts wrote:

> On 17/12/2024 04:17, Matthew Wilcox wrote:
>> On Mon, Dec 16, 2024 at 10:20:55PM +0530, Dev Jain wrote:
>>>  static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>>> -			      struct collapse_control *cc)
>>> +			      int order, struct collapse_control *cc)
>>
>> unsigned, surely?
>
> I'm obviously feeling argumentative this morning...
>
> There are plenty of examples of order being signed and unsigned in the code
> base... it's a mess. Certainly the mTHP changes up to now have opted for
> (signed) int. And get_order(), which I would assume to the authority, returns
> (signed) int.
>
> Personally I prefer int for small positive integers; it's more compact. But if
> we're trying to establish a pattern to use unsigned int for all new uses of
> order, that fine too, let's just document it somewhere?

If unsigned is used, I wonder how to handle
for (unsigned order = 9; order >= 0; order--) case. We will need a signed order to
make this work, right?

--
Best Regards,
Yan, Zi

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Baolin Wang 1 year, 1 month ago


On 2024/12/17 00:50, Dev Jain wrote:
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>   include/linux/huge_mm.h |  2 ++
>   mm/huge_memory.c        |  4 ++++
>   mm/khugepaged.c         | 13 +++++++++----
>   3 files changed, 15 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 93e509b6c00e..8b6d0fed99b3 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -119,6 +119,8 @@ enum mthp_stat_item {
>   	MTHP_STAT_ANON_FAULT_ALLOC,
>   	MTHP_STAT_ANON_FAULT_FALLBACK,
>   	MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
> +	MTHP_STAT_ANON_COLLAPSE_ALLOC,
> +	MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED,
>   	MTHP_STAT_ZSWPOUT,
>   	MTHP_STAT_SWPIN,
>   	MTHP_STAT_SWPIN_FALLBACK,
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 2da5520bfe24..2e582fad4c77 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -615,6 +615,8 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
>   DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc, MTHP_STAT_ANON_COLLAPSE_ALLOC);
> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc_failed, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>   DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
>   DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
>   DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
> @@ -636,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
>   	&anon_fault_alloc_attr.attr,
>   	&anon_fault_fallback_attr.attr,
>   	&anon_fault_fallback_charge_attr.attr,
> +	&anon_collapse_alloc_attr.attr,
> +	&anon_collapse_alloc_failed_attr.attr,
>   #ifndef CONFIG_SHMEM
>   	&zswpout_attr.attr,
>   	&swpin_attr.attr,
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 95643e6e5f31..02cd424b8e48 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1073,21 +1073,26 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm,
>   }
>   
>   static int alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
> -			      struct collapse_control *cc)
> +			      int order, struct collapse_control *cc)
>   {
>   	gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
>   		     GFP_TRANSHUGE);
>   	int node = hpage_collapse_find_target_node(cc);
>   	struct folio *folio;
>   
> -	folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
> +	folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>   	if (!folio) {
>   		*foliop = NULL;
>   		count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
> +		if (order != HPAGE_PMD_ORDER)
> +			count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>   		return SCAN_ALLOC_HUGE_PAGE_FAIL;
>   	}
>   
>   	count_vm_event(THP_COLLAPSE_ALLOC);
> +	if (order != HPAGE_PMD_ORDER)
> +		count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);

File collapse will also call alloc_charge_folio() to allocate THP, so 
using term '_ANON_' is not suitable for both anon and file collapse.

Re: [RFC PATCH 02/12] khugepaged: Generalize alloc_charge_folio()

Posted by Dev Jain 1 year, 1 month ago

On 17/12/24 8:21 am, Baolin Wang wrote:
>
>
> On 2024/12/17 00:50, Dev Jain wrote:
>> Pass order to alloc_charge_folio() and update mTHP statistics.
>>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   include/linux/huge_mm.h |  2 ++
>>   mm/huge_memory.c        |  4 ++++
>>   mm/khugepaged.c         | 13 +++++++++----
>>   3 files changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 93e509b6c00e..8b6d0fed99b3 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -119,6 +119,8 @@ enum mthp_stat_item {
>>       MTHP_STAT_ANON_FAULT_ALLOC,
>>       MTHP_STAT_ANON_FAULT_FALLBACK,
>>       MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE,
>> +    MTHP_STAT_ANON_COLLAPSE_ALLOC,
>> +    MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED,
>>       MTHP_STAT_ZSWPOUT,
>>       MTHP_STAT_SWPIN,
>>       MTHP_STAT_SWPIN_FALLBACK,
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 2da5520bfe24..2e582fad4c77 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -615,6 +615,8 @@ static struct kobj_attribute _name##_attr = 
>> __ATTR_RO(_name)
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC);
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, 
>> MTHP_STAT_ANON_FAULT_FALLBACK);
>>   DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, 
>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc, 
>> MTHP_STAT_ANON_COLLAPSE_ALLOC);
>> +DEFINE_MTHP_STAT_ATTR(anon_collapse_alloc_failed, 
>> MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>>   DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT);
>>   DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN);
>>   DEFINE_MTHP_STAT_ATTR(swpin_fallback, MTHP_STAT_SWPIN_FALLBACK);
>> @@ -636,6 +638,8 @@ static struct attribute *anon_stats_attrs[] = {
>>       &anon_fault_alloc_attr.attr,
>>       &anon_fault_fallback_attr.attr,
>>       &anon_fault_fallback_charge_attr.attr,
>> +    &anon_collapse_alloc_attr.attr,
>> +    &anon_collapse_alloc_failed_attr.attr,
>>   #ifndef CONFIG_SHMEM
>>       &zswpout_attr.attr,
>>       &swpin_attr.attr,
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 95643e6e5f31..02cd424b8e48 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1073,21 +1073,26 @@ static int __collapse_huge_page_swapin(struct 
>> mm_struct *mm,
>>   }
>>     static int alloc_charge_folio(struct folio **foliop, struct 
>> mm_struct *mm,
>> -                  struct collapse_control *cc)
>> +                  int order, struct collapse_control *cc)
>>   {
>>       gfp_t gfp = (cc->is_khugepaged ? 
>> alloc_hugepage_khugepaged_gfpmask() :
>>                GFP_TRANSHUGE);
>>       int node = hpage_collapse_find_target_node(cc);
>>       struct folio *folio;
>>   -    folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, 
>> &cc->alloc_nmask);
>> +    folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>>       if (!folio) {
>>           *foliop = NULL;
>>           count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>> +        if (order != HPAGE_PMD_ORDER)
>> +            count_mthp_stat(order, 
>> MTHP_STAT_ANON_COLLAPSE_ALLOC_FAILED);
>>           return SCAN_ALLOC_HUGE_PAGE_FAIL;
>>       }
>>         count_vm_event(THP_COLLAPSE_ALLOC);
>> +    if (order != HPAGE_PMD_ORDER)
>> +        count_mthp_stat(order, MTHP_STAT_ANON_COLLAPSE_ALLOC);
>
> File collapse will also call alloc_charge_folio() to allocate THP, so 
> using term '_ANON_' is not suitable for both anon and file collapse.

But currently file collapse will only allocate a PMD-folio, and I have 
extended to mTHP only for anon, so it makes sense? Although
I get your point in general...