can_split_folio() is just a refcount comparison, making sure only the
split caller holds an extra pin. Open code it with
folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
used by folio_ref_freeze(), add folio_cache_references() to calculate it.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
include/linux/huge_mm.h | 1 -
mm/huge_memory.c | 43 ++++++++++++++++-------------------------
mm/vmscan.c | 3 ++-
3 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 97686fb46e30..1ecaeccf39c9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -369,7 +369,6 @@ enum split_type {
SPLIT_TYPE_NON_UNIFORM,
};
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
int folio_split_unmapped(struct folio *folio, unsigned int new_order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c1f1055165dd..6c821c1c0ac3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
}
}
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
-{
- int extra_pins;
-
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
-}
-
static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
{
for (; nr_pages; page++, nr_pages--)
@@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
return 0;
}
+/* Number of folio references from the pagecache or the swapcache. */
+static unsigned int folio_cache_references(const struct folio *folio)
+{
+ if (folio_test_anon(folio) && !folio_test_swapcache(folio))
+ return 0;
+ return folio_nr_pages(folio);
+}
+
static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
struct page *split_at, struct xa_state *xas,
struct address_space *mapping, bool do_lru,
struct list_head *list, enum split_type split_type,
- pgoff_t end, int *nr_shmem_dropped, int extra_pins)
+ pgoff_t end, int *nr_shmem_dropped)
{
struct folio *end_folio = folio_next(folio);
struct folio *new_folio, *next;
int old_order = folio_order(folio);
int ret = 0;
struct deferred_split *ds_queue;
+ int extra_pins = folio_cache_references(folio);
VM_WARN_ON_ONCE(!mapping && end);
/* Prevent deferred_split_scan() touching ->_refcount */
@@ -3956,7 +3948,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
struct folio *new_folio, *next;
int nr_shmem_dropped = 0;
int remap_flags = 0;
- int extra_pins, ret;
+ int ret;
pgoff_t end = 0;
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -4036,7 +4028,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
- if (!can_split_folio(folio, 1, &extra_pins)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
ret = -EAGAIN;
goto out_unlock;
}
@@ -4059,8 +4051,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
- true, list, split_type, end, &nr_shmem_dropped,
- extra_pins);
+ true, list, split_type, end, &nr_shmem_dropped);
fail:
if (mapping)
xas_unlock(&xas);
@@ -4134,20 +4125,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
*/
int folio_split_unmapped(struct folio *folio, unsigned int new_order)
{
- int extra_pins, ret = 0;
+ int ret = 0;
VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
- if (!can_split_folio(folio, 1, &extra_pins))
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
return -EAGAIN;
local_irq_disable();
ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
NULL, false, NULL, SPLIT_TYPE_UNIFORM,
- 0, NULL, extra_pins);
+ 0, NULL);
local_irq_enable();
return ret;
}
@@ -4640,7 +4631,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
* can be split or not. So skip the check here.
*/
if (!folio_test_private(folio) &&
- !can_split_folio(folio, 0, NULL))
+ folio_expected_ref_count(folio) != folio_ref_count(folio))
goto next;
if (!folio_trylock(folio))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 92980b072121..3b85652a42b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1284,7 +1284,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
goto keep_locked;
if (folio_test_large(folio)) {
/* cannot split folio, skip it */
- if (!can_split_folio(folio, 1, NULL))
+ if (folio_expected_ref_count(folio) !=
+ folio_ref_count(folio) - 1)
goto activate_locked;
/*
* Split partially mapped folios right away.
--
2.51.0
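For context, folio_expected_ref_count() is folio_mapcount() plus the cache
references (swapcache for anon folios, pagecache and PG_private otherwise),
so the open-coded comparison asks whether the only extra reference left is
the split caller's pin. A minimal sketch of what the check amounts to; the
helper name below is illustrative only and is not something the patch adds:

/*
 * Illustrative sketch, not part of the patch: the racy splittability
 * test used in __folio_split(), folio_split_unmapped() and
 * shrink_folio_list() after this change.
 */
static inline bool split_caller_holds_only_pin(const struct folio *folio)
{
	/*
	 * folio_expected_ref_count() accounts for mappings and the
	 * page/swap cache; the caller must hold exactly one more ref.
	 */
	return folio_expected_ref_count(folio) == folio_ref_count(folio) - 1;
}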
On 11/22/25 13:55, Zi Yan wrote:
> can_split_folio() is just a refcount comparison, making sure only the
> split caller holds an extra pin. Open code it with
> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> include/linux/huge_mm.h | 1 -
> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
> mm/vmscan.c | 3 ++-
> 3 files changed, 19 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 97686fb46e30..1ecaeccf39c9 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -369,7 +369,6 @@ enum split_type {
> SPLIT_TYPE_NON_UNIFORM,
> };
>
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> unsigned int new_order);
> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c1f1055165dd..6c821c1c0ac3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
> }
> }
>
> -/* Racy check whether the huge page can be split */
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
> -{
> - int extra_pins;
> -
> - /* Additional pins from page cache */
> - if (folio_test_anon(folio))
> - extra_pins = folio_test_swapcache(folio) ?
> - folio_nr_pages(folio) : 0;
> - else
> - extra_pins = folio_nr_pages(folio);
> - if (pextra_pins)
> - *pextra_pins = extra_pins;
> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
> - caller_pins;
> -}
> -
> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
> {
> for (; nr_pages; page++, nr_pages--)
> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
> return 0;
> }
>
> +/* Number of folio references from the pagecache or the swapcache. */
> +static unsigned int folio_cache_references(const struct folio *folio)
folio_cache_ref_count?
> +{
> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
> + return 0;
> + return folio_nr_pages(folio);
> +}
> +
Does this belong to include/linux/mm.h with the other helpers?
> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
> struct page *split_at, struct xa_state *xas,
> struct address_space *mapping, bool do_lru,
> struct list_head *list, enum split_type split_type,
> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
> + pgoff_t end, int *nr_shmem_dropped)
> {
> struct folio *end_folio = folio_next(folio);
> struct folio *new_folio, *next;
> int old_order = folio_order(folio);
> int ret = 0;
> struct deferred_split *ds_queue;
> + int extra_pins = folio_cache_references(folio);
>
> VM_WARN_ON_ONCE(!mapping && end);
> /* Prevent deferred_split_scan() touching ->_refcount */
> @@ -3956,7 +3948,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> struct folio *new_folio, *next;
> int nr_shmem_dropped = 0;
> int remap_flags = 0;
> - int extra_pins, ret;
> + int ret;
> pgoff_t end = 0;
>
> VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> @@ -4036,7 +4028,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> * Racy check if we can split the page, before unmap_folio() will
> * split PMDs
> */
> - if (!can_split_folio(folio, 1, &extra_pins)) {
> + if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
> ret = -EAGAIN;
> goto out_unlock;
> }
> @@ -4059,8 +4051,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> }
>
> ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
> - true, list, split_type, end, &nr_shmem_dropped,
> - extra_pins);
> + true, list, split_type, end, &nr_shmem_dropped);
> fail:
> if (mapping)
> xas_unlock(&xas);
> @@ -4134,20 +4125,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> */
> int folio_split_unmapped(struct folio *folio, unsigned int new_order)
> {
> - int extra_pins, ret = 0;
> + int ret = 0;
>
> VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
>
> - if (!can_split_folio(folio, 1, &extra_pins))
> + if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
> return -EAGAIN;
>
> local_irq_disable();
> ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
> NULL, false, NULL, SPLIT_TYPE_UNIFORM,
> - 0, NULL, extra_pins);
> + 0, NULL);
> local_irq_enable();
> return ret;
> }
> @@ -4640,7 +4631,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> * can be split or not. So skip the check here.
> */
> if (!folio_test_private(folio) &&
> - !can_split_folio(folio, 0, NULL))
> + folio_expected_ref_count(folio) != folio_ref_count(folio))
> goto next;
>
> if (!folio_trylock(folio))
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 92980b072121..3b85652a42b9 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1284,7 +1284,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> goto keep_locked;
> if (folio_test_large(folio)) {
> /* cannot split folio, skip it */
> - if (!can_split_folio(folio, 1, NULL))
> + if (folio_expected_ref_count(folio) !=
> + folio_ref_count(folio) - 1)
> goto activate_locked;
> /*
> * Split partially mapped folios right away.
Otherwise, LGTM
Acked-by: Balbir Singh <balbirs@nvidia.com>
On 11/24/25 23:14, Balbir Singh wrote:
> On 11/22/25 13:55, Zi Yan wrote:
>> can_split_folio() is just a refcount comparison, making sure only the
>> split caller holds an extra pin. Open code it with
>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>
>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>> include/linux/huge_mm.h | 1 -
>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>> mm/vmscan.c | 3 ++-
>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 97686fb46e30..1ecaeccf39c9 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -369,7 +369,6 @@ enum split_type {
>> SPLIT_TYPE_NON_UNIFORM,
>> };
>>
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> unsigned int new_order);
>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c1f1055165dd..6c821c1c0ac3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>> }
>> }
>>
>> -/* Racy check whether the huge page can be split */
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>> -{
>> - int extra_pins;
>> -
>> - /* Additional pins from page cache */
>> - if (folio_test_anon(folio))
>> - extra_pins = folio_test_swapcache(folio) ?
>> - folio_nr_pages(folio) : 0;
>> - else
>> - extra_pins = folio_nr_pages(folio);
>> - if (pextra_pins)
>> - *pextra_pins = extra_pins;
>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>> - caller_pins;
>> -}
>> -
>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>> {
>> for (; nr_pages; page++, nr_pages--)
>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>> return 0;
>> }
>>
>> +/* Number of folio references from the pagecache or the swapcache. */
>> +static unsigned int folio_cache_references(const struct folio *folio)
>
> folio_cache_ref_count?
Yes, makes sense.
>
>> +{
>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>> + return 0;
>> + return folio_nr_pages(folio);
>> +}
>> +
>
> Does this belong to include/linux/mm.h with the other helpers?
Not for now I think, in particular, as we require earlier
!folio->mapping checks to give a correct answer. Most people should be
using folio_expected_ref_count().
--
Cheers
David
On 25 Nov 2025, at 3:55, David Hildenbrand (Red Hat) wrote:
> On 11/24/25 23:14, Balbir Singh wrote:
>> On 11/22/25 13:55, Zi Yan wrote:
>>> can_split_folio() is just a refcount comparison, making sure only the
>>> split caller holds an extra pin. Open code it with
>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>
>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>> include/linux/huge_mm.h | 1 -
>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>> mm/vmscan.c | 3 ++-
>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 97686fb46e30..1ecaeccf39c9 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -369,7 +369,6 @@ enum split_type {
>>> SPLIT_TYPE_NON_UNIFORM,
>>> };
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>> unsigned int new_order);
>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index c1f1055165dd..6c821c1c0ac3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>> }
>>> }
>>> -/* Racy check whether the huge page can be split */
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>> -{
>>> - int extra_pins;
>>> -
>>> - /* Additional pins from page cache */
>>> - if (folio_test_anon(folio))
>>> - extra_pins = folio_test_swapcache(folio) ?
>>> - folio_nr_pages(folio) : 0;
>>> - else
>>> - extra_pins = folio_nr_pages(folio);
>>> - if (pextra_pins)
>>> - *pextra_pins = extra_pins;
>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>> - caller_pins;
>>> -}
>>> -
>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>> {
>>> for (; nr_pages; page++, nr_pages--)
>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>> return 0;
>>> }
>>> +/* Number of folio references from the pagecache or the swapcache. */
>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>
>> folio_cache_ref_count?
>
> Yes, makes sense.
>
>>
>>> +{
>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>> + return 0;
>>> + return folio_nr_pages(folio);
>>> +}
>>> +
>>
>> Does this belong to include/linux/mm.h with the other helpers?
>
> Not for now I think, in particular, as we require earlier !folio->mapping checks to give a correct answer. Most people should be using folio_expected_ref_count().
>
Got it. Will use folio_cache_ref_count() in the next version.
Best Regards,
Yan, Zi
On 11/22/25 03:55, Zi Yan wrote:
> can_split_folio() is just a refcount comparison, making sure only the
> split caller holds an extra pin. Open code it with
> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> include/linux/huge_mm.h | 1 -
> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
> mm/vmscan.c | 3 ++-
> 3 files changed, 19 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 97686fb46e30..1ecaeccf39c9 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -369,7 +369,6 @@ enum split_type {
> SPLIT_TYPE_NON_UNIFORM,
> };
>
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> unsigned int new_order);
> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c1f1055165dd..6c821c1c0ac3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
> }
> }
>
> -/* Racy check whether the huge page can be split */
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
> -{
> - int extra_pins;
> -
> - /* Additional pins from page cache */
> - if (folio_test_anon(folio))
> - extra_pins = folio_test_swapcache(folio) ?
> - folio_nr_pages(folio) : 0;
> - else
> - extra_pins = folio_nr_pages(folio);
> - if (pextra_pins)
> - *pextra_pins = extra_pins;
> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
> - caller_pins;
> -}
> -
> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
> {
> for (; nr_pages; page++, nr_pages--)
> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
> return 0;
> }
>
> +/* Number of folio references from the pagecache or the swapcache. */
> +static unsigned int folio_cache_references(const struct folio *folio)
> +{
> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
> + return 0;
> + return folio_nr_pages(folio);
> +}
> +
> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
> struct page *split_at, struct xa_state *xas,
> struct address_space *mapping, bool do_lru,
> struct list_head *list, enum split_type split_type,
> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
> + pgoff_t end, int *nr_shmem_dropped)
> {
> struct folio *end_folio = folio_next(folio);
> struct folio *new_folio, *next;
> int old_order = folio_order(folio);
> int ret = 0;
> struct deferred_split *ds_queue;
> + int extra_pins = folio_cache_references(folio);
Can we just inline the call to folio_cache_references() and get rid of
extra_pins (which is a bad name either way)?
if (folio_ref_freeze(folio, folio_cache_references(folio) + 1)) {
BTW, now that we have this helper, I wonder if we should then also do the
following for clarification on the unfreeze path:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0acdc2f26ee0c..7cbcf61b7971d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
zone_device_private_split_cb(folio, new_folio);
- expected_refs = folio_expected_ref_count(new_folio) + 1;
- folio_ref_unfreeze(new_folio, expected_refs);
+ folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
if (do_lru)
lru_add_split_folio(folio, new_folio, lruvec, list);
@@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
* Otherwise, a parallel folio_try_get() can grab @folio
* and its caller can see stale page cache entries.
*/
- expected_refs = folio_expected_ref_count(folio) + 1;
- folio_ref_unfreeze(folio, expected_refs);
+ folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
if (do_lru)
unlock_page_lruvec(lruvec);
--
Cheers
David
On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
> On 11/22/25 03:55, Zi Yan wrote:
>> can_split_folio() is just a refcount comparison, making sure only the
>> split caller holds an extra pin. Open code it with
>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>
>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>> include/linux/huge_mm.h | 1 -
>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>> mm/vmscan.c | 3 ++-
>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 97686fb46e30..1ecaeccf39c9 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -369,7 +369,6 @@ enum split_type {
>> SPLIT_TYPE_NON_UNIFORM,
>> };
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> unsigned int new_order);
>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c1f1055165dd..6c821c1c0ac3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>> }
>> }
>> -/* Racy check whether the huge page can be split */
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>> -{
>> - int extra_pins;
>> -
>> - /* Additional pins from page cache */
>> - if (folio_test_anon(folio))
>> - extra_pins = folio_test_swapcache(folio) ?
>> - folio_nr_pages(folio) : 0;
>> - else
>> - extra_pins = folio_nr_pages(folio);
>> - if (pextra_pins)
>> - *pextra_pins = extra_pins;
>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>> - caller_pins;
>> -}
>> -
>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>> {
>> for (; nr_pages; page++, nr_pages--)
>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>> return 0;
>> }
>> +/* Number of folio references from the pagecache or the swapcache. */
>> +static unsigned int folio_cache_references(const struct folio *folio)
>> +{
>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>> + return 0;
>> + return folio_nr_pages(folio);
>> +}
>> +
>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>> struct page *split_at, struct xa_state *xas,
>> struct address_space *mapping, bool do_lru,
>> struct list_head *list, enum split_type split_type,
>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>> + pgoff_t end, int *nr_shmem_dropped)
>> {
>> struct folio *end_folio = folio_next(folio);
>> struct folio *new_folio, *next;
>> int old_order = folio_order(folio);
>> int ret = 0;
>> struct deferred_split *ds_queue;
>> + int extra_pins = folio_cache_references(folio);
>
> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
> (which is a bad name either way)
>
>
> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>
>
> BTW, now that we have this helper, I wonder if we should then also do for
> clarification on the unfreeze path:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0acdc2f26ee0c..7cbcf61b7971d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> zone_device_private_split_cb(folio, new_folio);
> - expected_refs = folio_expected_ref_count(new_folio) + 1;
> - folio_ref_unfreeze(new_folio, expected_refs);
> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
> if (do_lru)
> lru_add_split_folio(folio, new_folio, lruvec, list);
> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> * Otherwise, a parallel folio_try_get() can grab @folio
> * and its caller can see stale page cache entries.
> */
> - expected_refs = folio_expected_ref_count(folio) + 1;
> - folio_ref_unfreeze(folio, expected_refs);
> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
> if (do_lru)
> unlock_page_lruvec(lruvec);
>
>
Both make sense to me. Will make the change.
By comparing folio_cache_references() with folio_expected_ref_count(),
one difference is that folio_expected_ref_count() does not give the right
refcount for shmem folios in the swapcache.
This is the folio_expected_ref_count() code:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
Shmem in swapcache means !folio_test_anon(folio) && folio_test_swapcache(folio).
The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
It should not cause any issue, since IIUC shmem in swapcache only happens
while the folio holds an additional ref, so
folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
not supported yet, so folio_expected_ref_count() in the split code does not
affect shmem in swapcache. But folio_expected_ref_count() should be
fixed, right?
Like:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from shmem in the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
or simplified into
if (!folio_test_anon(folio)) {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
/* One reference per page from the swapcache (anon or shmem). */
ref_count += folio_test_swapcache(folio) << order;
?
Best Regards,
Yan, Zi
On 11/24/25 18:05, Zi Yan wrote:
> On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
>
>> On 11/22/25 03:55, Zi Yan wrote:
>>> can_split_folio() is just a refcount comparison, making sure only the
>>> split caller holds an extra pin. Open code it with
>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>
>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>> include/linux/huge_mm.h | 1 -
>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>> mm/vmscan.c | 3 ++-
>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 97686fb46e30..1ecaeccf39c9 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -369,7 +369,6 @@ enum split_type {
>>> SPLIT_TYPE_NON_UNIFORM,
>>> };
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>> unsigned int new_order);
>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index c1f1055165dd..6c821c1c0ac3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>> }
>>> }
>>> -/* Racy check whether the huge page can be split */
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>> -{
>>> - int extra_pins;
>>> -
>>> - /* Additional pins from page cache */
>>> - if (folio_test_anon(folio))
>>> - extra_pins = folio_test_swapcache(folio) ?
>>> - folio_nr_pages(folio) : 0;
>>> - else
>>> - extra_pins = folio_nr_pages(folio);
>>> - if (pextra_pins)
>>> - *pextra_pins = extra_pins;
>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>> - caller_pins;
>>> -}
>>> -
>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>> {
>>> for (; nr_pages; page++, nr_pages--)
>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>> return 0;
>>> }
>>> +/* Number of folio references from the pagecache or the swapcache. */
>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>> +{
>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>> + return 0;
>>> + return folio_nr_pages(folio);
>>> +}
>>> +
>>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>>> struct page *split_at, struct xa_state *xas,
>>> struct address_space *mapping, bool do_lru,
>>> struct list_head *list, enum split_type split_type,
>>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>>> + pgoff_t end, int *nr_shmem_dropped)
>>> {
>>> struct folio *end_folio = folio_next(folio);
>>> struct folio *new_folio, *next;
>>> int old_order = folio_order(folio);
>>> int ret = 0;
>>> struct deferred_split *ds_queue;
>>> + int extra_pins = folio_cache_references(folio);
>>
>> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
>> (which is a bad name either way)
>>
>>
>> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>>
>>
>> BTW, now that we have this helper, I wonder if we should then also do for
>> clarification on the unfreeze path:
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 0acdc2f26ee0c..7cbcf61b7971d 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>> zone_device_private_split_cb(folio, new_folio);
>> - expected_refs = folio_expected_ref_count(new_folio) + 1;
>> - folio_ref_unfreeze(new_folio, expected_refs);
>> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
>> if (do_lru)
>> lru_add_split_folio(folio, new_folio, lruvec, list);
>> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>> * Otherwise, a parallel folio_try_get() can grab @folio
>> * and its caller can see stale page cache entries.
>> */
>> - expected_refs = folio_expected_ref_count(folio) + 1;
>> - folio_ref_unfreeze(folio, expected_refs);
>> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
>> if (do_lru)
>> unlock_page_lruvec(lruvec);
>>
>>
>
> Both make sense to me. Will make the change.
>
> By comparing folio_cache_references() with folio_expected_ref_count(),
> one difference is that folio_expected_ref_count() does not give right
> refcount for shmem in swapcache.
Good point. Likely nobody runs into that right now because nobody can
really do anything with these folios before they are re-added to the
pagecache or mapped into page tables.
>
> This is the folio_expected_ref_count() code:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
> shmem in swapcache mean !folio_test_anon(folio) && folio_test_swapcache(folio).
See below, it's actually
folio_test_anon(folio) && folio_test_swapbacked(folio) && folio_test_swapcache(folio)
I think ...
> The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
> It should not cause any issue, since IIUC shmem in swapcache happens
> when the folio has an additional ref,
> folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
> not supported yet,
Right.
> so folio_expected_ref_count() in split code does not
> affect shmem in swapcache. But folio_expected_ref_count() should be
> fixed, right?
We should better handle it, agreed.
Staring at the history of folio_expected_ref_count() once again: back
when we had folio_expected_refs() in the migration code, we didn't seem
to handle it, I think.
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
gup.c doesn't care, because the pages are still mapped.
khugepaged.c similarly.
memfd.c doesn't care because the pages are still in the pagecache.
So I suspect nothing is broken, but the migration case needs a second look.
>
> Like:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from shmem in the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
> or simplified into
>
> if (!folio_test_anon(folio)) {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
> /* One reference per page from the swapcache (anon or shmem). */
> ref_count += folio_test_swapcache(folio) << order;
> ?
That is incorrect I think due to swapcache being able to give false
positives (PG_owner_priv_1).
--
Cheers
David
On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
> On 11/24/25 18:05, Zi Yan wrote:
>> On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
>>
>>> On 11/22/25 03:55, Zi Yan wrote:
>>>> can_split_folio() is just a refcount comparison, making sure only the
>>>> split caller holds an extra pin. Open code it with
>>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>>
>>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>> ---
>>>> include/linux/huge_mm.h | 1 -
>>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>>> mm/vmscan.c | 3 ++-
>>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 97686fb46e30..1ecaeccf39c9 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -369,7 +369,6 @@ enum split_type {
>>>> SPLIT_TYPE_NON_UNIFORM,
>>>> };
>>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>> unsigned int new_order);
>>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index c1f1055165dd..6c821c1c0ac3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>>> }
>>>> }
>>>> -/* Racy check whether the huge page can be split */
>>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>> -{
>>>> - int extra_pins;
>>>> -
>>>> - /* Additional pins from page cache */
>>>> - if (folio_test_anon(folio))
>>>> - extra_pins = folio_test_swapcache(folio) ?
>>>> - folio_nr_pages(folio) : 0;
>>>> - else
>>>> - extra_pins = folio_nr_pages(folio);
>>>> - if (pextra_pins)
>>>> - *pextra_pins = extra_pins;
>>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>>> - caller_pins;
>>>> -}
>>>> -
>>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>>> {
>>>> for (; nr_pages; page++, nr_pages--)
>>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>> return 0;
>>>> }
>>>> +/* Number of folio references from the pagecache or the swapcache. */
>>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>>> +{
>>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>>> + return 0;
>>>> + return folio_nr_pages(folio);
>>>> +}
>>>> +
>>>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>>>> struct page *split_at, struct xa_state *xas,
>>>> struct address_space *mapping, bool do_lru,
>>>> struct list_head *list, enum split_type split_type,
>>>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>>>> + pgoff_t end, int *nr_shmem_dropped)
>>>> {
>>>> struct folio *end_folio = folio_next(folio);
>>>> struct folio *new_folio, *next;
>>>> int old_order = folio_order(folio);
>>>> int ret = 0;
>>>> struct deferred_split *ds_queue;
>>>> + int extra_pins = folio_cache_references(folio);
>>>
>>> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
>>> (which is a bad name either way)
>>>
>>>
>>> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>>>
>>>
>>> BTW, now that we have this helper, I wonder if we should then also do for
>>> clarification on the unfreeze path:
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 0acdc2f26ee0c..7cbcf61b7971d 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>>> zone_device_private_split_cb(folio, new_folio);
>>> - expected_refs = folio_expected_ref_count(new_folio) + 1;
>>> - folio_ref_unfreeze(new_folio, expected_refs);
>>> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
>>> if (do_lru)
>>> lru_add_split_folio(folio, new_folio, lruvec, list);
>>> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>>> * Otherwise, a parallel folio_try_get() can grab @folio
>>> * and its caller can see stale page cache entries.
>>> */
>>> - expected_refs = folio_expected_ref_count(folio) + 1;
>>> - folio_ref_unfreeze(folio, expected_refs);
>>> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
>>> if (do_lru)
>>> unlock_page_lruvec(lruvec);
>>>
>>>
>>
>> Both make sense to me. Will make the change.
>>
>> By comparing folio_cache_references() with folio_expected_ref_count(),
>> one difference is that folio_expected_ref_count() does not give right
>> refcount for shmem in swapcache.
>
> Good point. Likely nobody runs into that right now because nobody can really does anything with these folios before they were re-added to the pagecache or mapped into page tables.
>
>>
>> This is the folio_expected_ref_count() code:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>>
>> shmem in swapcache mean !folio_test_anon(folio) && folio_test_swapcache(folio).
>
> See below, it's actually
>
> folio_test_anon(folio) && folio_test_swapbacked(folio)&& folio_test_swapcache(folio)
!folio_test_anon(folio) && folio_test_swapbacked(folio) && folio_test_swapcache(folio)
Right?
>
> I think ...
>
>> The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
>> It should not cause any issue, since IIUC shmem in swapcache happens
>> when the folio has an additional ref,
>> folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
>> not supported yet,
>
> Right.
>
>> so folio_expected_ref_count() in split code does not
>> affect shmem in swapcache. But folio_expected_ref_count() should be
>> fixed, right?
>
> We should better handle it, agreed.
>
> Staring at the history of folio_expected_ref_count() once again, back when we had folio_expected_refs() in migration code we didn't seem to handle it I think.
>
> -static int folio_expected_refs(struct address_space *mapping,
> - struct folio *folio)
> -{
> - int refs = 1;
> - if (!mapping)
> - return refs;
> -
> - refs += folio_nr_pages(folio);
> - if (folio_test_private(folio))
> - refs++;
> -
> - return refs;
> -}
>
>
> gup.c doesn't care, because the pages are still mapped.
>
> khugepaged.c similarly.
>
> memfd.c doesn't care because the pages are still in the pagecache.
>
> So I suspect nothing is broken, but the migration case needs a second look.
For migration, shmem in swapcache happens in shmem_writeout(), where an
additional ref is placed on the folio. The migration caller also places
a ref on the folio before migrating it. The folio therefore has 2 refs,
which is not equal to folio_expected_ref_count() (returning 0) + 1,
nor to folio_expected_refs() (returning 1).
So it is safe.
>
>>
>> Like:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from shmem in the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>>
>> or simplified into
>>
>> if (!folio_test_anon(folio)) {
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>> /* One reference per page from the swapcache (anon or shmem). */
>> ref_count += folio_test_swapcache(folio) << order;
>> ?
>
> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
Got it. So it should be:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from shmem in the swapcache. */
ref_count += (folio_test_swapbacked(folio) &&
folio_test_swapcache(folio)) << order;
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
I wonder if we should have folio_test_shmem_in_swapcache() instead.
BTW, this page flag reuse is really confusing. I see PG_checked is
PG_owner_priv_1 too, and __folio_migrate_mapping() uses folio_test_swapcache()
to decide the number of i_pages entries. Wouldn't that cause an issue?
ext4 does not release_folio() for migration when PG_checked is set, and
ubifs clears PG_checked in release_folio(). I have not checked all the other
filesystems yet. Maybe later.
Best Regards,
Yan, Zi
On 2025/11/25 5:08, Zi Yan wrote:
> On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
>
<snip>
>
> BTW, this page flag reuse is really confusing. I see PG_checked is
> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
> to decide the number of i_pages entries. Wouldn’t that cause any issue?
> ext4 does not release_folio() for migration when PG_checked is set,
> ubifs clears PG_checked in release_folio(). I have not checked all other FS
> yet. Maybe later.
folio_test_swapbacked() is also checked in folio_test_swapcache:
static __always_inline bool folio_test_swapcache(struct folio *folio)
{
return folio_test_swapbacked(folio) &&
test_bit(PG_swapcache, folio_flags(folio, 0));
}
So IMHO the reuse of this page flag should work fine.
Thanks.
On 11/25/25 10:10, Miaohe Lin wrote:
> On 2025/11/25 5:08, Zi Yan wrote:
>> On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
>>
>
> <snip>
>
>>
>> BTW, this page flag reuse is really confusing. I see PG_checked is
>> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
>> to decide the number of i_pages entries. Wouldn’t that cause any issue?
>> ext4 does not release_folio() for migration when PG_checked is set,
>> ubifs clears PG_checked in release_folio(). I have not checked all other FS
>> yet. Maybe later.
>
> folio_test_swapbacked() is also checked in folio_test_swapcache:
>
> static __always_inline bool folio_test_swapcache(struct folio *folio)
> {
> return folio_test_swapbacked(folio) &&
> test_bit(PG_swapcache, folio_flags(folio, 0));
> }
>
> So IMHO the reuse of this page flag should work fine.
Ahh, thanks for pointing that out. Confusing, as usually the
folio_test_*() helpers are straight bit tests.
All good then :)
--
Cheers
David
>>
>>>
>>> Like:
>>>
>>> if (folio_test_anon(folio)) {
>>> /* One reference per page from the swapcache. */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> } else {
>>> /* One reference per page from shmem in the swapcache. */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> /* One reference per page from the pagecache. */
>>> ref_count += !!folio->mapping << order;
>>> /* One reference from PG_private. */
>>> ref_count += folio_test_private(folio);
>>> }
>>>
>>> or simplified into
>>>
>>> if (!folio_test_anon(folio)) {
>>> /* One reference per page from the pagecache. */
>>> ref_count += !!folio->mapping << order;
>>> /* One reference from PG_private. */
>>> ref_count += folio_test_private(folio);
>>> }
>>> /* One reference per page from the swapcache (anon or shmem). */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> ?
>>
>> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
>
> Got it. So it should be:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from shmem in the swapcache. */
> ref_count += (folio_test_swapbacked (folio) &&
> folio_test_swapcache(folio)) << order;
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
Interestingly, I think we would then also take proper care of anon folios in the
swapcache that are not anon yet. See __read_swap_cache_async().
I wonder if we can clean that up a bit, to highlight that PG_private etc
do not apply.
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else if (folio_test_swapbacked(folio) && folio_test_swapcache(folio)) {
/* to-be-anon or shmem folio in the swapcache (!folio->mapping) */
ref_count += 1ul << order;
VM_WARN_ON_ONCE(folio->mapping);
} else {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
Or maybe simply:
if (folio_test_swapbacked(folio) && folio_test_swapcache(folio)) {
/*
* (to-be) anon or shmem (!folio->mapping) folio in the swapcache:
* One reference per page from the swapcache.
*/
ref_count += 1 << order;
VM_WARN_ON_ONCE(!folio_test_anon(folio) && folio->mapping);
} else if (!folio_test_anon(folio)) {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
>
> I wonder if we should have folio_test_shmem_in_swapcache() instead.
Interestingly, thinking about it, I think it would also match to-be-anon
folios and anon folios.
folio_in_swapcache() maybe?
>
> BTW, this page flag reuse is really confusing.
Yes ...
> I see PG_checked is
> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
> to decide the number of i_pages entries. Wouldn’t that cause any issue?
Maybe at that point all false positives were ruled out?
It is horrible TBH.
--
Cheers
David
On 25 Nov 2025, at 3:52, David Hildenbrand (Red Hat) wrote:
>>>
>>>>
>>>> Like:
>>>>
>>>> if (folio_test_anon(folio)) {
>>>> /* One reference per page from the swapcache. */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> } else {
>>>> /* One reference per page from shmem in the swapcache. */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> /* One reference per page from the pagecache. */
>>>> ref_count += !!folio->mapping << order;
>>>> /* One reference from PG_private. */
>>>> ref_count += folio_test_private(folio);
>>>> }
>>>>
>>>> or simplified into
>>>>
>>>> if (!folio_test_anon(folio)) {
>>>> /* One reference per page from the pagecache. */
>>>> ref_count += !!folio->mapping << order;
>>>> /* One reference from PG_private. */
>>>> ref_count += folio_test_private(folio);
>>>> }
>>>> /* One reference per page from the swapcache (anon or shmem). */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> ?
>>>
>>> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
>>
>> Got it. So it should be:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from shmem in the swapcache. */
>> ref_count += (folio_test_swapbacked (folio) &&
>> folio_test_swapcache(folio)) << order;
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>
> Interestingly, I think we would then also take proper care of anon folios in the
> swapcache that are not anon yet. See __read_swap_cache_async().
Right. After add_to_swap_cache() in __read_swap_cache_async(), the folio
is in the same state as a shmem folio in the swapcache.
>
> I wonder if we can clean that up a bit, to highlight that PG_private etc
> do not apply.
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else if (folio_test_swapbacked (folio) && folio_test_swapcache(folio)) {
> /* to-be-anon or shmem folio in the swapcache (!folio->mapping) */
> ref_count += 1ul << order;
> VM_WAN_ON_ONCE(folio->mapping);
> } else {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
I like this better and will send a patch for folio_expected_ref_count()
separately. Since folio_test_swapcache(folio) implies
folio_test_swapbacked(folio), as Maolin pointed out in another email,
I will get rid of folio_test_swapbacked(folio) in the above code.
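Concretely, that would leave something like this (just a sketch, the
actual follow-up patch may differ):

	if (folio_test_anon(folio)) {
		/* One reference per page from the swapcache. */
		ref_count += folio_test_swapcache(folio) << order;
	} else if (folio_test_swapcache(folio)) {
		/* to-be-anon or shmem folio in the swapcache (!folio->mapping). */
		ref_count += 1ul << order;
		VM_WARN_ON_ONCE(folio->mapping);
	} else {
		/* One reference per page from the pagecache. */
		ref_count += !!folio->mapping << order;
		/* One reference from PG_private. */
		ref_count += folio_test_private(folio);
	}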
>
> Or maybe simply:
>
>
> if (folio_test_swapbacked (folio) && folio_test_swapcache(folio)) {
> /*
> * (to-be) anon or shmem (!folio->mapping) folio in the swapcache:
> * One reference per page from the swapcache.
> */
> ref_count += 1 << order;
> VM_WAN_ON_ONCE(!folio_test_anon(folio) && folio->mapping);
> } else if (!folio_test_anon(folio)) {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
>>
>> I wonder if we should have folio_test_shmem_in_swapcache() instead.
>
> Interestingly, thinking about it, I think it would also match to-be anon folios
> and anon folios.
>
> folio_in_swapcache() maybe ?
Yes, will come up with a patch for it and send along with
folio_expected_ref_count() patch.
>
>>
>> BTW, this page flag reuse is really confusing.
>
> Yes ...
>
>> I see PG_checked is
>> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
>> to decide the number of i_pages entries. Wouldn’t that cause any issue?
>
> Maybe at that point all false positives were ruled out?
>
> It is horrible TBH.
>
> --
> Cheers
>
> David
Best Regards,
Yan, Zi
On Fri, Nov 21, 2025 at 09:55:27PM -0500, Zi Yan wrote:
>can_split_folio() is just a refcount comparison, making sure only the
>split caller holds an extra pin. Open code it with
>folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
>Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>Signed-off-by: Zi Yan <ziy@nvidia.com>
LGTM, Thanks
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me