can_split_folio() is just a refcount comparison, making sure only the
split caller holds an extra pin. Open code it with
folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
used by folio_ref_freeze(), add folio_cache_references() to calculate it.
Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
Signed-off-by: Zi Yan <ziy@nvidia.com>
---
include/linux/huge_mm.h | 1 -
mm/huge_memory.c | 43 ++++++++++++++++-------------------------
mm/vmscan.c | 3 ++-
3 files changed, 19 insertions(+), 28 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 97686fb46e30..1ecaeccf39c9 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -369,7 +369,6 @@ enum split_type {
SPLIT_TYPE_NON_UNIFORM,
};
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
unsigned int new_order);
int folio_split_unmapped(struct folio *folio, unsigned int new_order);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c1f1055165dd..6c821c1c0ac3 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
}
}
-/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
-{
- int extra_pins;
-
- /* Additional pins from page cache */
- if (folio_test_anon(folio))
- extra_pins = folio_test_swapcache(folio) ?
- folio_nr_pages(folio) : 0;
- else
- extra_pins = folio_nr_pages(folio);
- if (pextra_pins)
- *pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
- caller_pins;
-}
-
static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
{
for (; nr_pages; page++, nr_pages--)
@@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
return 0;
}
+/* Number of folio references from the pagecache or the swapcache. */
+static unsigned int folio_cache_references(const struct folio *folio)
+{
+ if (folio_test_anon(folio) && !folio_test_swapcache(folio))
+ return 0;
+ return folio_nr_pages(folio);
+}
+
static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
struct page *split_at, struct xa_state *xas,
struct address_space *mapping, bool do_lru,
struct list_head *list, enum split_type split_type,
- pgoff_t end, int *nr_shmem_dropped, int extra_pins)
+ pgoff_t end, int *nr_shmem_dropped)
{
struct folio *end_folio = folio_next(folio);
struct folio *new_folio, *next;
int old_order = folio_order(folio);
int ret = 0;
struct deferred_split *ds_queue;
+ int extra_pins = folio_cache_references(folio);
VM_WARN_ON_ONCE(!mapping && end);
/* Prevent deferred_split_scan() touching ->_refcount */
@@ -3956,7 +3948,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
struct folio *new_folio, *next;
int nr_shmem_dropped = 0;
int remap_flags = 0;
- int extra_pins, ret;
+ int ret;
pgoff_t end = 0;
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
@@ -4036,7 +4028,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
* Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
- if (!can_split_folio(folio, 1, &extra_pins)) {
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
ret = -EAGAIN;
goto out_unlock;
}
@@ -4059,8 +4051,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
}
ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
- true, list, split_type, end, &nr_shmem_dropped,
- extra_pins);
+ true, list, split_type, end, &nr_shmem_dropped);
fail:
if (mapping)
xas_unlock(&xas);
@@ -4134,20 +4125,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
*/
int folio_split_unmapped(struct folio *folio, unsigned int new_order)
{
- int extra_pins, ret = 0;
+ int ret = 0;
VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
- if (!can_split_folio(folio, 1, &extra_pins))
+ if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
return -EAGAIN;
local_irq_disable();
ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
NULL, false, NULL, SPLIT_TYPE_UNIFORM,
- 0, NULL, extra_pins);
+ 0, NULL);
local_irq_enable();
return ret;
}
@@ -4640,7 +4631,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
* can be split or not. So skip the check here.
*/
if (!folio_test_private(folio) &&
- !can_split_folio(folio, 0, NULL))
+ folio_expected_ref_count(folio) != folio_ref_count(folio))
goto next;
if (!folio_trylock(folio))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 92980b072121..3b85652a42b9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1284,7 +1284,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
goto keep_locked;
if (folio_test_large(folio)) {
/* cannot split folio, skip it */
- if (!can_split_folio(folio, 1, NULL))
+ if (folio_expected_ref_count(folio) !=
+ folio_ref_count(folio) - 1)
goto activate_locked;
/*
* Split partially mapped folios right away.
--
2.51.0
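For context, folio_expected_ref_count() is folio_mapcount() plus the cache
references (swapcache for anon folios, pagecache and PG_private otherwise),
so the open-coded comparison asks whether the only extra reference left is
the split caller's pin. A minimal sketch of what the check amounts to; the
helper name below is illustrative only and is not something the patch adds:

/*
 * Illustrative sketch, not part of the patch: the racy splittability
 * test used in __folio_split(), folio_split_unmapped() and
 * shrink_folio_list() after this change.
 */
static inline bool split_caller_holds_only_pin(const struct folio *folio)
{
	/*
	 * folio_expected_ref_count() accounts for mappings and the
	 * page/swap cache; the caller must hold exactly one more ref.
	 */
	return folio_expected_ref_count(folio) == folio_ref_count(folio) - 1;
}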
On 11/22/25 13:55, Zi Yan wrote:
> can_split_folio() is just a refcount comparison, making sure only the
> split caller holds an extra pin. Open code it with
> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> include/linux/huge_mm.h | 1 -
> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
> mm/vmscan.c | 3 ++-
> 3 files changed, 19 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 97686fb46e30..1ecaeccf39c9 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -369,7 +369,6 @@ enum split_type {
> SPLIT_TYPE_NON_UNIFORM,
> };
>
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> unsigned int new_order);
> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c1f1055165dd..6c821c1c0ac3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
> }
> }
>
> -/* Racy check whether the huge page can be split */
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
> -{
> - int extra_pins;
> -
> - /* Additional pins from page cache */
> - if (folio_test_anon(folio))
> - extra_pins = folio_test_swapcache(folio) ?
> - folio_nr_pages(folio) : 0;
> - else
> - extra_pins = folio_nr_pages(folio);
> - if (pextra_pins)
> - *pextra_pins = extra_pins;
> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
> - caller_pins;
> -}
> -
> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
> {
> for (; nr_pages; page++, nr_pages--)
> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
> return 0;
> }
>
> +/* Number of folio references from the pagecache or the swapcache. */
> +static unsigned int folio_cache_references(const struct folio *folio)
folio_cache_ref_count?
> +{
> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
> + return 0;
> + return folio_nr_pages(folio);
> +}
> +
Does this belong to include/linux/mm.h with the other helpers?
> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
> struct page *split_at, struct xa_state *xas,
> struct address_space *mapping, bool do_lru,
> struct list_head *list, enum split_type split_type,
> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
> + pgoff_t end, int *nr_shmem_dropped)
> {
> struct folio *end_folio = folio_next(folio);
> struct folio *new_folio, *next;
> int old_order = folio_order(folio);
> int ret = 0;
> struct deferred_split *ds_queue;
> + int extra_pins = folio_cache_references(folio);
>
> VM_WARN_ON_ONCE(!mapping && end);
> /* Prevent deferred_split_scan() touching ->_refcount */
> @@ -3956,7 +3948,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> struct folio *new_folio, *next;
> int nr_shmem_dropped = 0;
> int remap_flags = 0;
> - int extra_pins, ret;
> + int ret;
> pgoff_t end = 0;
>
> VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> @@ -4036,7 +4028,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> * Racy check if we can split the page, before unmap_folio() will
> * split PMDs
> */
> - if (!can_split_folio(folio, 1, &extra_pins)) {
> + if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1) {
> ret = -EAGAIN;
> goto out_unlock;
> }
> @@ -4059,8 +4051,7 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> }
>
> ret = __folio_freeze_and_split_unmapped(folio, new_order, split_at, &xas, mapping,
> - true, list, split_type, end, &nr_shmem_dropped,
> - extra_pins);
> + true, list, split_type, end, &nr_shmem_dropped);
> fail:
> if (mapping)
> xas_unlock(&xas);
> @@ -4134,20 +4125,20 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> */
> int folio_split_unmapped(struct folio *folio, unsigned int new_order)
> {
> - int extra_pins, ret = 0;
> + int ret = 0;
>
> VM_WARN_ON_ONCE_FOLIO(folio_mapped(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_locked(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_large(folio), folio);
> VM_WARN_ON_ONCE_FOLIO(!folio_test_anon(folio), folio);
>
> - if (!can_split_folio(folio, 1, &extra_pins))
> + if (folio_expected_ref_count(folio) != folio_ref_count(folio) - 1)
> return -EAGAIN;
>
> local_irq_disable();
> ret = __folio_freeze_and_split_unmapped(folio, new_order, &folio->page, NULL,
> NULL, false, NULL, SPLIT_TYPE_UNIFORM,
> - 0, NULL, extra_pins);
> + 0, NULL);
> local_irq_enable();
> return ret;
> }
> @@ -4640,7 +4631,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
> * can be split or not. So skip the check here.
> */
> if (!folio_test_private(folio) &&
> - !can_split_folio(folio, 0, NULL))
> + folio_expected_ref_count(folio) != folio_ref_count(folio))
> goto next;
>
> if (!folio_trylock(folio))
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index 92980b072121..3b85652a42b9 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1284,7 +1284,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list,
> goto keep_locked;
> if (folio_test_large(folio)) {
> /* cannot split folio, skip it */
> - if (!can_split_folio(folio, 1, NULL))
> + if (folio_expected_ref_count(folio) !=
> + folio_ref_count(folio) - 1)
> goto activate_locked;
> /*
> * Split partially mapped folios right away.
Otherwise, LGTM
Acked-by: Balbir Singh <balbirs@nvidia.com>
On 11/24/25 23:14, Balbir Singh wrote:
> On 11/22/25 13:55, Zi Yan wrote:
>> can_split_folio() is just a refcount comparison, making sure only the
>> split caller holds an extra pin. Open code it with
>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>
>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>> include/linux/huge_mm.h | 1 -
>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>> mm/vmscan.c | 3 ++-
>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 97686fb46e30..1ecaeccf39c9 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -369,7 +369,6 @@ enum split_type {
>> SPLIT_TYPE_NON_UNIFORM,
>> };
>>
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> unsigned int new_order);
>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c1f1055165dd..6c821c1c0ac3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>> }
>> }
>>
>> -/* Racy check whether the huge page can be split */
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>> -{
>> - int extra_pins;
>> -
>> - /* Additional pins from page cache */
>> - if (folio_test_anon(folio))
>> - extra_pins = folio_test_swapcache(folio) ?
>> - folio_nr_pages(folio) : 0;
>> - else
>> - extra_pins = folio_nr_pages(folio);
>> - if (pextra_pins)
>> - *pextra_pins = extra_pins;
>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>> - caller_pins;
>> -}
>> -
>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>> {
>> for (; nr_pages; page++, nr_pages--)
>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>> return 0;
>> }
>>
>> +/* Number of folio references from the pagecache or the swapcache. */
>> +static unsigned int folio_cache_references(const struct folio *folio)
>
> folio_cache_ref_count?
Yes, makes sense.
>
>> +{
>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>> + return 0;
>> + return folio_nr_pages(folio);
>> +}
>> +
>
> Does this belong to include/linux/mm.h with the other helpers?
Not for now I think, in particular, as we require earlier
!folio->mapping checks to give a correct answer. Most people should be
using folio_expected_ref_count().
--
Cheers
David
On 25 Nov 2025, at 3:55, David Hildenbrand (Red Hat) wrote:
> On 11/24/25 23:14, Balbir Singh wrote:
>> On 11/22/25 13:55, Zi Yan wrote:
>>> can_split_folio() is just a refcount comparison, making sure only the
>>> split caller holds an extra pin. Open code it with
>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>
>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>> include/linux/huge_mm.h | 1 -
>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>> mm/vmscan.c | 3 ++-
>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 97686fb46e30..1ecaeccf39c9 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -369,7 +369,6 @@ enum split_type {
>>> SPLIT_TYPE_NON_UNIFORM,
>>> };
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>> unsigned int new_order);
>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index c1f1055165dd..6c821c1c0ac3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>> }
>>> }
>>> -/* Racy check whether the huge page can be split */
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>> -{
>>> - int extra_pins;
>>> -
>>> - /* Additional pins from page cache */
>>> - if (folio_test_anon(folio))
>>> - extra_pins = folio_test_swapcache(folio) ?
>>> - folio_nr_pages(folio) : 0;
>>> - else
>>> - extra_pins = folio_nr_pages(folio);
>>> - if (pextra_pins)
>>> - *pextra_pins = extra_pins;
>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>> - caller_pins;
>>> -}
>>> -
>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>> {
>>> for (; nr_pages; page++, nr_pages--)
>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>> return 0;
>>> }
>>> +/* Number of folio references from the pagecache or the swapcache. */
>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>
>> folio_cache_ref_count?
>
> Yes, makes sense.
>
>>
>>> +{
>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>> + return 0;
>>> + return folio_nr_pages(folio);
>>> +}
>>> +
>>
>> Does this belong to include/linux/mm.h with the other helpers?
>
> Not for now I think, in particular, as we require earlier !folio->mapping checks to give a correct answer. Most people should be using folio_expected_ref_count().
>
Got it. Will use folio_cache_ref_count() in the next version.
Best Regards,
Yan, Zi
On 11/22/25 03:55, Zi Yan wrote:
> can_split_folio() is just a refcount comparison, making sure only the
> split caller holds an extra pin. Open code it with
> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
> Signed-off-by: Zi Yan <ziy@nvidia.com>
> ---
> include/linux/huge_mm.h | 1 -
> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
> mm/vmscan.c | 3 ++-
> 3 files changed, 19 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
> index 97686fb46e30..1ecaeccf39c9 100644
> --- a/include/linux/huge_mm.h
> +++ b/include/linux/huge_mm.h
> @@ -369,7 +369,6 @@ enum split_type {
> SPLIT_TYPE_NON_UNIFORM,
> };
>
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
> unsigned int new_order);
> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c1f1055165dd..6c821c1c0ac3 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
> }
> }
>
> -/* Racy check whether the huge page can be split */
> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
> -{
> - int extra_pins;
> -
> - /* Additional pins from page cache */
> - if (folio_test_anon(folio))
> - extra_pins = folio_test_swapcache(folio) ?
> - folio_nr_pages(folio) : 0;
> - else
> - extra_pins = folio_nr_pages(folio);
> - if (pextra_pins)
> - *pextra_pins = extra_pins;
> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
> - caller_pins;
> -}
> -
> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
> {
> for (; nr_pages; page++, nr_pages--)
> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
> return 0;
> }
>
> +/* Number of folio references from the pagecache or the swapcache. */
> +static unsigned int folio_cache_references(const struct folio *folio)
> +{
> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
> + return 0;
> + return folio_nr_pages(folio);
> +}
> +
> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
> struct page *split_at, struct xa_state *xas,
> struct address_space *mapping, bool do_lru,
> struct list_head *list, enum split_type split_type,
> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
> + pgoff_t end, int *nr_shmem_dropped)
> {
> struct folio *end_folio = folio_next(folio);
> struct folio *new_folio, *next;
> int old_order = folio_order(folio);
> int ret = 0;
> struct deferred_split *ds_queue;
> + int extra_pins = folio_cache_references(folio);
Can we just inline the call to folio_cache_references() and get rid of
extra_pins (which is a bad name either way)?
if (folio_ref_freeze(folio, folio_cache_references(folio) + 1)) {
BTW, now that we have this helper, I wonder if we should then also do the
following for clarification on the unfreeze path:
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 0acdc2f26ee0c..7cbcf61b7971d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
zone_device_private_split_cb(folio, new_folio);
- expected_refs = folio_expected_ref_count(new_folio) + 1;
- folio_ref_unfreeze(new_folio, expected_refs);
+ folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
if (do_lru)
lru_add_split_folio(folio, new_folio, lruvec, list);
@@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
* Otherwise, a parallel folio_try_get() can grab @folio
* and its caller can see stale page cache entries.
*/
- expected_refs = folio_expected_ref_count(folio) + 1;
- folio_ref_unfreeze(folio, expected_refs);
+ folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
if (do_lru)
unlock_page_lruvec(lruvec);
--
Cheers
David
On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
> On 11/22/25 03:55, Zi Yan wrote:
>> can_split_folio() is just a refcount comparison, making sure only the
>> split caller holds an extra pin. Open code it with
>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>
>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>> ---
>> include/linux/huge_mm.h | 1 -
>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>> mm/vmscan.c | 3 ++-
>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>
>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>> index 97686fb46e30..1ecaeccf39c9 100644
>> --- a/include/linux/huge_mm.h
>> +++ b/include/linux/huge_mm.h
>> @@ -369,7 +369,6 @@ enum split_type {
>> SPLIT_TYPE_NON_UNIFORM,
>> };
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>> unsigned int new_order);
>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index c1f1055165dd..6c821c1c0ac3 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>> }
>> }
>> -/* Racy check whether the huge page can be split */
>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>> -{
>> - int extra_pins;
>> -
>> - /* Additional pins from page cache */
>> - if (folio_test_anon(folio))
>> - extra_pins = folio_test_swapcache(folio) ?
>> - folio_nr_pages(folio) : 0;
>> - else
>> - extra_pins = folio_nr_pages(folio);
>> - if (pextra_pins)
>> - *pextra_pins = extra_pins;
>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>> - caller_pins;
>> -}
>> -
>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>> {
>> for (; nr_pages; page++, nr_pages--)
>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>> return 0;
>> }
>> +/* Number of folio references from the pagecache or the swapcache. */
>> +static unsigned int folio_cache_references(const struct folio *folio)
>> +{
>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>> + return 0;
>> + return folio_nr_pages(folio);
>> +}
>> +
>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>> struct page *split_at, struct xa_state *xas,
>> struct address_space *mapping, bool do_lru,
>> struct list_head *list, enum split_type split_type,
>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>> + pgoff_t end, int *nr_shmem_dropped)
>> {
>> struct folio *end_folio = folio_next(folio);
>> struct folio *new_folio, *next;
>> int old_order = folio_order(folio);
>> int ret = 0;
>> struct deferred_split *ds_queue;
>> + int extra_pins = folio_cache_references(folio);
>
> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
> (which is a bad name either way)
>
>
> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>
>
> BTW, now that we have this helper, I wonder if we should then also do for
> clarification on the unfreeze path:
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 0acdc2f26ee0c..7cbcf61b7971d 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> zone_device_private_split_cb(folio, new_folio);
> - expected_refs = folio_expected_ref_count(new_folio) + 1;
> - folio_ref_unfreeze(new_folio, expected_refs);
> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
> if (do_lru)
> lru_add_split_folio(folio, new_folio, lruvec, list);
> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
> * Otherwise, a parallel folio_try_get() can grab @folio
> * and its caller can see stale page cache entries.
> */
> - expected_refs = folio_expected_ref_count(folio) + 1;
> - folio_ref_unfreeze(folio, expected_refs);
> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
> if (do_lru)
> unlock_page_lruvec(lruvec);
>
>
Both make sense to me. Will make the change.
By comparing folio_cache_references() with folio_expected_ref_count(),
one difference is that folio_expected_ref_count() does not give the right
refcount for shmem folios in the swapcache.
This is the folio_expected_ref_count() code:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
Shmem in swapcache means !folio_test_anon(folio) && folio_test_swapcache(folio).
The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
It should not cause any issue, since IIUC shmem in swapcache only happens
while the folio holds an additional ref, so
folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
not supported yet, so folio_expected_ref_count() in the split code does not
affect shmem in swapcache. But folio_expected_ref_count() should be
fixed, right?
Like:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from shmem in the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
or simplified into
if (!folio_test_anon(folio)) {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
/* One reference per page from the swapcache (anon or shmem). */
ref_count += folio_test_swapcache(folio) << order;
?
Best Regards,
Yan, Zi
On 11/24/25 18:05, Zi Yan wrote:
> On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
>
>> On 11/22/25 03:55, Zi Yan wrote:
>>> can_split_folio() is just a refcount comparison, making sure only the
>>> split caller holds an extra pin. Open code it with
>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>
>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>> ---
>>> include/linux/huge_mm.h | 1 -
>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>> mm/vmscan.c | 3 ++-
>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>
>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>> index 97686fb46e30..1ecaeccf39c9 100644
>>> --- a/include/linux/huge_mm.h
>>> +++ b/include/linux/huge_mm.h
>>> @@ -369,7 +369,6 @@ enum split_type {
>>> SPLIT_TYPE_NON_UNIFORM,
>>> };
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>> unsigned int new_order);
>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index c1f1055165dd..6c821c1c0ac3 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>> }
>>> }
>>> -/* Racy check whether the huge page can be split */
>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>> -{
>>> - int extra_pins;
>>> -
>>> - /* Additional pins from page cache */
>>> - if (folio_test_anon(folio))
>>> - extra_pins = folio_test_swapcache(folio) ?
>>> - folio_nr_pages(folio) : 0;
>>> - else
>>> - extra_pins = folio_nr_pages(folio);
>>> - if (pextra_pins)
>>> - *pextra_pins = extra_pins;
>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>> - caller_pins;
>>> -}
>>> -
>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>> {
>>> for (; nr_pages; page++, nr_pages--)
>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>> return 0;
>>> }
>>> +/* Number of folio references from the pagecache or the swapcache. */
>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>> +{
>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>> + return 0;
>>> + return folio_nr_pages(folio);
>>> +}
>>> +
>>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>>> struct page *split_at, struct xa_state *xas,
>>> struct address_space *mapping, bool do_lru,
>>> struct list_head *list, enum split_type split_type,
>>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>>> + pgoff_t end, int *nr_shmem_dropped)
>>> {
>>> struct folio *end_folio = folio_next(folio);
>>> struct folio *new_folio, *next;
>>> int old_order = folio_order(folio);
>>> int ret = 0;
>>> struct deferred_split *ds_queue;
>>> + int extra_pins = folio_cache_references(folio);
>>
>> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
>> (which is a bad name either way)
>>
>>
>> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>>
>>
>> BTW, now that we have this helper, I wonder if we should then also do for
>> clarification on the unfreeze path:
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 0acdc2f26ee0c..7cbcf61b7971d 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>> zone_device_private_split_cb(folio, new_folio);
>> - expected_refs = folio_expected_ref_count(new_folio) + 1;
>> - folio_ref_unfreeze(new_folio, expected_refs);
>> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
>> if (do_lru)
>> lru_add_split_folio(folio, new_folio, lruvec, list);
>> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>> * Otherwise, a parallel folio_try_get() can grab @folio
>> * and its caller can see stale page cache entries.
>> */
>> - expected_refs = folio_expected_ref_count(folio) + 1;
>> - folio_ref_unfreeze(folio, expected_refs);
>> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
>> if (do_lru)
>> unlock_page_lruvec(lruvec);
>>
>>
>
> Both make sense to me. Will make the change.
>
> By comparing folio_cache_references() with folio_expected_ref_count(),
> one difference is that folio_expected_ref_count() does not give right
> refcount for shmem in swapcache.
Good point. Likely nobody runs into that right now because nobody can
really do anything with these folios before they are re-added to the
pagecache or mapped into page tables.
>
> This is the folio_expected_ref_count() code:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
> shmem in swapcache mean !folio_test_anon(folio) && folio_test_swapcache(folio).
See below, it's actually
folio_test_anon(folio) && folio_test_swapbacked(folio) && folio_test_swapcache(folio)
I think ...
> The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
> It should not cause any issue, since IIUC shmem in swapcache happens
> when the folio has an additional ref,
> folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
> not supported yet,
Right.
> so folio_expected_ref_count() in split code does not
> affect shmem in swapcache. But folio_expected_ref_count() should be
> fixed, right?
We should better handle it, agreed.
Staring at the history of folio_expected_ref_count() once again: back
when we had folio_expected_refs() in the migration code, we didn't seem
to handle it, I think.
-static int folio_expected_refs(struct address_space *mapping,
- struct folio *folio)
-{
- int refs = 1;
- if (!mapping)
- return refs;
-
- refs += folio_nr_pages(folio);
- if (folio_test_private(folio))
- refs++;
-
- return refs;
-}
gup.c doesn't care, because the pages are still mapped.
khugepaged.c similarly.
memfd.c doesn't care because the pages are still in the pagecache.
So I suspect nothing is broken, but the migration case needs a second look.
>
> Like:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from shmem in the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
> or simplified into
>
> if (!folio_test_anon(folio)) {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
> /* One reference per page from the swapcache (anon or shmem). */
> ref_count += folio_test_swapcache(folio) << order;
> ?
That is incorrect I think due to swapcache being able to give false
positives (PG_owner_priv_1).
--
Cheers
David
On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
> On 11/24/25 18:05, Zi Yan wrote:
>> On 24 Nov 2025, at 5:41, David Hildenbrand (Red Hat) wrote:
>>
>>> On 11/22/25 03:55, Zi Yan wrote:
>>>> can_split_folio() is just a refcount comparison, making sure only the
>>>> split caller holds an extra pin. Open code it with
>>>> folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>>>> used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>>>>
>>>> Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>>>> Signed-off-by: Zi Yan <ziy@nvidia.com>
>>>> ---
>>>> include/linux/huge_mm.h | 1 -
>>>> mm/huge_memory.c | 43 ++++++++++++++++-------------------------
>>>> mm/vmscan.c | 3 ++-
>>>> 3 files changed, 19 insertions(+), 28 deletions(-)
>>>>
>>>> diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
>>>> index 97686fb46e30..1ecaeccf39c9 100644
>>>> --- a/include/linux/huge_mm.h
>>>> +++ b/include/linux/huge_mm.h
>>>> @@ -369,7 +369,6 @@ enum split_type {
>>>> SPLIT_TYPE_NON_UNIFORM,
>>>> };
>>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins);
>>>> int __split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
>>>> unsigned int new_order);
>>>> int folio_split_unmapped(struct folio *folio, unsigned int new_order);
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index c1f1055165dd..6c821c1c0ac3 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -3455,23 +3455,6 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>>> }
>>>> }
>>>> -/* Racy check whether the huge page can be split */
>>>> -bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
>>>> -{
>>>> - int extra_pins;
>>>> -
>>>> - /* Additional pins from page cache */
>>>> - if (folio_test_anon(folio))
>>>> - extra_pins = folio_test_swapcache(folio) ?
>>>> - folio_nr_pages(folio) : 0;
>>>> - else
>>>> - extra_pins = folio_nr_pages(folio);
>>>> - if (pextra_pins)
>>>> - *pextra_pins = extra_pins;
>>>> - return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
>>>> - caller_pins;
>>>> -}
>>>> -
>>>> static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
>>>> {
>>>> for (; nr_pages; page++, nr_pages--)
>>>> @@ -3776,17 +3759,26 @@ int folio_check_splittable(struct folio *folio, unsigned int new_order,
>>>> return 0;
>>>> }
>>>> +/* Number of folio references from the pagecache or the swapcache. */
>>>> +static unsigned int folio_cache_references(const struct folio *folio)
>>>> +{
>>>> + if (folio_test_anon(folio) && !folio_test_swapcache(folio))
>>>> + return 0;
>>>> + return folio_nr_pages(folio);
>>>> +}
>>>> +
>>>> static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int new_order,
>>>> struct page *split_at, struct xa_state *xas,
>>>> struct address_space *mapping, bool do_lru,
>>>> struct list_head *list, enum split_type split_type,
>>>> - pgoff_t end, int *nr_shmem_dropped, int extra_pins)
>>>> + pgoff_t end, int *nr_shmem_dropped)
>>>> {
>>>> struct folio *end_folio = folio_next(folio);
>>>> struct folio *new_folio, *next;
>>>> int old_order = folio_order(folio);
>>>> int ret = 0;
>>>> struct deferred_split *ds_queue;
>>>> + int extra_pins = folio_cache_references(folio);
>>>
>>> Can we just inline the call do folio_cache_references() and get rid of extra_pins.
>>> (which is a bad name either way)
>>>
>>>
>>> if (folio_ref_freeze(folio, folio_cache_references(folio) + 1) {
>>>
>>>
>>> BTW, now that we have this helper, I wonder if we should then also do for
>>> clarification on the unfreeze path:
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 0acdc2f26ee0c..7cbcf61b7971d 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -3824,8 +3824,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>>> zone_device_private_split_cb(folio, new_folio);
>>> - expected_refs = folio_expected_ref_count(new_folio) + 1;
>>> - folio_ref_unfreeze(new_folio, expected_refs);
>>> + folio_ref_unfreeze(new_folio, folio_cache_references(new_folio) + 1);
>>> if (do_lru)
>>> lru_add_split_folio(folio, new_folio, lruvec, list);
>>> @@ -3868,8 +3867,7 @@ static int __folio_freeze_and_split_unmapped(struct folio *folio, unsigned int n
>>> * Otherwise, a parallel folio_try_get() can grab @folio
>>> * and its caller can see stale page cache entries.
>>> */
>>> - expected_refs = folio_expected_ref_count(folio) + 1;
>>> - folio_ref_unfreeze(folio, expected_refs);
>>> + folio_ref_unfreeze(folio, folio_cache_references(folio) + 1);
>>> if (do_lru)
>>> unlock_page_lruvec(lruvec);
>>>
>>>
>>
>> Both make sense to me. Will make the change.
>>
>> By comparing folio_cache_references() with folio_expected_ref_count(),
>> one difference is that folio_expected_ref_count() does not give right
>> refcount for shmem in swapcache.
>
> Good point. Likely nobody runs into that right now because nobody can really does anything with these folios before they were re-added to the pagecache or mapped into page tables.
>
>>
>> This is the folio_expected_ref_count() code:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>>
>> shmem in swapcache mean !folio_test_anon(folio) && folio_test_swapcache(folio).
>
> See below, it's actually
>
> folio_test_anon(folio) && folio_test_swapbacked(folio)&& folio_test_swapcache(folio)
!folio_test_anon(folio) && folio_test_swapbacked(folio) && folio_test_swapcache(folio)
Right?
>
> I think ...
>
>> The above code gives 0, but folio_cache_references() gives folio_nr_pages(folio).
>> It should not cause any issue, since IIUC shmem in swapcache happens
>> when the folio has an additional ref,
>> folio_expected_ref_count() != folio_ref_count() anyway. For split, it is
>> not supported yet,
>
> Right.
>
>> so folio_expected_ref_count() in split code does not
>> affect shmem in swapcache. But folio_expected_ref_count() should be
>> fixed, right?
>
> We should better handle it, agreed.
>
> Staring at the history of folio_expected_ref_count() once again, back when we had folio_expected_refs() in migration code we didn't seem to handle it I think.
>
> -static int folio_expected_refs(struct address_space *mapping,
> - struct folio *folio)
> -{
> - int refs = 1;
> - if (!mapping)
> - return refs;
> -
> - refs += folio_nr_pages(folio);
> - if (folio_test_private(folio))
> - refs++;
> -
> - return refs;
> -}
>
>
> gup.c doesn't care, because the pages are still mapped.
>
> khugepaged.c similarly.
>
> memfd.c doesn't care because the pages are still in the pagecache.
>
> So I suspect nothing is broken, but the migration case needs a second look.
For migration, shmem in swapcache happens in shmem_writeout(), where an
additional ref is placed on the folio. The migration caller also places
a ref on the folio before migrating it. The folio therefore has 2 refs,
which is not equal to folio_expected_ref_count() (returning 0) + 1,
nor to folio_expected_refs() (returning 1).
So it is safe.
>
>>
>> Like:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from shmem in the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>>
>> or simplified into
>>
>> if (!folio_test_anon(folio)) {
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>> /* One reference per page from the swapcache (anon or shmem). */
>> ref_count += folio_test_swapcache(folio) << order;
>> ?
>
> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
Got it. So it should be:
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else {
/* One reference per page from shmem in the swapcache. */
ref_count += (folio_test_swapbacked(folio) &&
folio_test_swapcache(folio)) << order;
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
I wonder if we should have folio_test_shmem_in_swapcache() instead.
BTW, this page flag reuse is really confusing. I see PG_checked is
PG_owner_priv_1 too, and __folio_migrate_mapping() uses folio_test_swapcache()
to decide the number of i_pages entries. Wouldn't that cause an issue?
ext4 does not release_folio() for migration when PG_checked is set, and
ubifs clears PG_checked in release_folio(). I have not checked all the other
filesystems yet. Maybe later.
Best Regards,
Yan, Zi
On 2025/11/25 5:08, Zi Yan wrote:
> On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
>
<snip>
>
> BTW, this page flag reuse is really confusing. I see PG_checked is
> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
> to decide the number of i_pages entries. Wouldn’t that cause any issue?
> ext4 does not release_folio() for migration when PG_checked is set,
> ubifs clears PG_checked in release_folio(). I have not checked all other FS
> yet. Maybe later.
folio_test_swapbacked() is also checked in folio_test_swapcache:
static __always_inline bool folio_test_swapcache(struct folio *folio)
{
return folio_test_swapbacked(folio) &&
test_bit(PG_swapcache, folio_flags(folio, 0));
}
So IMHO the reuse of this page flag should work fine.
Thanks.
On 11/25/25 10:10, Miaohe Lin wrote:
> On 2025/11/25 5:08, Zi Yan wrote:
>> On 24 Nov 2025, at 14:22, David Hildenbrand (Red Hat) wrote:
>>
>
> <snip>
>
>>
>> BTW, this page flag reuse is really confusing. I see PG_checked is
>> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
>> to decide the number of i_pages entries. Wouldn’t that cause any issue?
>> ext4 does not release_folio() for migration when PG_checked is set,
>> ubifs clears PG_checked in release_folio(). I have not checked all other FS
>> yet. Maybe later.
>
> folio_test_swapbacked() is also checked in folio_test_swapcache:
>
> static __always_inline bool folio_test_swapcache(struct folio *folio)
> {
> return folio_test_swapbacked(folio) &&
> test_bit(PG_swapcache, folio_flags(folio, 0));
> }
>
> So IMHO the reuse of this page flag should work fine.
Ahh, thanks for pointing that out. Confusing, as usually the
folio_test_*() helpers are straight bit tests.
All good then :)
--
Cheers
David
>>
>>>
>>> Like:
>>>
>>> if (folio_test_anon(folio)) {
>>> /* One reference per page from the swapcache. */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> } else {
>>> /* One reference per page from shmem in the swapcache. */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> /* One reference per page from the pagecache. */
>>> ref_count += !!folio->mapping << order;
>>> /* One reference from PG_private. */
>>> ref_count += folio_test_private(folio);
>>> }
>>>
>>> or simplified into
>>>
>>> if (!folio_test_anon(folio)) {
>>> /* One reference per page from the pagecache. */
>>> ref_count += !!folio->mapping << order;
>>> /* One reference from PG_private. */
>>> ref_count += folio_test_private(folio);
>>> }
>>> /* One reference per page from the swapcache (anon or shmem). */
>>> ref_count += folio_test_swapcache(folio) << order;
>>> ?
>>
>> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
>
> Got it. So it should be:
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else {
> /* One reference per page from shmem in the swapcache. */
> ref_count += (folio_test_swapbacked (folio) &&
> folio_test_swapcache(folio)) << order;
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
Interestingly, I think we would then also take proper care of anon folios in the
swapcache that are not anon yet. See __read_swap_cache_async().
I wonder if we can clean that up a bit, to highlight that PG_private etc
do not apply.
if (folio_test_anon(folio)) {
/* One reference per page from the swapcache. */
ref_count += folio_test_swapcache(folio) << order;
} else if (folio_test_swapbacked(folio) && folio_test_swapcache(folio)) {
/* to-be-anon or shmem folio in the swapcache (!folio->mapping) */
ref_count += 1ul << order;
VM_WARN_ON_ONCE(folio->mapping);
} else {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
Or maybe simply:
if (folio_test_swapbacked(folio) && folio_test_swapcache(folio)) {
/*
* (to-be) anon or shmem (!folio->mapping) folio in the swapcache:
* One reference per page from the swapcache.
*/
ref_count += 1 << order;
VM_WARN_ON_ONCE(!folio_test_anon(folio) && folio->mapping);
} else if (!folio_test_anon(folio)) {
/* One reference per page from the pagecache. */
ref_count += !!folio->mapping << order;
/* One reference from PG_private. */
ref_count += folio_test_private(folio);
}
>
> I wonder if we should have folio_test_shmem_in_swapcache() instead.
Interestingly, thinking about it, I think it would also match to-be-anon
folios and anon folios.
folio_in_swapcache() maybe?
>
> BTW, this page flag reuse is really confusing.
Yes ...
> I see PG_checked is
> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
> to decide the number of i_pages entries. Wouldn’t that cause any issue?
Maybe at that point all false positives were ruled out?
It is horrible TBH.
--
Cheers
David
On 25 Nov 2025, at 3:52, David Hildenbrand (Red Hat) wrote:
>>>
>>>>
>>>> Like:
>>>>
>>>> if (folio_test_anon(folio)) {
>>>> /* One reference per page from the swapcache. */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> } else {
>>>> /* One reference per page from shmem in the swapcache. */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> /* One reference per page from the pagecache. */
>>>> ref_count += !!folio->mapping << order;
>>>> /* One reference from PG_private. */
>>>> ref_count += folio_test_private(folio);
>>>> }
>>>>
>>>> or simplified into
>>>>
>>>> if (!folio_test_anon(folio)) {
>>>> /* One reference per page from the pagecache. */
>>>> ref_count += !!folio->mapping << order;
>>>> /* One reference from PG_private. */
>>>> ref_count += folio_test_private(folio);
>>>> }
>>>> /* One reference per page from the swapcache (anon or shmem). */
>>>> ref_count += folio_test_swapcache(folio) << order;
>>>> ?
>>>
>>> That is incorrect I think due to swapcache being able to give false positives (PG_owner_priv_1).
>>
>> Got it. So it should be:
>>
>> if (folio_test_anon(folio)) {
>> /* One reference per page from the swapcache. */
>> ref_count += folio_test_swapcache(folio) << order;
>> } else {
>> /* One reference per page from shmem in the swapcache. */
>> ref_count += (folio_test_swapbacked (folio) &&
>> folio_test_swapcache(folio)) << order;
>> /* One reference per page from the pagecache. */
>> ref_count += !!folio->mapping << order;
>> /* One reference from PG_private. */
>> ref_count += folio_test_private(folio);
>> }
>
> Interestingly, I think we would then also take proper care of anon folios in the
> swapcache that are not anon yet. See __read_swap_cache_async().
Right. After add_to_swap_cache() in __read_swap_cache_async(), the folio
is in the same state as a shmem folio in the swapcache.
>
> I wonder if we can clean that up a bit, to highlight that PG_private etc
> do not apply.
>
> if (folio_test_anon(folio)) {
> /* One reference per page from the swapcache. */
> ref_count += folio_test_swapcache(folio) << order;
> } else if (folio_test_swapbacked (folio) && folio_test_swapcache(folio)) {
> /* to-be-anon or shmem folio in the swapcache (!folio->mapping) */
> ref_count += 1ul << order;
> VM_WAN_ON_ONCE(folio->mapping);
> } else {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
I like this better and will send a patch for folio_expected_ref_count()
separately. Since folio_test_swapcache(folio) implies
folio_test_swapbacked(folio), as Maolin pointed out in another email,
I will get rid of folio_test_swapbacked(folio) in the above code.
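Concretely, that would leave something like this (just a sketch, the
actual follow-up patch may differ):

	if (folio_test_anon(folio)) {
		/* One reference per page from the swapcache. */
		ref_count += folio_test_swapcache(folio) << order;
	} else if (folio_test_swapcache(folio)) {
		/* to-be-anon or shmem folio in the swapcache (!folio->mapping). */
		ref_count += 1ul << order;
		VM_WARN_ON_ONCE(folio->mapping);
	} else {
		/* One reference per page from the pagecache. */
		ref_count += !!folio->mapping << order;
		/* One reference from PG_private. */
		ref_count += folio_test_private(folio);
	}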
>
> Or maybe simply:
>
>
> if (folio_test_swapbacked (folio) && folio_test_swapcache(folio)) {
> /*
> * (to-be) anon or shmem (!folio->mapping) folio in the swapcache:
> * One reference per page from the swapcache.
> */
> ref_count += 1 << order;
> VM_WAN_ON_ONCE(!folio_test_anon(folio) && folio->mapping);
> } else if (!folio_test_anon(folio)) {
> /* One reference per page from the pagecache. */
> ref_count += !!folio->mapping << order;
> /* One reference from PG_private. */
> ref_count += folio_test_private(folio);
> }
>
>>
>> I wonder if we should have folio_test_shmem_in_swapcache() instead.
>
> Interestingly, thinking about it, I think it would also match to-be anon folios
> and anon folios.
>
> folio_in_swapcache() maybe ?
Yes, will come up with a patch for it and send along with
folio_expected_ref_count() patch.
>
>>
>> BTW, this page flag reuse is really confusing.
>
> Yes ...
>
>> I see PG_checked is
>> PG_owner_priv_1 too and __folio_migrate_mapping() uses folio_test_swapcache()
>> to decide the number of i_pages entries. Wouldn’t that cause any issue?
>
> Maybe at that point all false positives were ruled out?
>
> It is horrible TBH.
>
> --
> Cheers
>
> David
Best Regards,
Yan, Zi
On Fri, Nov 21, 2025 at 09:55:27PM -0500, Zi Yan wrote:
>can_split_folio() is just a refcount comparison, making sure only the
>split caller holds an extra pin. Open code it with
>folio_expected_ref_count() != folio_ref_count() - 1. For the extra_pins
>used by folio_ref_freeze(), add folio_cache_references() to calculate it.
>
>Suggested-by: David Hildenbrand (Red Hat) <david@kernel.org>
>Signed-off-by: Zi Yan <ziy@nvidia.com>
LGTM, Thanks
Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
--
Wei Yang
Help you, Help me