Add support for splitting device-private THP folios, enabling fallback
to smaller page sizes when large page allocation or migration fails.
Key changes:
- split_huge_pmd(): Handle device-private PMD entries during splitting
- Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
- Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
don't support shared zero page semantics
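As an illustration only (not part of this patch): with the series applied, a
driver-side fallback can split a device-private THP folio and retry migration at
base-page granularity. The helper below is hypothetical (its name and placement
are made up); it assumes only the existing folio_lock()/split_folio() interfaces
and a caller-held folio reference.

        /*
         * Hypothetical sketch: split a device-private THP folio (the caller
         * holds a reference) so migration can be retried one base page at a
         * time. split_folio() requires the folio lock and returns 0 on success.
         */
        static int devmem_split_fallback(struct folio *folio)
        {
                int ret;

                folio_lock(folio);
                ret = split_folio(folio);
                folio_unlock(folio);

                return ret;
        }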
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
---
mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
1 file changed, 98 insertions(+), 40 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 78166db72f4d..5291ee155a02 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false, dirty = false;
+ bool soft_dirty, uffd_wp = false, young = false, write = false;
+ bool anon_exclusive = false, dirty = false, present = false;
unsigned long addr;
pte_t *pte;
int i;
+ swp_entry_t swp_entry;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+ VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ present = pmd_present(*pmd);
+ if (is_pmd_migration_entry(*pmd)) {
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
+ swp_entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_swap_entry_to_page(swp_entry);
+ folio = page_folio(page);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = is_writable_migration_entry(swp_entry);
if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+ anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
+ young = is_migration_entry_young(swp_entry);
+ dirty = is_migration_entry_dirty(swp_entry);
+ } else if (is_pmd_device_private_entry(*pmd)) {
+ old_pmd = *pmd;
+ swp_entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_swap_entry_to_page(swp_entry);
+ folio = page_folio(page);
+
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = is_writable_device_private_entry(swp_entry);
+ anon_exclusive = PageAnonExclusive(page);
+
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
} else {
/*
* Up to this point the pmd is present and huge and userland has
@@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- swp_entry_t swp_entry;
-
- if (write)
- swp_entry = make_writable_migration_entry(
- page_to_pfn(page + i));
- else if (anon_exclusive)
- swp_entry = make_readable_exclusive_migration_entry(
- page_to_pfn(page + i));
- else
- swp_entry = make_readable_migration_entry(
- page_to_pfn(page + i));
- if (young)
- swp_entry = make_migration_entry_young(swp_entry);
- if (dirty)
- swp_entry = make_migration_entry_dirty(swp_entry);
- entry = swp_entry_to_pte(swp_entry);
- if (soft_dirty)
- entry = pte_swp_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_swp_mkuffd_wp(entry);
+ if (freeze || !present) {
+ pte_t entry;
- VM_WARN_ON(!pte_none(ptep_get(pte + i)));
- set_pte_at(mm, addr, pte + i, entry);
+ if (freeze || is_migration_entry(swp_entry)) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
+
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
+ } else {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ /*
+ * anon_exclusive was already propagated to the relevant
+ * pages corresponding to the pte entries when freeze
+ * is false.
+ */
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ /*
+				 * Young and dirty bits are not propagated via swp_entry
+ */
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
}
} else {
pte_t entry;
@@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (!is_pmd_migration_entry(*pmd))
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+ if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
+ if (folio_is_device_private(folio))
+ return;
+
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
@@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
- if (!ret && is_anon)
+ if (!ret && is_anon && !folio_is_device_private(folio))
remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
remap_page(folio, 1 << order, remap_flags);
/*
--
2.50.1
On 16 Sep 2025, at 8:21, Balbir Singh wrote:
> Add support for splitting device-private THP folios, enabling fallback
> to smaller page sizes when large page allocation or migration fails.
>
> Key changes:
> - split_huge_pmd(): Handle device-private PMD entries during splitting
> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
> don't support shared zero page semantics
>
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> Cc: David Hildenbrand <david@redhat.com>
> Cc: Zi Yan <ziy@nvidia.com>
> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
> Cc: Rakie Kim <rakie.kim@sk.com>
> Cc: Byungchul Park <byungchul@sk.com>
> Cc: Gregory Price <gourry@gourry.net>
> Cc: Ying Huang <ying.huang@linux.alibaba.com>
> Cc: Alistair Popple <apopple@nvidia.com>
> Cc: Oscar Salvador <osalvador@suse.de>
> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
> Cc: Nico Pache <npache@redhat.com>
> Cc: Ryan Roberts <ryan.roberts@arm.com>
> Cc: Dev Jain <dev.jain@arm.com>
> Cc: Barry Song <baohua@kernel.org>
> Cc: Lyude Paul <lyude@redhat.com>
> Cc: Danilo Krummrich <dakr@kernel.org>
> Cc: David Airlie <airlied@gmail.com>
> Cc: Simona Vetter <simona@ffwll.ch>
> Cc: Ralph Campbell <rcampbell@nvidia.com>
> Cc: Mika Penttilä <mpenttil@redhat.com>
> Cc: Matthew Brost <matthew.brost@intel.com>
> Cc: Francois Dugast <francois.dugast@intel.com>
> ---
> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
> 1 file changed, 98 insertions(+), 40 deletions(-)
>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 78166db72f4d..5291ee155a02 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> struct page *page;
> pgtable_t pgtable;
> pmd_t old_pmd, _pmd;
> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> - bool anon_exclusive = false, dirty = false;
> + bool soft_dirty, uffd_wp = false, young = false, write = false;
> + bool anon_exclusive = false, dirty = false, present = false;
> unsigned long addr;
> pte_t *pte;
> int i;
> + swp_entry_t swp_entry;
>
> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
> VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>
> count_vm_event(THP_SPLIT_PMD);
>
> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> return __split_huge_zero_page_pmd(vma, haddr, pmd);
> }
>
> - pmd_migration = is_pmd_migration_entry(*pmd);
> - if (unlikely(pmd_migration)) {
> - swp_entry_t entry;
>
> + present = pmd_present(*pmd);
> + if (is_pmd_migration_entry(*pmd)) {
> old_pmd = *pmd;
> - entry = pmd_to_swp_entry(old_pmd);
> - page = pfn_swap_entry_to_page(entry);
> - write = is_writable_migration_entry(entry);
> + swp_entry = pmd_to_swp_entry(old_pmd);
> + page = pfn_swap_entry_to_page(swp_entry);
> + folio = page_folio(page);
> +
> + soft_dirty = pmd_swp_soft_dirty(old_pmd);
> + uffd_wp = pmd_swp_uffd_wp(old_pmd);
> +
> + write = is_writable_migration_entry(swp_entry);
> if (PageAnon(page))
> - anon_exclusive = is_readable_exclusive_migration_entry(entry);
> - young = is_migration_entry_young(entry);
> - dirty = is_migration_entry_dirty(entry);
> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
> + young = is_migration_entry_young(swp_entry);
> + dirty = is_migration_entry_dirty(swp_entry);
> + } else if (is_pmd_device_private_entry(*pmd)) {
> + old_pmd = *pmd;
> + swp_entry = pmd_to_swp_entry(old_pmd);
> + page = pfn_swap_entry_to_page(swp_entry);
> + folio = page_folio(page);
> +
> soft_dirty = pmd_swp_soft_dirty(old_pmd);
> uffd_wp = pmd_swp_uffd_wp(old_pmd);
> +
> + write = is_writable_device_private_entry(swp_entry);
> + anon_exclusive = PageAnonExclusive(page);
> +
> + if (freeze && anon_exclusive &&
> + folio_try_share_anon_rmap_pmd(folio, page))
> + freeze = false;
Why is it OK to change the freeze request? OK, it is replicating
the code for present PMD folios. Either add a comment to point
to the explanation in the comment below, or move
“if (is_pmd_device_private_entry(*pmd))“ branch in the else below
to deduplicate this code.
> + if (!freeze) {
> + rmap_t rmap_flags = RMAP_NONE;
> +
> + folio_ref_add(folio, HPAGE_PMD_NR - 1);
> + if (anon_exclusive)
> + rmap_flags |= RMAP_EXCLUSIVE;
> +
> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> + vma, haddr, rmap_flags);
> + }
> } else {
> /*
> * Up to this point the pmd is present and huge and userland has
> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> * Note that NUMA hinting access restrictions are not transferred to
> * avoid any possibility of altering permissions across VMAs.
> */
> - if (freeze || pmd_migration) {
> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
> - pte_t entry;
> - swp_entry_t swp_entry;
> -
> - if (write)
> - swp_entry = make_writable_migration_entry(
> - page_to_pfn(page + i));
> - else if (anon_exclusive)
> - swp_entry = make_readable_exclusive_migration_entry(
> - page_to_pfn(page + i));
> - else
> - swp_entry = make_readable_migration_entry(
> - page_to_pfn(page + i));
> - if (young)
> - swp_entry = make_migration_entry_young(swp_entry);
> - if (dirty)
> - swp_entry = make_migration_entry_dirty(swp_entry);
> - entry = swp_entry_to_pte(swp_entry);
> - if (soft_dirty)
> - entry = pte_swp_mksoft_dirty(entry);
> - if (uffd_wp)
> - entry = pte_swp_mkuffd_wp(entry);
> + if (freeze || !present) {
> + pte_t entry;
>
> - VM_WARN_ON(!pte_none(ptep_get(pte + i)));
> - set_pte_at(mm, addr, pte + i, entry);
> + if (freeze || is_migration_entry(swp_entry)) {
>
<snip>
> + } else {
<snip>
> }
> } else {
> pte_t entry;
David already pointed this out in v5. It can be done such as:
if (freeze || pmd_migration) {
        ...
} else if (is_pmd_device_private_entry(old_pmd)) {
        ...
} else {
        /* for present, non freeze case */
}
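A sketch of how that could read, with the device-private arm taken from the loop
this patch already adds (only the branch structure differs; names follow the patch):

        if (freeze || pmd_migration) {
                /* existing loop installing per-page migration entries */
        } else if (is_pmd_device_private_entry(old_pmd)) {
                for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
                        if (write)
                                swp_entry = make_writable_device_private_entry(
                                                        page_to_pfn(page + i));
                        else
                                swp_entry = make_readable_device_private_entry(
                                                        page_to_pfn(page + i));
                        /* young and dirty bits are not carried by swp_entry */
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
                        if (uffd_wp)
                                entry = pte_swp_mkuffd_wp(entry);
                        VM_WARN_ON(!pte_none(ptep_get(pte + i)));
                        set_pte_at(mm, addr, pte + i, entry);
                }
        } else {
                /* present, non-freeze case: install ordinary PTEs as before */
        }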
> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
> }
> pte_unmap(pte);
>
> - if (!pmd_migration)
> + if (!is_pmd_migration_entry(*pmd))
> folio_remove_rmap_pmd(folio, page, vma);
> if (freeze)
> put_page(page);
> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
> pmd_t *pmd, bool freeze)
> {
> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
> __split_huge_pmd_locked(vma, pmd, address, freeze);
> }
>
> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
> lockdep_assert_held(&lruvec->lru_lock);
>
> + if (folio_is_device_private(folio))
> + return;
> +
> if (list) {
> /* page reclaim is reclaiming a huge page */
> VM_WARN_ON(folio_test_lru(folio));
> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
> if (nr_shmem_dropped)
> shmem_uncharge(mapping->host, nr_shmem_dropped);
>
> - if (!ret && is_anon)
> + if (!ret && is_anon && !folio_is_device_private(folio))
> remap_flags = RMP_USE_SHARED_ZEROPAGE;
> +
You should remove this and add

        if (folio_is_device_private(folio))
                return false;

in try_to_map_unused_to_zeropage(). Otherwise, no one would know
device private folios need to be excluded from mapping unused to
zero page.
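A minimal sketch of that placement (assuming the signature that matches the
existing call try_to_map_unused_to_zeropage(&pvmw, folio, idx) in
remove_migration_pte(); only the early return is new):

        static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
                                                  struct folio *folio,
                                                  unsigned long idx)
        {
                /*
                 * Device-private folios live in device memory and cannot be
                 * replaced by the shared zeropage.
                 */
                if (folio_is_device_private(folio))
                        return false;

                /* ... existing zero-content check and zeropage PTE install ... */
        }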
> remap_page(folio, 1 << order, remap_flags);
>
> /*
> --
> 2.50.1
Best Regards,
Yan, Zi
On 9/23/25 07:09, Zi Yan wrote:
> On 16 Sep 2025, at 8:21, Balbir Singh wrote:
>
>> Add support for splitting device-private THP folios, enabling fallback
>> to smaller page sizes when large page allocation or migration fails.
>>
>> Key changes:
>> - split_huge_pmd(): Handle device-private PMD entries during splitting
>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>> don't support shared zero page semantics
>>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> Cc: David Hildenbrand <david@redhat.com>
>> Cc: Zi Yan <ziy@nvidia.com>
>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>> Cc: Rakie Kim <rakie.kim@sk.com>
>> Cc: Byungchul Park <byungchul@sk.com>
>> Cc: Gregory Price <gourry@gourry.net>
>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>> Cc: Alistair Popple <apopple@nvidia.com>
>> Cc: Oscar Salvador <osalvador@suse.de>
>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>> Cc: Nico Pache <npache@redhat.com>
>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>> Cc: Dev Jain <dev.jain@arm.com>
>> Cc: Barry Song <baohua@kernel.org>
>> Cc: Lyude Paul <lyude@redhat.com>
>> Cc: Danilo Krummrich <dakr@kernel.org>
>> Cc: David Airlie <airlied@gmail.com>
>> Cc: Simona Vetter <simona@ffwll.ch>
>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>> Cc: Mika Penttilä <mpenttil@redhat.com>
>> Cc: Matthew Brost <matthew.brost@intel.com>
>> Cc: Francois Dugast <francois.dugast@intel.com>
>> ---
>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
>> 1 file changed, 98 insertions(+), 40 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 78166db72f4d..5291ee155a02 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>> struct page *page;
>> pgtable_t pgtable;
>> pmd_t old_pmd, _pmd;
>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>> - bool anon_exclusive = false, dirty = false;
>> + bool soft_dirty, uffd_wp = false, young = false, write = false;
>> + bool anon_exclusive = false, dirty = false, present = false;
>> unsigned long addr;
>> pte_t *pte;
>> int i;
>> + swp_entry_t swp_entry;
>>
>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>> +
>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>>
>> count_vm_event(THP_SPLIT_PMD);
>>
>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>> return __split_huge_zero_page_pmd(vma, haddr, pmd);
>> }
>>
>> - pmd_migration = is_pmd_migration_entry(*pmd);
>> - if (unlikely(pmd_migration)) {
>> - swp_entry_t entry;
>>
>> + present = pmd_present(*pmd);
>> + if (is_pmd_migration_entry(*pmd)) {
>> old_pmd = *pmd;
>> - entry = pmd_to_swp_entry(old_pmd);
>> - page = pfn_swap_entry_to_page(entry);
>> - write = is_writable_migration_entry(entry);
>> + swp_entry = pmd_to_swp_entry(old_pmd);
>> + page = pfn_swap_entry_to_page(swp_entry);
>> + folio = page_folio(page);
>> +
>> + soft_dirty = pmd_swp_soft_dirty(old_pmd);
>> + uffd_wp = pmd_swp_uffd_wp(old_pmd);
>> +
>> + write = is_writable_migration_entry(swp_entry);
>> if (PageAnon(page))
>> - anon_exclusive = is_readable_exclusive_migration_entry(entry);
>> - young = is_migration_entry_young(entry);
>> - dirty = is_migration_entry_dirty(entry);
>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
>> + young = is_migration_entry_young(swp_entry);
>> + dirty = is_migration_entry_dirty(swp_entry);
>> + } else if (is_pmd_device_private_entry(*pmd)) {
>> + old_pmd = *pmd;
>> + swp_entry = pmd_to_swp_entry(old_pmd);
>> + page = pfn_swap_entry_to_page(swp_entry);
>> + folio = page_folio(page);
>> +
>> soft_dirty = pmd_swp_soft_dirty(old_pmd);
>> uffd_wp = pmd_swp_uffd_wp(old_pmd);
>> +
>> + write = is_writable_device_private_entry(swp_entry);
>> + anon_exclusive = PageAnonExclusive(page);
>> +
>> + if (freeze && anon_exclusive &&
>> + folio_try_share_anon_rmap_pmd(folio, page))
>> + freeze = false;
>
> Why is it OK to change the freeze request? OK, it is replicating
> the code for present PMD folios. Either add a comment to point
> to the explanation in the comment below, or move
> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below
> to deduplicate this code.
Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd()
should never fail.
>
>> + if (!freeze) {
>> + rmap_t rmap_flags = RMAP_NONE;
>> +
>> + folio_ref_add(folio, HPAGE_PMD_NR - 1);
>> + if (anon_exclusive)
>> + rmap_flags |= RMAP_EXCLUSIVE;
>> +
>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>> + vma, haddr, rmap_flags);
>> + }
>> } else {
>> /*
>> * Up to this point the pmd is present and huge and userland has
>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>> * Note that NUMA hinting access restrictions are not transferred to
>> * avoid any possibility of altering permissions across VMAs.
>> */
>> - if (freeze || pmd_migration) {
>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>> - pte_t entry;
>> - swp_entry_t swp_entry;
>> -
>> - if (write)
>> - swp_entry = make_writable_migration_entry(
>> - page_to_pfn(page + i));
>> - else if (anon_exclusive)
>> - swp_entry = make_readable_exclusive_migration_entry(
>> - page_to_pfn(page + i));
>> - else
>> - swp_entry = make_readable_migration_entry(
>> - page_to_pfn(page + i));
>> - if (young)
>> - swp_entry = make_migration_entry_young(swp_entry);
>> - if (dirty)
>> - swp_entry = make_migration_entry_dirty(swp_entry);
>> - entry = swp_entry_to_pte(swp_entry);
>> - if (soft_dirty)
>> - entry = pte_swp_mksoft_dirty(entry);
>> - if (uffd_wp)
>> - entry = pte_swp_mkuffd_wp(entry);
>> + if (freeze || !present) {
>> + pte_t entry;
>>
>> - VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>> - set_pte_at(mm, addr, pte + i, entry);
>> + if (freeze || is_migration_entry(swp_entry)) {
>>
> <snip>
>> + } else {
> <snip>
>> }
>> } else {
>> pte_t entry;
>
> David already pointed this out in v5. It can be done such as:
>
> if (freeze || pmd_migration) {
> ...
> } else if (is_pmd_device_private_entry(old_pmd)) {
> ...
No.. freeze can be true for device private entries as well
> } else {
> /* for present, non freeze case */
> }
>
>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>> }
>> pte_unmap(pte);
>>
>> - if (!pmd_migration)
>> + if (!is_pmd_migration_entry(*pmd))
>> folio_remove_rmap_pmd(folio, page, vma);
>> if (freeze)
>> put_page(page);
>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>> pmd_t *pmd, bool freeze)
>> {
>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
>> __split_huge_pmd_locked(vma, pmd, address, freeze);
>> }
>>
>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>> lockdep_assert_held(&lruvec->lru_lock);
>>
>> + if (folio_is_device_private(folio))
>> + return;
>> +
>> if (list) {
>> /* page reclaim is reclaiming a huge page */
>> VM_WARN_ON(folio_test_lru(folio));
>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>> if (nr_shmem_dropped)
>> shmem_uncharge(mapping->host, nr_shmem_dropped);
>>
>> - if (!ret && is_anon)
>> + if (!ret && is_anon && !folio_is_device_private(folio))
>> remap_flags = RMP_USE_SHARED_ZEROPAGE;
>> +
>
> You should remove this and add
>
> if (folio_is_device_private(folio))
> return false;
>
> in try_to_map_unused_to_zeropage(). Otherwise, no one would know
> device private folios need to be excluded from mapping unused to
> zero page.
>
I had that up to v2 and then David asked me to remove it. FYI, this
is the only call site for RMP_USE_SHARED_ZEROPAGE.
>> remap_page(folio, 1 << order, remap_flags);
>>
>> /*
>> --
>> 2.50.1
>
>
Thanks for the review
Balbir
On 22 Sep 2025, at 21:50, Balbir Singh wrote:
> On 9/23/25 07:09, Zi Yan wrote:
>> On 16 Sep 2025, at 8:21, Balbir Singh wrote:
>>
>>> Add support for splitting device-private THP folios, enabling fallback
>>> to smaller page sizes when large page allocation or migration fails.
>>>
>>> Key changes:
>>> - split_huge_pmd(): Handle device-private PMD entries during splitting
>>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
>>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>>> don't support shared zero page semantics
>>>
>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>>> Cc: David Hildenbrand <david@redhat.com>
>>> Cc: Zi Yan <ziy@nvidia.com>
>>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>>> Cc: Rakie Kim <rakie.kim@sk.com>
>>> Cc: Byungchul Park <byungchul@sk.com>
>>> Cc: Gregory Price <gourry@gourry.net>
>>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>>> Cc: Alistair Popple <apopple@nvidia.com>
>>> Cc: Oscar Salvador <osalvador@suse.de>
>>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>>> Cc: Nico Pache <npache@redhat.com>
>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>> Cc: Dev Jain <dev.jain@arm.com>
>>> Cc: Barry Song <baohua@kernel.org>
>>> Cc: Lyude Paul <lyude@redhat.com>
>>> Cc: Danilo Krummrich <dakr@kernel.org>
>>> Cc: David Airlie <airlied@gmail.com>
>>> Cc: Simona Vetter <simona@ffwll.ch>
>>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>>> Cc: Mika Penttilä <mpenttil@redhat.com>
>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>> ---
>>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
>>> 1 file changed, 98 insertions(+), 40 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 78166db72f4d..5291ee155a02 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> struct page *page;
>>> pgtable_t pgtable;
>>> pmd_t old_pmd, _pmd;
>>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>>> - bool anon_exclusive = false, dirty = false;
>>> + bool soft_dirty, uffd_wp = false, young = false, write = false;
>>> + bool anon_exclusive = false, dirty = false, present = false;
>>> unsigned long addr;
>>> pte_t *pte;
>>> int i;
>>> + swp_entry_t swp_entry;
>>>
>>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>>> +
>>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>>>
>>> count_vm_event(THP_SPLIT_PMD);
>>>
>>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>> }
>>>
>>> - pmd_migration = is_pmd_migration_entry(*pmd);
>>> - if (unlikely(pmd_migration)) {
>>> - swp_entry_t entry;
>>>
>>> + present = pmd_present(*pmd);
>>> + if (is_pmd_migration_entry(*pmd)) {
>>> old_pmd = *pmd;
>>> - entry = pmd_to_swp_entry(old_pmd);
>>> - page = pfn_swap_entry_to_page(entry);
>>> - write = is_writable_migration_entry(entry);
>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>> + page = pfn_swap_entry_to_page(swp_entry);
>>> + folio = page_folio(page);
>>> +
>>> + soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>> + uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>> +
>>> + write = is_writable_migration_entry(swp_entry);
>>> if (PageAnon(page))
>>> - anon_exclusive = is_readable_exclusive_migration_entry(entry);
>>> - young = is_migration_entry_young(entry);
>>> - dirty = is_migration_entry_dirty(entry);
>>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
>>> + young = is_migration_entry_young(swp_entry);
>>> + dirty = is_migration_entry_dirty(swp_entry);
>>> + } else if (is_pmd_device_private_entry(*pmd)) {
>>> + old_pmd = *pmd;
>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>> + page = pfn_swap_entry_to_page(swp_entry);
>>> + folio = page_folio(page);
>>> +
>>> soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>> uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>> +
>>> + write = is_writable_device_private_entry(swp_entry);
>>> + anon_exclusive = PageAnonExclusive(page);
>>> +
>>> + if (freeze && anon_exclusive &&
>>> + folio_try_share_anon_rmap_pmd(folio, page))
>>> + freeze = false;
>>
>> Why is it OK to change the freeze request? OK, it is replicating
>> the code for present PMD folios. Either add a comment to point
>> to the explanation in the comment below, or move
>> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below
>> to deduplicate this code.
>
> Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd()
> should never fail.
anon_exclusive = PageAnonExclusive(page);
if (freeze && anon_exclusive &&
    folio_try_share_anon_rmap_pmd(folio, page))
        freeze = false;
if (!freeze) {
        rmap_t rmap_flags = RMAP_NONE;

        folio_ref_add(folio, HPAGE_PMD_NR - 1);
        if (anon_exclusive)
                rmap_flags |= RMAP_EXCLUSIVE;
        folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
                                 vma, haddr, rmap_flags);
}
are the same for both device private and present. Can it be deduplicated
by doing below?
if (is_pmd_migration_entry(*pmd)) {
        ...
} else {
        if (is_pmd_device_private_entry(*pmd)) {
                ...
        } else if (pmd_present()) {
                ...
        }

        /* the above code */
}
If not, at least add a comment in the device private copy of the code
pointing to the present copy's comment.
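For concreteness, the deduplicated structure sketched above could read roughly
as below (the dotted bodies are the existing per-case setup; note that the
present-THP path today guards this block with folio_test_anon(folio)):

        if (is_pmd_migration_entry(*pmd)) {
                /* existing migration-entry setup */
        } else {
                if (is_pmd_device_private_entry(*pmd)) {
                        old_pmd = *pmd;
                        swp_entry = pmd_to_swp_entry(old_pmd);
                        page = pfn_swap_entry_to_page(swp_entry);
                        folio = page_folio(page);
                        soft_dirty = pmd_swp_soft_dirty(old_pmd);
                        uffd_wp = pmd_swp_uffd_wp(old_pmd);
                        write = is_writable_device_private_entry(swp_entry);
                } else {
                        /* present THP: pmdp_invalidate() and friends, as today */
                }

                /* shared anon-rmap handling for device-private and present */
                anon_exclusive = PageAnonExclusive(page);
                if (freeze && anon_exclusive &&
                    folio_try_share_anon_rmap_pmd(folio, page))
                        freeze = false;
                if (!freeze) {
                        rmap_t rmap_flags = RMAP_NONE;

                        folio_ref_add(folio, HPAGE_PMD_NR - 1);
                        if (anon_exclusive)
                                rmap_flags |= RMAP_EXCLUSIVE;
                        folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
                                                 vma, haddr, rmap_flags);
                }
        }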
>
>>
>>> + if (!freeze) {
>>> + rmap_t rmap_flags = RMAP_NONE;
>>> +
>>> + folio_ref_add(folio, HPAGE_PMD_NR - 1);
>>> + if (anon_exclusive)
>>> + rmap_flags |= RMAP_EXCLUSIVE;
>>> +
>>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>>> + vma, haddr, rmap_flags);
>>> + }
>>> } else {
>>> /*
>>> * Up to this point the pmd is present and huge and userland has
>>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> * Note that NUMA hinting access restrictions are not transferred to
>>> * avoid any possibility of altering permissions across VMAs.
>>> */
>>> - if (freeze || pmd_migration) {
>>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>> - pte_t entry;
>>> - swp_entry_t swp_entry;
>>> -
>>> - if (write)
>>> - swp_entry = make_writable_migration_entry(
>>> - page_to_pfn(page + i));
>>> - else if (anon_exclusive)
>>> - swp_entry = make_readable_exclusive_migration_entry(
>>> - page_to_pfn(page + i));
>>> - else
>>> - swp_entry = make_readable_migration_entry(
>>> - page_to_pfn(page + i));
>>> - if (young)
>>> - swp_entry = make_migration_entry_young(swp_entry);
>>> - if (dirty)
>>> - swp_entry = make_migration_entry_dirty(swp_entry);
>>> - entry = swp_entry_to_pte(swp_entry);
>>> - if (soft_dirty)
>>> - entry = pte_swp_mksoft_dirty(entry);
>>> - if (uffd_wp)
>>> - entry = pte_swp_mkuffd_wp(entry);
>>> + if (freeze || !present) {
>>> + pte_t entry;
>>>
>>> - VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>> - set_pte_at(mm, addr, pte + i, entry);
>>> + if (freeze || is_migration_entry(swp_entry)) {
>>>
>> <snip>
>>> + } else {
>> <snip>
>>> }
>>> } else {
>>> pte_t entry;
>>
>> David already pointed this out in v5. It can be done such as:
>>
>> if (freeze || pmd_migration) {
>> ...
>> } else if (is_pmd_device_private_entry(old_pmd)) {
>> ...
>
> No.. freeze can be true for device private entries as well
When freeze is true, a migration entry is installed in place of the
device private entry, since the "if (freeze || pmd_migration)"
branch is taken. This proposal is the same as your code. What is
the difference?
>
>> } else {
>> /* for present, non freeze case */
>> }
>>
>>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>> }
>>> pte_unmap(pte);
>>>
>>> - if (!pmd_migration)
>>> + if (!is_pmd_migration_entry(*pmd))
>>> folio_remove_rmap_pmd(folio, page, vma);
>>> if (freeze)
>>> put_page(page);
>>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>> pmd_t *pmd, bool freeze)
>>> {
>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
>>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
>>> __split_huge_pmd_locked(vma, pmd, address, freeze);
>>> }
>>>
>>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>>> lockdep_assert_held(&lruvec->lru_lock);
>>>
>>> + if (folio_is_device_private(folio))
>>> + return;
>>> +
>>> if (list) {
>>> /* page reclaim is reclaiming a huge page */
>>> VM_WARN_ON(folio_test_lru(folio));
>>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>> if (nr_shmem_dropped)
>>> shmem_uncharge(mapping->host, nr_shmem_dropped);
>>>
>>> - if (!ret && is_anon)
>>> + if (!ret && is_anon && !folio_is_device_private(folio))
>>> remap_flags = RMP_USE_SHARED_ZEROPAGE;
>>> +
>>
>> You should remove this and add
>>
>> if (folio_is_device_private(folio))
>> return false;
>>
>> in try_to_map_unused_to_zeropage(). Otherwise, no one would know
>> device private folios need to be excluded from mapping unused to
>> zero page.
>>
>
> I had that upto v2 and then David asked me to remove it. FYI, this
> is the only call site for RMP_USE_SHARED_ZEROPAGE
Can you provide a link?
Even if this is the only call site, there is no guarantee that
there will be none in the future. I am not sure why we want the caller
to handle this special case. Who is going to tell the next user of
RMP_USE_SHARED_ZEROPAGE, or the next caller of try_to_map_unused_to_zeropage(),
that device private folios are incompatible with them?
>
>>> remap_page(folio, 1 << order, remap_flags);
>>>
>>> /*
>>> --
>>> 2.50.1
>>
>>
>
> Thanks for the review
> Balbir
--
Best Regards,
Yan, Zi
On 9/23/25 12:09, Zi Yan wrote:
> On 22 Sep 2025, at 21:50, Balbir Singh wrote:
>
>> On 9/23/25 07:09, Zi Yan wrote:
>>> On 16 Sep 2025, at 8:21, Balbir Singh wrote:
>>>
>>>> Add support for splitting device-private THP folios, enabling fallback
>>>> to smaller page sizes when large page allocation or migration fails.
>>>>
>>>> Key changes:
>>>> - split_huge_pmd(): Handle device-private PMD entries during splitting
>>>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
>>>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>>>> don't support shared zero page semantics
>>>>
>>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>>>> Cc: David Hildenbrand <david@redhat.com>
>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>>>> Cc: Rakie Kim <rakie.kim@sk.com>
>>>> Cc: Byungchul Park <byungchul@sk.com>
>>>> Cc: Gregory Price <gourry@gourry.net>
>>>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>>>> Cc: Alistair Popple <apopple@nvidia.com>
>>>> Cc: Oscar Salvador <osalvador@suse.de>
>>>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>>>> Cc: Nico Pache <npache@redhat.com>
>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>> Cc: Dev Jain <dev.jain@arm.com>
>>>> Cc: Barry Song <baohua@kernel.org>
>>>> Cc: Lyude Paul <lyude@redhat.com>
>>>> Cc: Danilo Krummrich <dakr@kernel.org>
>>>> Cc: David Airlie <airlied@gmail.com>
>>>> Cc: Simona Vetter <simona@ffwll.ch>
>>>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>>>> Cc: Mika Penttilä <mpenttil@redhat.com>
>>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>>> ---
>>>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
>>>> 1 file changed, 98 insertions(+), 40 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 78166db72f4d..5291ee155a02 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>> struct page *page;
>>>> pgtable_t pgtable;
>>>> pmd_t old_pmd, _pmd;
>>>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>>>> - bool anon_exclusive = false, dirty = false;
>>>> + bool soft_dirty, uffd_wp = false, young = false, write = false;
>>>> + bool anon_exclusive = false, dirty = false, present = false;
>>>> unsigned long addr;
>>>> pte_t *pte;
>>>> int i;
>>>> + swp_entry_t swp_entry;
>>>>
>>>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>>>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>>>> +
>>>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>>>>
>>>> count_vm_event(THP_SPLIT_PMD);
>>>>
>>>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>> return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>>> }
>>>>
>>>> - pmd_migration = is_pmd_migration_entry(*pmd);
>>>> - if (unlikely(pmd_migration)) {
>>>> - swp_entry_t entry;
>>>>
>>>> + present = pmd_present(*pmd);
>>>> + if (is_pmd_migration_entry(*pmd)) {
>>>> old_pmd = *pmd;
>>>> - entry = pmd_to_swp_entry(old_pmd);
>>>> - page = pfn_swap_entry_to_page(entry);
>>>> - write = is_writable_migration_entry(entry);
>>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>>> + page = pfn_swap_entry_to_page(swp_entry);
>>>> + folio = page_folio(page);
>>>> +
>>>> + soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>> + uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>> +
>>>> + write = is_writable_migration_entry(swp_entry);
>>>> if (PageAnon(page))
>>>> - anon_exclusive = is_readable_exclusive_migration_entry(entry);
>>>> - young = is_migration_entry_young(entry);
>>>> - dirty = is_migration_entry_dirty(entry);
>>>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
>>>> + young = is_migration_entry_young(swp_entry);
>>>> + dirty = is_migration_entry_dirty(swp_entry);
>>>> + } else if (is_pmd_device_private_entry(*pmd)) {
>>>> + old_pmd = *pmd;
>>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>>> + page = pfn_swap_entry_to_page(swp_entry);
>>>> + folio = page_folio(page);
>>>> +
>>>> soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>> uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>> +
>>>> + write = is_writable_device_private_entry(swp_entry);
>>>> + anon_exclusive = PageAnonExclusive(page);
>>>> +
>>>> + if (freeze && anon_exclusive &&
>>>> + folio_try_share_anon_rmap_pmd(folio, page))
>>>> + freeze = false;
>>>
>>> Why is it OK to change the freeze request? OK, it is replicating
>>> the code for present PMD folios. Either add a comment to point
>>> to the explanation in the comment below, or move
>>> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below
>>> to deduplicate this code.
>>
>> Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd()
>> should never fail.
>
> anon_exclusive = PageAnonExclusive(page);
> if (freeze && anon_exclusive &&
> folio_try_share_anon_rmap_pmd(folio, page))
> freeze = false;
> if (!freeze) {
> rmap_t rmap_flags = RMAP_NONE;
>
> folio_ref_add(folio, HPAGE_PMD_NR - 1);
> if (anon_exclusive)
> rmap_flags |= RMAP_EXCLUSIVE;
> folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> vma, haddr, rmap_flags);
> }
>
> are the same for both device private and present. Can it be deduplicated
> by doing below?
>
> if (is_pmd_migration_entry(*pmd)) {
> ...
> } else {
> if (is_pmd_device_private_entry(*pmd)) {
> ...
> } else if (pmd_present()) {
> ...
> }
>
> /* the above code */
> }
>
> If not, at least adding a comment in the device private copy of the code
> pointing to the present copy's comment.
>
>>
>>>
>>>> + if (!freeze) {
>>>> + rmap_t rmap_flags = RMAP_NONE;
>>>> +
>>>> + folio_ref_add(folio, HPAGE_PMD_NR - 1);
>>>> + if (anon_exclusive)
>>>> + rmap_flags |= RMAP_EXCLUSIVE;
>>>> +
>>>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>>>> + vma, haddr, rmap_flags);
>>>> + }
>>>> } else {
>>>> /*
>>>> * Up to this point the pmd is present and huge and userland has
>>>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>> * Note that NUMA hinting access restrictions are not transferred to
>>>> * avoid any possibility of altering permissions across VMAs.
>>>> */
>>>> - if (freeze || pmd_migration) {
>>>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>> - pte_t entry;
>>>> - swp_entry_t swp_entry;
>>>> -
>>>> - if (write)
>>>> - swp_entry = make_writable_migration_entry(
>>>> - page_to_pfn(page + i));
>>>> - else if (anon_exclusive)
>>>> - swp_entry = make_readable_exclusive_migration_entry(
>>>> - page_to_pfn(page + i));
>>>> - else
>>>> - swp_entry = make_readable_migration_entry(
>>>> - page_to_pfn(page + i));
>>>> - if (young)
>>>> - swp_entry = make_migration_entry_young(swp_entry);
>>>> - if (dirty)
>>>> - swp_entry = make_migration_entry_dirty(swp_entry);
>>>> - entry = swp_entry_to_pte(swp_entry);
>>>> - if (soft_dirty)
>>>> - entry = pte_swp_mksoft_dirty(entry);
>>>> - if (uffd_wp)
>>>> - entry = pte_swp_mkuffd_wp(entry);
>>>> + if (freeze || !present) {
>>>> + pte_t entry;
>>>>
>>>> - VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>>> - set_pte_at(mm, addr, pte + i, entry);
>>>> + if (freeze || is_migration_entry(swp_entry)) {
>>>>
>>> <snip>
>>>> + } else {
>>> <snip>
>>>> }
>>>> } else {
>>>> pte_t entry;
>>>
>>> David already pointed this out in v5. It can be done such as:
>>>
>>> if (freeze || pmd_migration) {
>>> ...
>>> } else if (is_pmd_device_private_entry(old_pmd)) {
>>> ...
>>
>> No.. freeze can be true for device private entries as well
>
> When freeze is true, migration entry is installed in place of
> device private entry, since the "if (freeze || pmd_migration)"
> branch is taken. This proposal is same as your code. What is
> the difference?
>
I read the else-if incorrectly; I'll simplify.
>>
>>> } else {
>>> /* for present, non freeze case */
>>> }
>>>
>>>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>> }
>>>> pte_unmap(pte);
>>>>
>>>> - if (!pmd_migration)
>>>> + if (!is_pmd_migration_entry(*pmd))
>>>> folio_remove_rmap_pmd(folio, page, vma);
>>>> if (freeze)
>>>> put_page(page);
>>>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>>> pmd_t *pmd, bool freeze)
>>>> {
>>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>>>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
>>>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
>>>> __split_huge_pmd_locked(vma, pmd, address, freeze);
>>>> }
>>>>
>>>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>>>> lockdep_assert_held(&lruvec->lru_lock);
>>>>
>>>> + if (folio_is_device_private(folio))
>>>> + return;
>>>> +
>>>> if (list) {
>>>> /* page reclaim is reclaiming a huge page */
>>>> VM_WARN_ON(folio_test_lru(folio));
>>>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>> if (nr_shmem_dropped)
>>>> shmem_uncharge(mapping->host, nr_shmem_dropped);
>>>>
>>>> - if (!ret && is_anon)
>>>> + if (!ret && is_anon && !folio_is_device_private(folio))
>>>> remap_flags = RMP_USE_SHARED_ZEROPAGE;
>>>> +
>>>
>>> You should remove this and add
>>>
>>> if (folio_is_device_private(folio))
>>> return false;
>>>
>>> in try_to_map_unused_to_zeropage(). Otherwise, no one would know
>>> device private folios need to be excluded from mapping unused to
>>> zero page.
>>>
>>
>> I had that upto v2 and then David asked me to remove it. FYI, this
>> is the only call site for RMP_USE_SHARED_ZEROPAGE
>
> Can you provide a link?
>
Please see https://lore.kernel.org/linux-mm/20250306044239.3874247-3-balbirs@nvidia.com/T/
> Even if this is the only call site, there is no guarantee that
> there will be none in the future. I am not sure why we want caller
> to handle this special case. Who is going to tell the next user
> of RMP_USE_SHARED_ZEROPAGE or caller to try_to_map_unused_to_zeropage()
> that device private is incompatible with them?
>
I don't disagree, but the question was: why are device private pages even making
it to try_to_map_unused_to_zeropage()?
>>>> remap_page(folio, 1 << order, remap_flags);
>>>>
>>>> /*
>>>> --
>>>> 2.50.1
>>>
>>>
>>
>> Thanks for the review
>> Balbir
Thanks,
Balbir
On 23 Sep 2025, at 0:04, Balbir Singh wrote:
> On 9/23/25 12:09, Zi Yan wrote:
>> On 22 Sep 2025, at 21:50, Balbir Singh wrote:
>>
>>> On 9/23/25 07:09, Zi Yan wrote:
>>>> On 16 Sep 2025, at 8:21, Balbir Singh wrote:
>>>>
>>>>> Add support for splitting device-private THP folios, enabling fallback
>>>>> to smaller page sizes when large page allocation or migration fails.
>>>>>
>>>>> Key changes:
>>>>> - split_huge_pmd(): Handle device-private PMD entries during splitting
>>>>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
>>>>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>>>>> don't support shared zero page semantics
>>>>>
>>>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>>>>> Cc: David Hildenbrand <david@redhat.com>
>>>>> Cc: Zi Yan <ziy@nvidia.com>
>>>>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
>>>>> Cc: Rakie Kim <rakie.kim@sk.com>
>>>>> Cc: Byungchul Park <byungchul@sk.com>
>>>>> Cc: Gregory Price <gourry@gourry.net>
>>>>> Cc: Ying Huang <ying.huang@linux.alibaba.com>
>>>>> Cc: Alistair Popple <apopple@nvidia.com>
>>>>> Cc: Oscar Salvador <osalvador@suse.de>
>>>>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
>>>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
>>>>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
>>>>> Cc: Nico Pache <npache@redhat.com>
>>>>> Cc: Ryan Roberts <ryan.roberts@arm.com>
>>>>> Cc: Dev Jain <dev.jain@arm.com>
>>>>> Cc: Barry Song <baohua@kernel.org>
>>>>> Cc: Lyude Paul <lyude@redhat.com>
>>>>> Cc: Danilo Krummrich <dakr@kernel.org>
>>>>> Cc: David Airlie <airlied@gmail.com>
>>>>> Cc: Simona Vetter <simona@ffwll.ch>
>>>>> Cc: Ralph Campbell <rcampbell@nvidia.com>
>>>>> Cc: Mika Penttilä <mpenttil@redhat.com>
>>>>> Cc: Matthew Brost <matthew.brost@intel.com>
>>>>> Cc: Francois Dugast <francois.dugast@intel.com>
>>>>> ---
>>>>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
>>>>> 1 file changed, 98 insertions(+), 40 deletions(-)
>>>>>
>>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>>> index 78166db72f4d..5291ee155a02 100644
>>>>> --- a/mm/huge_memory.c
>>>>> +++ b/mm/huge_memory.c
>>>>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>> struct page *page;
>>>>> pgtable_t pgtable;
>>>>> pmd_t old_pmd, _pmd;
>>>>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>>>>> - bool anon_exclusive = false, dirty = false;
>>>>> + bool soft_dirty, uffd_wp = false, young = false, write = false;
>>>>> + bool anon_exclusive = false, dirty = false, present = false;
>>>>> unsigned long addr;
>>>>> pte_t *pte;
>>>>> int i;
>>>>> + swp_entry_t swp_entry;
>>>>>
>>>>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>>>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>>>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>>>>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>>>>> +
>>>>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
>>>>>
>>>>> count_vm_event(THP_SPLIT_PMD);
>>>>>
>>>>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>> return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>>>> }
>>>>>
>>>>> - pmd_migration = is_pmd_migration_entry(*pmd);
>>>>> - if (unlikely(pmd_migration)) {
>>>>> - swp_entry_t entry;
>>>>>
>>>>> + present = pmd_present(*pmd);
>>>>> + if (is_pmd_migration_entry(*pmd)) {
>>>>> old_pmd = *pmd;
>>>>> - entry = pmd_to_swp_entry(old_pmd);
>>>>> - page = pfn_swap_entry_to_page(entry);
>>>>> - write = is_writable_migration_entry(entry);
>>>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>>>> + page = pfn_swap_entry_to_page(swp_entry);
>>>>> + folio = page_folio(page);
>>>>> +
>>>>> + soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>> + uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>> +
>>>>> + write = is_writable_migration_entry(swp_entry);
>>>>> if (PageAnon(page))
>>>>> - anon_exclusive = is_readable_exclusive_migration_entry(entry);
>>>>> - young = is_migration_entry_young(entry);
>>>>> - dirty = is_migration_entry_dirty(entry);
>>>>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
>>>>> + young = is_migration_entry_young(swp_entry);
>>>>> + dirty = is_migration_entry_dirty(swp_entry);
>>>>> + } else if (is_pmd_device_private_entry(*pmd)) {
>>>>> + old_pmd = *pmd;
>>>>> + swp_entry = pmd_to_swp_entry(old_pmd);
>>>>> + page = pfn_swap_entry_to_page(swp_entry);
>>>>> + folio = page_folio(page);
>>>>> +
>>>>> soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>>> uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>>> +
>>>>> + write = is_writable_device_private_entry(swp_entry);
>>>>> + anon_exclusive = PageAnonExclusive(page);
>>>>> +
>>>>> + if (freeze && anon_exclusive &&
>>>>> + folio_try_share_anon_rmap_pmd(folio, page))
>>>>> + freeze = false;
>>>>
>>>> Why is it OK to change the freeze request? OK, it is replicating
>>>> the code for present PMD folios. Either add a comment to point
>>>> to the explanation in the comment below, or move
>>>> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below
>>>> to deduplicate this code.
>>>
>>> Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd()
>>> should never fail.
>>
>> anon_exclusive = PageAnonExclusive(page);
>> if (freeze && anon_exclusive &&
>> folio_try_share_anon_rmap_pmd(folio, page))
>> freeze = false;
>> if (!freeze) {
>> rmap_t rmap_flags = RMAP_NONE;
>>
>> folio_ref_add(folio, HPAGE_PMD_NR - 1);
>> if (anon_exclusive)
>> rmap_flags |= RMAP_EXCLUSIVE;
>> folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>> vma, haddr, rmap_flags);
>> }
>>
>> are the same for both device private and present. Can it be deduplicated
>> by doing below?
>>
>> if (is_pmd_migration_entry(*pmd)) {
>> ...
>> } else {
>> if (is_pmd_device_private_entry(*pmd)) {
>> ...
>> } else if (pmd_present()) {
>> ...
>> }
>>
>> /* the above code */
>> }
>>
>> If not, at least adding a comment in the device private copy of the code
>> pointing to the present copy's comment.
>>
>>>
>>>>
>>>>> + if (!freeze) {
>>>>> + rmap_t rmap_flags = RMAP_NONE;
>>>>> +
>>>>> + folio_ref_add(folio, HPAGE_PMD_NR - 1);
>>>>> + if (anon_exclusive)
>>>>> + rmap_flags |= RMAP_EXCLUSIVE;
>>>>> +
>>>>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>>>>> + vma, haddr, rmap_flags);
>>>>> + }
>>>>> } else {
>>>>> /*
>>>>> * Up to this point the pmd is present and huge and userland has
>>>>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>> * Note that NUMA hinting access restrictions are not transferred to
>>>>> * avoid any possibility of altering permissions across VMAs.
>>>>> */
>>>>> - if (freeze || pmd_migration) {
>>>>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>>>> - pte_t entry;
>>>>> - swp_entry_t swp_entry;
>>>>> -
>>>>> - if (write)
>>>>> - swp_entry = make_writable_migration_entry(
>>>>> - page_to_pfn(page + i));
>>>>> - else if (anon_exclusive)
>>>>> - swp_entry = make_readable_exclusive_migration_entry(
>>>>> - page_to_pfn(page + i));
>>>>> - else
>>>>> - swp_entry = make_readable_migration_entry(
>>>>> - page_to_pfn(page + i));
>>>>> - if (young)
>>>>> - swp_entry = make_migration_entry_young(swp_entry);
>>>>> - if (dirty)
>>>>> - swp_entry = make_migration_entry_dirty(swp_entry);
>>>>> - entry = swp_entry_to_pte(swp_entry);
>>>>> - if (soft_dirty)
>>>>> - entry = pte_swp_mksoft_dirty(entry);
>>>>> - if (uffd_wp)
>>>>> - entry = pte_swp_mkuffd_wp(entry);
>>>>> + if (freeze || !present) {
>>>>> + pte_t entry;
>>>>>
>>>>> - VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>>>> - set_pte_at(mm, addr, pte + i, entry);
>>>>> + if (freeze || is_migration_entry(swp_entry)) {
>>>>>
>>>> <snip>
>>>>> + } else {
>>>> <snip>
>>>>> }
>>>>> } else {
>>>>> pte_t entry;
>>>>
>>>> David already pointed this out in v5. It can be done such as:
>>>>
>>>> if (freeze || pmd_migration) {
>>>> ...
>>>> } else if (is_pmd_device_private_entry(old_pmd)) {
>>>> ...
>>>
>>> No.. freeze can be true for device private entries as well
>>
>> When freeze is true, migration entry is installed in place of
>> device private entry, since the "if (freeze || pmd_migration)"
>> branch is taken. This proposal is same as your code. What is
>> the difference?
>>
>
> I read the else if incorrectly, I'll simplify
>
>>>
>>>> } else {
>>>> /* for present, non freeze case */
>>>> }
>>>>
>>>>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>>> }
>>>>> pte_unmap(pte);
>>>>>
>>>>> - if (!pmd_migration)
>>>>> + if (!is_pmd_migration_entry(*pmd))
>>>>> folio_remove_rmap_pmd(folio, page, vma);
>>>>> if (freeze)
>>>>> put_page(page);
>>>>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>>>> pmd_t *pmd, bool freeze)
>>>>> {
>>>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
>>>>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
>>>>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
>>>>> __split_huge_pmd_locked(vma, pmd, address, freeze);
>>>>> }
>>>>>
>>>>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
>>>>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
>>>>> lockdep_assert_held(&lruvec->lru_lock);
>>>>>
>>>>> + if (folio_is_device_private(folio))
>>>>> + return;
>>>>> +
>>>>> if (list) {
>>>>> /* page reclaim is reclaiming a huge page */
>>>>> VM_WARN_ON(folio_test_lru(folio));
>>>>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
>>>>> if (nr_shmem_dropped)
>>>>> shmem_uncharge(mapping->host, nr_shmem_dropped);
>>>>>
>>>>> - if (!ret && is_anon)
>>>>> + if (!ret && is_anon && !folio_is_device_private(folio))
>>>>> remap_flags = RMP_USE_SHARED_ZEROPAGE;
>>>>> +
>>>>
>>>> You should remove this and add
>>>>
>>>> if (folio_is_device_private(folio))
>>>> return false;
>>>>
>>>> in try_to_map_unused_to_zeropage(). Otherwise, no one would know
>>>> device private folios need to be excluded from mapping unused to
>>>> zero page.
>>>>
>>>
>>> I had that upto v2 and then David asked me to remove it. FYI, this
>>> is the only call site for RMP_USE_SHARED_ZEROPAGE
>>
>> Can you provide a link?
>>
>
> Please see https://lore.kernel.org/linux-mm/20250306044239.3874247-3-balbirs@nvidia.com/T/
I do not see any comment there asking to remove the device private folio
check in try_to_map_unused_to_zeropage(). Can you try again?
>
>> Even if this is the only call site, there is no guarantee that
>> there will be none in the future. I am not sure why we want caller
>> to handle this special case. Who is going to tell the next user
>> of RMP_USE_SHARED_ZEROPAGE or caller to try_to_map_unused_to_zeropage()
>> that device private is incompatible with them?
>>
>
> I don't disagree, but the question was: why are device private pages even making
> it to try_to_map_unused_to_zeropage()?
Then, it could be done in remove_migration_pte():
if (rmap_walk_arg->map_unused_to_zeropage &&
    !folio_is_device_private(folio) &&
    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
        continue;
Maybe I am too hung up on this and someone else could pat me on the back and
tell me it is OK to just do this at the only caller instead. :)
>>>>> remap_page(folio, 1 << order, remap_flags);
>>>>>
>>>>> /*
>>>>> --
>>>>> 2.50.1
>>>>
>>>>
>>>
>>> Thanks for the review
>>> Balbir
>
> Thanks,
> Balbir
Best Regards,
Yan, Zi