Add support for splitting device-private THP folios, enabling fallback
to smaller page sizes when large page allocation or migration fails.
Key changes:
- split_huge_pmd(): Handle device-private PMD entries during splitting
- Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
- Skip RMP_USE_SHARED_ZEROPAGE for device-private folios, since
  device-private memory cannot be remapped to the shared zeropage
Signed-off-by: Balbir Singh <balbirs@nvidia.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>
---
mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++--------------
1 file changed, 98 insertions(+), 40 deletions(-)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 78166db72f4d..5291ee155a02 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
- bool anon_exclusive = false, dirty = false;
+ bool soft_dirty, uffd_wp = false, young = false, write = false;
+ bool anon_exclusive = false, dirty = false, present = false;
unsigned long addr;
pte_t *pte;
int i;
+ swp_entry_t swp_entry;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
- VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+ VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd));
count_vm_event(THP_SPLIT_PMD);
@@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
return __split_huge_zero_page_pmd(vma, haddr, pmd);
}
- pmd_migration = is_pmd_migration_entry(*pmd);
- if (unlikely(pmd_migration)) {
- swp_entry_t entry;
+ present = pmd_present(*pmd);
+ if (is_pmd_migration_entry(*pmd)) {
old_pmd = *pmd;
- entry = pmd_to_swp_entry(old_pmd);
- page = pfn_swap_entry_to_page(entry);
- write = is_writable_migration_entry(entry);
+ swp_entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_swap_entry_to_page(swp_entry);
+ folio = page_folio(page);
+
+ soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = is_writable_migration_entry(swp_entry);
if (PageAnon(page))
- anon_exclusive = is_readable_exclusive_migration_entry(entry);
- young = is_migration_entry_young(entry);
- dirty = is_migration_entry_dirty(entry);
+ anon_exclusive = is_readable_exclusive_migration_entry(swp_entry);
+ young = is_migration_entry_young(swp_entry);
+ dirty = is_migration_entry_dirty(swp_entry);
+ } else if (is_pmd_device_private_entry(*pmd)) {
+ old_pmd = *pmd;
+ swp_entry = pmd_to_swp_entry(old_pmd);
+ page = pfn_swap_entry_to_page(swp_entry);
+ folio = page_folio(page);
+
soft_dirty = pmd_swp_soft_dirty(old_pmd);
uffd_wp = pmd_swp_uffd_wp(old_pmd);
+
+ write = is_writable_device_private_entry(swp_entry);
+ anon_exclusive = PageAnonExclusive(page);
+
+ if (freeze && anon_exclusive &&
+ folio_try_share_anon_rmap_pmd(folio, page))
+ freeze = false;
+ if (!freeze) {
+ rmap_t rmap_flags = RMAP_NONE;
+
+ folio_ref_add(folio, HPAGE_PMD_NR - 1);
+ if (anon_exclusive)
+ rmap_flags |= RMAP_EXCLUSIVE;
+
+ folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+ vma, haddr, rmap_flags);
+ }
} else {
/*
* Up to this point the pmd is present and huge and userland has
@@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* Note that NUMA hinting access restrictions are not transferred to
* avoid any possibility of altering permissions across VMAs.
*/
- if (freeze || pmd_migration) {
- for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
- pte_t entry;
- swp_entry_t swp_entry;
-
- if (write)
- swp_entry = make_writable_migration_entry(
- page_to_pfn(page + i));
- else if (anon_exclusive)
- swp_entry = make_readable_exclusive_migration_entry(
- page_to_pfn(page + i));
- else
- swp_entry = make_readable_migration_entry(
- page_to_pfn(page + i));
- if (young)
- swp_entry = make_migration_entry_young(swp_entry);
- if (dirty)
- swp_entry = make_migration_entry_dirty(swp_entry);
- entry = swp_entry_to_pte(swp_entry);
- if (soft_dirty)
- entry = pte_swp_mksoft_dirty(entry);
- if (uffd_wp)
- entry = pte_swp_mkuffd_wp(entry);
+ if (freeze || !present) {
+ pte_t entry;
- VM_WARN_ON(!pte_none(ptep_get(pte + i)));
- set_pte_at(mm, addr, pte + i, entry);
+ if (freeze || is_migration_entry(swp_entry)) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ if (write)
+ swp_entry = make_writable_migration_entry(
+ page_to_pfn(page + i));
+ else if (anon_exclusive)
+ swp_entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_migration_entry(
+ page_to_pfn(page + i));
+ if (young)
+ swp_entry = make_migration_entry_young(swp_entry);
+ if (dirty)
+ swp_entry = make_migration_entry_dirty(swp_entry);
+
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
+ } else {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
+ /*
+ * anon_exclusive was already propagated to the relevant
+ * pages corresponding to the pte entries when freeze
+ * is false.
+ */
+ if (write)
+ swp_entry = make_writable_device_private_entry(
+ page_to_pfn(page + i));
+ else
+ swp_entry = make_readable_device_private_entry(
+ page_to_pfn(page + i));
+ /*
+ * Young and dirty bits are not propagated via swp_entry
+ */
+ entry = swp_entry_to_pte(swp_entry);
+ if (soft_dirty)
+ entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
+ VM_WARN_ON(!pte_none(ptep_get(pte + i)));
+ set_pte_at(mm, addr, pte + i, entry);
+ }
}
} else {
pte_t entry;
@@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
pte_unmap(pte);
- if (!pmd_migration)
+ if (!is_pmd_migration_entry(*pmd))
folio_remove_rmap_pmd(folio, page, vma);
if (freeze)
put_page(page);
@@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, bool freeze)
{
VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
- if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+ if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd))
__split_huge_pmd_locked(vma, pmd, address, freeze);
}
@@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
lockdep_assert_held(&lruvec->lru_lock);
+ if (folio_is_device_private(folio))
+ return;
+
if (list) {
/* page reclaim is reclaiming a huge page */
VM_WARN_ON(folio_test_lru(folio));
@@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (nr_shmem_dropped)
shmem_uncharge(mapping->host, nr_shmem_dropped);
- if (!ret && is_anon)
+ if (!ret && is_anon && !folio_is_device_private(folio))
remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
remap_page(folio, 1 << order, remap_flags);
/*
--
2.50.1
On 16 Sep 2025, at 8:21, Balbir Singh wrote: > Add support for splitting device-private THP folios, enabling fallback > to smaller page sizes when large page allocation or migration fails. > > Key changes: > - split_huge_pmd(): Handle device-private PMD entries during splitting > - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios > - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they > don't support shared zero page semantics > > Signed-off-by: Balbir Singh <balbirs@nvidia.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Zi Yan <ziy@nvidia.com> > Cc: Joshua Hahn <joshua.hahnjy@gmail.com> > Cc: Rakie Kim <rakie.kim@sk.com> > Cc: Byungchul Park <byungchul@sk.com> > Cc: Gregory Price <gourry@gourry.net> > Cc: Ying Huang <ying.huang@linux.alibaba.com> > Cc: Alistair Popple <apopple@nvidia.com> > Cc: Oscar Salvador <osalvador@suse.de> > Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > Cc: Baolin Wang <baolin.wang@linux.alibaba.com> > Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> > Cc: Nico Pache <npache@redhat.com> > Cc: Ryan Roberts <ryan.roberts@arm.com> > Cc: Dev Jain <dev.jain@arm.com> > Cc: Barry Song <baohua@kernel.org> > Cc: Lyude Paul <lyude@redhat.com> > Cc: Danilo Krummrich <dakr@kernel.org> > Cc: David Airlie <airlied@gmail.com> > Cc: Simona Vetter <simona@ffwll.ch> > Cc: Ralph Campbell <rcampbell@nvidia.com> > Cc: Mika Penttilä <mpenttil@redhat.com> > Cc: Matthew Brost <matthew.brost@intel.com> > Cc: Francois Dugast <francois.dugast@intel.com> > --- > mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++-------------- > 1 file changed, 98 insertions(+), 40 deletions(-) > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > index 78166db72f4d..5291ee155a02 100644 > --- a/mm/huge_memory.c > +++ b/mm/huge_memory.c > @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > struct page *page; > pgtable_t pgtable; > pmd_t old_pmd, _pmd; > - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; > - bool anon_exclusive = false, dirty = false; > + bool soft_dirty, uffd_wp = false, young = false, write = false; > + bool anon_exclusive = false, dirty = false, present = false; > unsigned long addr; > pte_t *pte; > int i; > + swp_entry_t swp_entry; > > VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); > VM_BUG_ON_VMA(vma->vm_start > haddr, vma); > VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); > - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); > + > + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); > > count_vm_event(THP_SPLIT_PMD); > > @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > return __split_huge_zero_page_pmd(vma, haddr, pmd); > } > > - pmd_migration = is_pmd_migration_entry(*pmd); > - if (unlikely(pmd_migration)) { > - swp_entry_t entry; > > + present = pmd_present(*pmd); > + if (is_pmd_migration_entry(*pmd)) { > old_pmd = *pmd; > - entry = pmd_to_swp_entry(old_pmd); > - page = pfn_swap_entry_to_page(entry); > - write = is_writable_migration_entry(entry); > + swp_entry = pmd_to_swp_entry(old_pmd); > + page = pfn_swap_entry_to_page(swp_entry); > + folio = page_folio(page); > + > + soft_dirty = pmd_swp_soft_dirty(old_pmd); > + uffd_wp = pmd_swp_uffd_wp(old_pmd); > + > + write = is_writable_migration_entry(swp_entry); > if (PageAnon(page)) > - anon_exclusive = is_readable_exclusive_migration_entry(entry); > - young = is_migration_entry_young(entry); > - dirty = 
is_migration_entry_dirty(entry); > + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry); > + young = is_migration_entry_young(swp_entry); > + dirty = is_migration_entry_dirty(swp_entry); > + } else if (is_pmd_device_private_entry(*pmd)) { > + old_pmd = *pmd; > + swp_entry = pmd_to_swp_entry(old_pmd); > + page = pfn_swap_entry_to_page(swp_entry); > + folio = page_folio(page); > + > soft_dirty = pmd_swp_soft_dirty(old_pmd); > uffd_wp = pmd_swp_uffd_wp(old_pmd); > + > + write = is_writable_device_private_entry(swp_entry); > + anon_exclusive = PageAnonExclusive(page); > + > + if (freeze && anon_exclusive && > + folio_try_share_anon_rmap_pmd(folio, page)) > + freeze = false; Why is it OK to change the freeze request? OK, it is replicating the code for present PMD folios. Either add a comment to point to the explanation in the comment below, or move “if (is_pmd_device_private_entry(*pmd))“ branch in the else below to deduplicate this code. > + if (!freeze) { > + rmap_t rmap_flags = RMAP_NONE; > + > + folio_ref_add(folio, HPAGE_PMD_NR - 1); > + if (anon_exclusive) > + rmap_flags |= RMAP_EXCLUSIVE; > + > + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, > + vma, haddr, rmap_flags); > + } > } else { > /* > * Up to this point the pmd is present and huge and userland has > @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > * Note that NUMA hinting access restrictions are not transferred to > * avoid any possibility of altering permissions across VMAs. > */ > - if (freeze || pmd_migration) { > - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { > - pte_t entry; > - swp_entry_t swp_entry; > - > - if (write) > - swp_entry = make_writable_migration_entry( > - page_to_pfn(page + i)); > - else if (anon_exclusive) > - swp_entry = make_readable_exclusive_migration_entry( > - page_to_pfn(page + i)); > - else > - swp_entry = make_readable_migration_entry( > - page_to_pfn(page + i)); > - if (young) > - swp_entry = make_migration_entry_young(swp_entry); > - if (dirty) > - swp_entry = make_migration_entry_dirty(swp_entry); > - entry = swp_entry_to_pte(swp_entry); > - if (soft_dirty) > - entry = pte_swp_mksoft_dirty(entry); > - if (uffd_wp) > - entry = pte_swp_mkuffd_wp(entry); > + if (freeze || !present) { > + pte_t entry; > > - VM_WARN_ON(!pte_none(ptep_get(pte + i))); > - set_pte_at(mm, addr, pte + i, entry); > + if (freeze || is_migration_entry(swp_entry)) { > <snip> > + } else { <snip> > } > } else { > pte_t entry; David already pointed this out in v5. It can be done such as: if (freeze || pmd_migration) { ... } else if (is_pmd_device_private_entry(old_pmd)) { ... 
} else { /* for present, non freeze case */ } > @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, > } > pte_unmap(pte); > > - if (!pmd_migration) > + if (!is_pmd_migration_entry(*pmd)) > folio_remove_rmap_pmd(folio, page, vma); > if (freeze) > put_page(page); > @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, > pmd_t *pmd, bool freeze) > { > VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); > - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) > + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) > __split_huge_pmd_locked(vma, pmd, address, freeze); > } > > @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, > VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); > lockdep_assert_held(&lruvec->lru_lock); > > + if (folio_is_device_private(folio)) > + return; > + > if (list) { > /* page reclaim is reclaiming a huge page */ > VM_WARN_ON(folio_test_lru(folio)); > @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, > if (nr_shmem_dropped) > shmem_uncharge(mapping->host, nr_shmem_dropped); > > - if (!ret && is_anon) > + if (!ret && is_anon && !folio_is_device_private(folio)) > remap_flags = RMP_USE_SHARED_ZEROPAGE; > + You should remove this and add if (folio_is_device_private(folio)) return false; in try_to_map_unused_to_zeropage(). Otherwise, no one would know device private folios need to be excluded from mapping unused to zero page. > remap_page(folio, 1 << order, remap_flags); > > /* > -- > 2.50.1 Best Regards, Yan, Zi
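To make the suggestion concrete: a minimal sketch of the check placed inside try_to_map_unused_to_zeropage() in mm/migrate.c (illustrative only, not part of the posted patch; it assumes the helper's current (&pvmw, folio, idx) signature and elides the rest of the body):

	static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
						  struct folio *folio,
						  unsigned long idx)
	{
		/*
		 * Device-private memory is not CPU-addressable, so it can be
		 * neither scanned for zeroes nor replaced by the shared
		 * zeropage.
		 */
		if (folio_is_device_private(folio))
			return false;

		/* ... existing checks and zeropage PTE installation ... */
	}

Placing the check in the helper keeps the device-private restriction next to the zeropage logic itself rather than with each caller, which is the maintainability argument being made here.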
On 9/23/25 07:09, Zi Yan wrote: > On 16 Sep 2025, at 8:21, Balbir Singh wrote: > >> Add support for splitting device-private THP folios, enabling fallback >> to smaller page sizes when large page allocation or migration fails. >> >> Key changes: >> - split_huge_pmd(): Handle device-private PMD entries during splitting >> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios >> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they >> don't support shared zero page semantics >> >> Signed-off-by: Balbir Singh <balbirs@nvidia.com> >> Cc: David Hildenbrand <david@redhat.com> >> Cc: Zi Yan <ziy@nvidia.com> >> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> >> Cc: Rakie Kim <rakie.kim@sk.com> >> Cc: Byungchul Park <byungchul@sk.com> >> Cc: Gregory Price <gourry@gourry.net> >> Cc: Ying Huang <ying.huang@linux.alibaba.com> >> Cc: Alistair Popple <apopple@nvidia.com> >> Cc: Oscar Salvador <osalvador@suse.de> >> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> >> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> >> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> >> Cc: Nico Pache <npache@redhat.com> >> Cc: Ryan Roberts <ryan.roberts@arm.com> >> Cc: Dev Jain <dev.jain@arm.com> >> Cc: Barry Song <baohua@kernel.org> >> Cc: Lyude Paul <lyude@redhat.com> >> Cc: Danilo Krummrich <dakr@kernel.org> >> Cc: David Airlie <airlied@gmail.com> >> Cc: Simona Vetter <simona@ffwll.ch> >> Cc: Ralph Campbell <rcampbell@nvidia.com> >> Cc: Mika Penttilä <mpenttil@redhat.com> >> Cc: Matthew Brost <matthew.brost@intel.com> >> Cc: Francois Dugast <francois.dugast@intel.com> >> --- >> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++-------------- >> 1 file changed, 98 insertions(+), 40 deletions(-) >> >> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >> index 78166db72f4d..5291ee155a02 100644 >> --- a/mm/huge_memory.c >> +++ b/mm/huge_memory.c >> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> struct page *page; >> pgtable_t pgtable; >> pmd_t old_pmd, _pmd; >> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; >> - bool anon_exclusive = false, dirty = false; >> + bool soft_dirty, uffd_wp = false, young = false, write = false; >> + bool anon_exclusive = false, dirty = false, present = false; >> unsigned long addr; >> pte_t *pte; >> int i; >> + swp_entry_t swp_entry; >> >> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); >> VM_BUG_ON_VMA(vma->vm_start > haddr, vma); >> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); >> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); >> + >> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); >> >> count_vm_event(THP_SPLIT_PMD); >> >> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> return __split_huge_zero_page_pmd(vma, haddr, pmd); >> } >> >> - pmd_migration = is_pmd_migration_entry(*pmd); >> - if (unlikely(pmd_migration)) { >> - swp_entry_t entry; >> >> + present = pmd_present(*pmd); >> + if (is_pmd_migration_entry(*pmd)) { >> old_pmd = *pmd; >> - entry = pmd_to_swp_entry(old_pmd); >> - page = pfn_swap_entry_to_page(entry); >> - write = is_writable_migration_entry(entry); >> + swp_entry = pmd_to_swp_entry(old_pmd); >> + page = pfn_swap_entry_to_page(swp_entry); >> + folio = page_folio(page); >> + >> + soft_dirty = pmd_swp_soft_dirty(old_pmd); >> + uffd_wp = pmd_swp_uffd_wp(old_pmd); >> + >> + write = is_writable_migration_entry(swp_entry); >> if (PageAnon(page)) >> - anon_exclusive = 
is_readable_exclusive_migration_entry(entry); >> - young = is_migration_entry_young(entry); >> - dirty = is_migration_entry_dirty(entry); >> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry); >> + young = is_migration_entry_young(swp_entry); >> + dirty = is_migration_entry_dirty(swp_entry); >> + } else if (is_pmd_device_private_entry(*pmd)) { >> + old_pmd = *pmd; >> + swp_entry = pmd_to_swp_entry(old_pmd); >> + page = pfn_swap_entry_to_page(swp_entry); >> + folio = page_folio(page); >> + >> soft_dirty = pmd_swp_soft_dirty(old_pmd); >> uffd_wp = pmd_swp_uffd_wp(old_pmd); >> + >> + write = is_writable_device_private_entry(swp_entry); >> + anon_exclusive = PageAnonExclusive(page); >> + >> + if (freeze && anon_exclusive && >> + folio_try_share_anon_rmap_pmd(folio, page)) >> + freeze = false; > > Why is it OK to change the freeze request? OK, it is replicating > the code for present PMD folios. Either add a comment to point > to the explanation in the comment below, or move > “if (is_pmd_device_private_entry(*pmd))“ branch in the else below > to deduplicate this code. Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd() should never fail. > >> + if (!freeze) { >> + rmap_t rmap_flags = RMAP_NONE; >> + >> + folio_ref_add(folio, HPAGE_PMD_NR - 1); >> + if (anon_exclusive) >> + rmap_flags |= RMAP_EXCLUSIVE; >> + >> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >> + vma, haddr, rmap_flags); >> + } >> } else { >> /* >> * Up to this point the pmd is present and huge and userland has >> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> * Note that NUMA hinting access restrictions are not transferred to >> * avoid any possibility of altering permissions across VMAs. >> */ >> - if (freeze || pmd_migration) { >> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { >> - pte_t entry; >> - swp_entry_t swp_entry; >> - >> - if (write) >> - swp_entry = make_writable_migration_entry( >> - page_to_pfn(page + i)); >> - else if (anon_exclusive) >> - swp_entry = make_readable_exclusive_migration_entry( >> - page_to_pfn(page + i)); >> - else >> - swp_entry = make_readable_migration_entry( >> - page_to_pfn(page + i)); >> - if (young) >> - swp_entry = make_migration_entry_young(swp_entry); >> - if (dirty) >> - swp_entry = make_migration_entry_dirty(swp_entry); >> - entry = swp_entry_to_pte(swp_entry); >> - if (soft_dirty) >> - entry = pte_swp_mksoft_dirty(entry); >> - if (uffd_wp) >> - entry = pte_swp_mkuffd_wp(entry); >> + if (freeze || !present) { >> + pte_t entry; >> >> - VM_WARN_ON(!pte_none(ptep_get(pte + i))); >> - set_pte_at(mm, addr, pte + i, entry); >> + if (freeze || is_migration_entry(swp_entry)) { >> > <snip> >> + } else { > <snip> >> } >> } else { >> pte_t entry; > > David already pointed this out in v5. It can be done such as: > > if (freeze || pmd_migration) { > ... > } else if (is_pmd_device_private_entry(old_pmd)) { > ... No.. 
freeze can be true for device private entries as well > } else { > /* for present, non freeze case */ > } > >> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >> } >> pte_unmap(pte); >> >> - if (!pmd_migration) >> + if (!is_pmd_migration_entry(*pmd)) >> folio_remove_rmap_pmd(folio, page, vma); >> if (freeze) >> put_page(page); >> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, >> pmd_t *pmd, bool freeze) >> { >> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); >> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) >> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) >> __split_huge_pmd_locked(vma, pmd, address, freeze); >> } >> >> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); >> lockdep_assert_held(&lruvec->lru_lock); >> >> + if (folio_is_device_private(folio)) >> + return; >> + >> if (list) { >> /* page reclaim is reclaiming a huge page */ >> VM_WARN_ON(folio_test_lru(folio)); >> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >> if (nr_shmem_dropped) >> shmem_uncharge(mapping->host, nr_shmem_dropped); >> >> - if (!ret && is_anon) >> + if (!ret && is_anon && !folio_is_device_private(folio)) >> remap_flags = RMP_USE_SHARED_ZEROPAGE; >> + > > You should remove this and add > > if (folio_is_device_private(folio)) > return false; > > in try_to_map_unused_to_zeropage(). Otherwise, no one would know > device private folios need to be excluded from mapping unused to > zero page. > I had that upto v2 and then David asked me to remove it. FYI, this is the only call site for RMP_USE_SHARED_ZEROPAGE >> remap_page(folio, 1 << order, remap_flags); >> >> /* >> -- >> 2.50.1 > > Thanks for the review Balbir
On 22 Sep 2025, at 21:50, Balbir Singh wrote: > On 9/23/25 07:09, Zi Yan wrote: >> On 16 Sep 2025, at 8:21, Balbir Singh wrote: >> >>> Add support for splitting device-private THP folios, enabling fallback >>> to smaller page sizes when large page allocation or migration fails. >>> >>> Key changes: >>> - split_huge_pmd(): Handle device-private PMD entries during splitting >>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios >>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they >>> don't support shared zero page semantics >>> >>> Signed-off-by: Balbir Singh <balbirs@nvidia.com> >>> Cc: David Hildenbrand <david@redhat.com> >>> Cc: Zi Yan <ziy@nvidia.com> >>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> >>> Cc: Rakie Kim <rakie.kim@sk.com> >>> Cc: Byungchul Park <byungchul@sk.com> >>> Cc: Gregory Price <gourry@gourry.net> >>> Cc: Ying Huang <ying.huang@linux.alibaba.com> >>> Cc: Alistair Popple <apopple@nvidia.com> >>> Cc: Oscar Salvador <osalvador@suse.de> >>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> >>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> >>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> >>> Cc: Nico Pache <npache@redhat.com> >>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>> Cc: Dev Jain <dev.jain@arm.com> >>> Cc: Barry Song <baohua@kernel.org> >>> Cc: Lyude Paul <lyude@redhat.com> >>> Cc: Danilo Krummrich <dakr@kernel.org> >>> Cc: David Airlie <airlied@gmail.com> >>> Cc: Simona Vetter <simona@ffwll.ch> >>> Cc: Ralph Campbell <rcampbell@nvidia.com> >>> Cc: Mika Penttilä <mpenttil@redhat.com> >>> Cc: Matthew Brost <matthew.brost@intel.com> >>> Cc: Francois Dugast <francois.dugast@intel.com> >>> --- >>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++-------------- >>> 1 file changed, 98 insertions(+), 40 deletions(-) >>> >>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>> index 78166db72f4d..5291ee155a02 100644 >>> --- a/mm/huge_memory.c >>> +++ b/mm/huge_memory.c >>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>> struct page *page; >>> pgtable_t pgtable; >>> pmd_t old_pmd, _pmd; >>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; >>> - bool anon_exclusive = false, dirty = false; >>> + bool soft_dirty, uffd_wp = false, young = false, write = false; >>> + bool anon_exclusive = false, dirty = false, present = false; >>> unsigned long addr; >>> pte_t *pte; >>> int i; >>> + swp_entry_t swp_entry; >>> >>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); >>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma); >>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); >>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); >>> + >>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); >>> >>> count_vm_event(THP_SPLIT_PMD); >>> >>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>> return __split_huge_zero_page_pmd(vma, haddr, pmd); >>> } >>> >>> - pmd_migration = is_pmd_migration_entry(*pmd); >>> - if (unlikely(pmd_migration)) { >>> - swp_entry_t entry; >>> >>> + present = pmd_present(*pmd); >>> + if (is_pmd_migration_entry(*pmd)) { >>> old_pmd = *pmd; >>> - entry = pmd_to_swp_entry(old_pmd); >>> - page = pfn_swap_entry_to_page(entry); >>> - write = is_writable_migration_entry(entry); >>> + swp_entry = pmd_to_swp_entry(old_pmd); >>> + page = pfn_swap_entry_to_page(swp_entry); >>> + folio = page_folio(page); >>> + >>> + soft_dirty = pmd_swp_soft_dirty(old_pmd); >>> + uffd_wp = 
pmd_swp_uffd_wp(old_pmd); >>> + >>> + write = is_writable_migration_entry(swp_entry); >>> if (PageAnon(page)) >>> - anon_exclusive = is_readable_exclusive_migration_entry(entry); >>> - young = is_migration_entry_young(entry); >>> - dirty = is_migration_entry_dirty(entry); >>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry); >>> + young = is_migration_entry_young(swp_entry); >>> + dirty = is_migration_entry_dirty(swp_entry); >>> + } else if (is_pmd_device_private_entry(*pmd)) { >>> + old_pmd = *pmd; >>> + swp_entry = pmd_to_swp_entry(old_pmd); >>> + page = pfn_swap_entry_to_page(swp_entry); >>> + folio = page_folio(page); >>> + >>> soft_dirty = pmd_swp_soft_dirty(old_pmd); >>> uffd_wp = pmd_swp_uffd_wp(old_pmd); >>> + >>> + write = is_writable_device_private_entry(swp_entry); >>> + anon_exclusive = PageAnonExclusive(page); >>> + >>> + if (freeze && anon_exclusive && >>> + folio_try_share_anon_rmap_pmd(folio, page)) >>> + freeze = false; >> >> Why is it OK to change the freeze request? OK, it is replicating >> the code for present PMD folios. Either add a comment to point >> to the explanation in the comment below, or move >> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below >> to deduplicate this code. > > Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd() > should never fail. anon_exclusive = PageAnonExclusive(page); if (freeze && anon_exclusive && folio_try_share_anon_rmap_pmd(folio, page)) freeze = false; if (!freeze) { rmap_t rmap_flags = RMAP_NONE; folio_ref_add(folio, HPAGE_PMD_NR - 1); if (anon_exclusive) rmap_flags |= RMAP_EXCLUSIVE; folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, vma, haddr, rmap_flags); } are the same for both device private and present. Can it be deduplicated by doing below? if (is_pmd_migration_entry(*pmd)) { ... } else { if (is_pmd_device_private_entry(*pmd)) { ... } else if (pmd_present()) { ... } /* the above code */ } If not, at least adding a comment in the device private copy of the code pointing to the present copy's comment. > >> >>> + if (!freeze) { >>> + rmap_t rmap_flags = RMAP_NONE; >>> + >>> + folio_ref_add(folio, HPAGE_PMD_NR - 1); >>> + if (anon_exclusive) >>> + rmap_flags |= RMAP_EXCLUSIVE; >>> + >>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >>> + vma, haddr, rmap_flags); >>> + } >>> } else { >>> /* >>> * Up to this point the pmd is present and huge and userland has >>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>> * Note that NUMA hinting access restrictions are not transferred to >>> * avoid any possibility of altering permissions across VMAs. 
>>> */ >>> - if (freeze || pmd_migration) { >>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { >>> - pte_t entry; >>> - swp_entry_t swp_entry; >>> - >>> - if (write) >>> - swp_entry = make_writable_migration_entry( >>> - page_to_pfn(page + i)); >>> - else if (anon_exclusive) >>> - swp_entry = make_readable_exclusive_migration_entry( >>> - page_to_pfn(page + i)); >>> - else >>> - swp_entry = make_readable_migration_entry( >>> - page_to_pfn(page + i)); >>> - if (young) >>> - swp_entry = make_migration_entry_young(swp_entry); >>> - if (dirty) >>> - swp_entry = make_migration_entry_dirty(swp_entry); >>> - entry = swp_entry_to_pte(swp_entry); >>> - if (soft_dirty) >>> - entry = pte_swp_mksoft_dirty(entry); >>> - if (uffd_wp) >>> - entry = pte_swp_mkuffd_wp(entry); >>> + if (freeze || !present) { >>> + pte_t entry; >>> >>> - VM_WARN_ON(!pte_none(ptep_get(pte + i))); >>> - set_pte_at(mm, addr, pte + i, entry); >>> + if (freeze || is_migration_entry(swp_entry)) { >>> >> <snip> >>> + } else { >> <snip> >>> } >>> } else { >>> pte_t entry; >> >> David already pointed this out in v5. It can be done such as: >> >> if (freeze || pmd_migration) { >> ... >> } else if (is_pmd_device_private_entry(old_pmd)) { >> ... > > No.. freeze can be true for device private entries as well When freeze is true, migration entry is installed in place of device private entry, since the "if (freeze || pmd_migration)" branch is taken. This proposal is same as your code. What is the difference? > >> } else { >> /* for present, non freeze case */ >> } >> >>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>> } >>> pte_unmap(pte); >>> >>> - if (!pmd_migration) >>> + if (!is_pmd_migration_entry(*pmd)) >>> folio_remove_rmap_pmd(folio, page, vma); >>> if (freeze) >>> put_page(page); >>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, >>> pmd_t *pmd, bool freeze) >>> { >>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); >>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) >>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) >>> __split_huge_pmd_locked(vma, pmd, address, freeze); >>> } >>> >>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); >>> lockdep_assert_held(&lruvec->lru_lock); >>> >>> + if (folio_is_device_private(folio)) >>> + return; >>> + >>> if (list) { >>> /* page reclaim is reclaiming a huge page */ >>> VM_WARN_ON(folio_test_lru(folio)); >>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >>> if (nr_shmem_dropped) >>> shmem_uncharge(mapping->host, nr_shmem_dropped); >>> >>> - if (!ret && is_anon) >>> + if (!ret && is_anon && !folio_is_device_private(folio)) >>> remap_flags = RMP_USE_SHARED_ZEROPAGE; >>> + >> >> You should remove this and add >> >> if (folio_is_device_private(folio)) >> return false; >> >> in try_to_map_unused_to_zeropage(). Otherwise, no one would know >> device private folios need to be excluded from mapping unused to >> zero page. >> > > I had that upto v2 and then David asked me to remove it. FYI, this > is the only call site for RMP_USE_SHARED_ZEROPAGE Can you provide a link? Even if this is the only call site, there is no guarantee that there will be none in the future. I am not sure why we want caller to handle this special case. 
Who is going to tell the next user of RMP_USE_SHARED_ZEROPAGE or caller to try_to_map_unused_to_zeropage() that device private is incompatible with them? > >>> remap_page(folio, 1 << order, remap_flags); >>> >>> /* >>> -- >>> 2.50.1 >> >> > > Thanks for the review > Balbir -- Best Regards, Yan, Zi
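A rough sketch of the deduplication being proposed (illustrative only; it reuses the locals of __split_huge_pmd_locked() as named in the patch and elides the entry decoding and the present-PMD handling):

	if (is_pmd_migration_entry(*pmd)) {
		/* ... decode write/young/dirty/anon_exclusive from the entry ... */
	} else {
		if (is_pmd_device_private_entry(*pmd)) {
			old_pmd = *pmd;
			swp_entry = pmd_to_swp_entry(old_pmd);
			page = pfn_swap_entry_to_page(swp_entry);
			folio = page_folio(page);
			write = is_writable_device_private_entry(swp_entry);
			soft_dirty = pmd_swp_soft_dirty(old_pmd);
			uffd_wp = pmd_swp_uffd_wp(old_pmd);
		} else {
			/* ... present huge PMD: pmdp_invalidate() etc. ... */
		}

		/* Shared tail for device-private and present anon folios. */
		anon_exclusive = PageAnonExclusive(page);
		if (freeze && anon_exclusive &&
		    folio_try_share_anon_rmap_pmd(folio, page))
			freeze = false;
		if (!freeze) {
			rmap_t rmap_flags = RMAP_NONE;

			folio_ref_add(folio, HPAGE_PMD_NR - 1);
			if (anon_exclusive)
				rmap_flags |= RMAP_EXCLUSIVE;
			folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
						 vma, haddr, rmap_flags);
		}
	}

The sketch only shows that the freeze/anon_exclusive handling need not be duplicated; whether the result reads better than a standalone device-private branch is the trade-off under discussion.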
On 9/23/25 12:09, Zi Yan wrote: > On 22 Sep 2025, at 21:50, Balbir Singh wrote: > >> On 9/23/25 07:09, Zi Yan wrote: >>> On 16 Sep 2025, at 8:21, Balbir Singh wrote: >>> >>>> Add support for splitting device-private THP folios, enabling fallback >>>> to smaller page sizes when large page allocation or migration fails. >>>> >>>> Key changes: >>>> - split_huge_pmd(): Handle device-private PMD entries during splitting >>>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios >>>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they >>>> don't support shared zero page semantics >>>> >>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com> >>>> Cc: David Hildenbrand <david@redhat.com> >>>> Cc: Zi Yan <ziy@nvidia.com> >>>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> >>>> Cc: Rakie Kim <rakie.kim@sk.com> >>>> Cc: Byungchul Park <byungchul@sk.com> >>>> Cc: Gregory Price <gourry@gourry.net> >>>> Cc: Ying Huang <ying.huang@linux.alibaba.com> >>>> Cc: Alistair Popple <apopple@nvidia.com> >>>> Cc: Oscar Salvador <osalvador@suse.de> >>>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> >>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> >>>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> >>>> Cc: Nico Pache <npache@redhat.com> >>>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>>> Cc: Dev Jain <dev.jain@arm.com> >>>> Cc: Barry Song <baohua@kernel.org> >>>> Cc: Lyude Paul <lyude@redhat.com> >>>> Cc: Danilo Krummrich <dakr@kernel.org> >>>> Cc: David Airlie <airlied@gmail.com> >>>> Cc: Simona Vetter <simona@ffwll.ch> >>>> Cc: Ralph Campbell <rcampbell@nvidia.com> >>>> Cc: Mika Penttilä <mpenttil@redhat.com> >>>> Cc: Matthew Brost <matthew.brost@intel.com> >>>> Cc: Francois Dugast <francois.dugast@intel.com> >>>> --- >>>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++-------------- >>>> 1 file changed, 98 insertions(+), 40 deletions(-) >>>> >>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>> index 78166db72f4d..5291ee155a02 100644 >>>> --- a/mm/huge_memory.c >>>> +++ b/mm/huge_memory.c >>>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>> struct page *page; >>>> pgtable_t pgtable; >>>> pmd_t old_pmd, _pmd; >>>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; >>>> - bool anon_exclusive = false, dirty = false; >>>> + bool soft_dirty, uffd_wp = false, young = false, write = false; >>>> + bool anon_exclusive = false, dirty = false, present = false; >>>> unsigned long addr; >>>> pte_t *pte; >>>> int i; >>>> + swp_entry_t swp_entry; >>>> >>>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); >>>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma); >>>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); >>>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); >>>> + >>>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); >>>> >>>> count_vm_event(THP_SPLIT_PMD); >>>> >>>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>> return __split_huge_zero_page_pmd(vma, haddr, pmd); >>>> } >>>> >>>> - pmd_migration = is_pmd_migration_entry(*pmd); >>>> - if (unlikely(pmd_migration)) { >>>> - swp_entry_t entry; >>>> >>>> + present = pmd_present(*pmd); >>>> + if (is_pmd_migration_entry(*pmd)) { >>>> old_pmd = *pmd; >>>> - entry = pmd_to_swp_entry(old_pmd); >>>> - page = pfn_swap_entry_to_page(entry); >>>> - write = is_writable_migration_entry(entry); >>>> + swp_entry = pmd_to_swp_entry(old_pmd); >>>> + page = 
pfn_swap_entry_to_page(swp_entry); >>>> + folio = page_folio(page); >>>> + >>>> + soft_dirty = pmd_swp_soft_dirty(old_pmd); >>>> + uffd_wp = pmd_swp_uffd_wp(old_pmd); >>>> + >>>> + write = is_writable_migration_entry(swp_entry); >>>> if (PageAnon(page)) >>>> - anon_exclusive = is_readable_exclusive_migration_entry(entry); >>>> - young = is_migration_entry_young(entry); >>>> - dirty = is_migration_entry_dirty(entry); >>>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry); >>>> + young = is_migration_entry_young(swp_entry); >>>> + dirty = is_migration_entry_dirty(swp_entry); >>>> + } else if (is_pmd_device_private_entry(*pmd)) { >>>> + old_pmd = *pmd; >>>> + swp_entry = pmd_to_swp_entry(old_pmd); >>>> + page = pfn_swap_entry_to_page(swp_entry); >>>> + folio = page_folio(page); >>>> + >>>> soft_dirty = pmd_swp_soft_dirty(old_pmd); >>>> uffd_wp = pmd_swp_uffd_wp(old_pmd); >>>> + >>>> + write = is_writable_device_private_entry(swp_entry); >>>> + anon_exclusive = PageAnonExclusive(page); >>>> + >>>> + if (freeze && anon_exclusive && >>>> + folio_try_share_anon_rmap_pmd(folio, page)) >>>> + freeze = false; >>> >>> Why is it OK to change the freeze request? OK, it is replicating >>> the code for present PMD folios. Either add a comment to point >>> to the explanation in the comment below, or move >>> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below >>> to deduplicate this code. >> >> Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd() >> should never fail. > > anon_exclusive = PageAnonExclusive(page); > if (freeze && anon_exclusive && > folio_try_share_anon_rmap_pmd(folio, page)) > freeze = false; > if (!freeze) { > rmap_t rmap_flags = RMAP_NONE; > > folio_ref_add(folio, HPAGE_PMD_NR - 1); > if (anon_exclusive) > rmap_flags |= RMAP_EXCLUSIVE; > folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, > vma, haddr, rmap_flags); > } > > are the same for both device private and present. Can it be deduplicated > by doing below? > > if (is_pmd_migration_entry(*pmd)) { > ... > } else { > if (is_pmd_device_private_entry(*pmd)) { > ... > } else if (pmd_present()) { > ... > } > > /* the above code */ > } > > If not, at least adding a comment in the device private copy of the code > pointing to the present copy's comment. > >> >>> >>>> + if (!freeze) { >>>> + rmap_t rmap_flags = RMAP_NONE; >>>> + >>>> + folio_ref_add(folio, HPAGE_PMD_NR - 1); >>>> + if (anon_exclusive) >>>> + rmap_flags |= RMAP_EXCLUSIVE; >>>> + >>>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >>>> + vma, haddr, rmap_flags); >>>> + } >>>> } else { >>>> /* >>>> * Up to this point the pmd is present and huge and userland has >>>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>> * Note that NUMA hinting access restrictions are not transferred to >>>> * avoid any possibility of altering permissions across VMAs. 
>>>> */ >>>> - if (freeze || pmd_migration) { >>>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { >>>> - pte_t entry; >>>> - swp_entry_t swp_entry; >>>> - >>>> - if (write) >>>> - swp_entry = make_writable_migration_entry( >>>> - page_to_pfn(page + i)); >>>> - else if (anon_exclusive) >>>> - swp_entry = make_readable_exclusive_migration_entry( >>>> - page_to_pfn(page + i)); >>>> - else >>>> - swp_entry = make_readable_migration_entry( >>>> - page_to_pfn(page + i)); >>>> - if (young) >>>> - swp_entry = make_migration_entry_young(swp_entry); >>>> - if (dirty) >>>> - swp_entry = make_migration_entry_dirty(swp_entry); >>>> - entry = swp_entry_to_pte(swp_entry); >>>> - if (soft_dirty) >>>> - entry = pte_swp_mksoft_dirty(entry); >>>> - if (uffd_wp) >>>> - entry = pte_swp_mkuffd_wp(entry); >>>> + if (freeze || !present) { >>>> + pte_t entry; >>>> >>>> - VM_WARN_ON(!pte_none(ptep_get(pte + i))); >>>> - set_pte_at(mm, addr, pte + i, entry); >>>> + if (freeze || is_migration_entry(swp_entry)) { >>>> >>> <snip> >>>> + } else { >>> <snip> >>>> } >>>> } else { >>>> pte_t entry; >>> >>> David already pointed this out in v5. It can be done such as: >>> >>> if (freeze || pmd_migration) { >>> ... >>> } else if (is_pmd_device_private_entry(old_pmd)) { >>> ... >> >> No.. freeze can be true for device private entries as well > > When freeze is true, migration entry is installed in place of > device private entry, since the "if (freeze || pmd_migration)" > branch is taken. This proposal is same as your code. What is > the difference? > I read the else if incorrectly, I'll simplify >> >>> } else { >>> /* for present, non freeze case */ >>> } >>> >>>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>> } >>>> pte_unmap(pte); >>>> >>>> - if (!pmd_migration) >>>> + if (!is_pmd_migration_entry(*pmd)) >>>> folio_remove_rmap_pmd(folio, page, vma); >>>> if (freeze) >>>> put_page(page); >>>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, >>>> pmd_t *pmd, bool freeze) >>>> { >>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); >>>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) >>>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) >>>> __split_huge_pmd_locked(vma, pmd, address, freeze); >>>> } >>>> >>>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >>>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); >>>> lockdep_assert_held(&lruvec->lru_lock); >>>> >>>> + if (folio_is_device_private(folio)) >>>> + return; >>>> + >>>> if (list) { >>>> /* page reclaim is reclaiming a huge page */ >>>> VM_WARN_ON(folio_test_lru(folio)); >>>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >>>> if (nr_shmem_dropped) >>>> shmem_uncharge(mapping->host, nr_shmem_dropped); >>>> >>>> - if (!ret && is_anon) >>>> + if (!ret && is_anon && !folio_is_device_private(folio)) >>>> remap_flags = RMP_USE_SHARED_ZEROPAGE; >>>> + >>> >>> You should remove this and add >>> >>> if (folio_is_device_private(folio)) >>> return false; >>> >>> in try_to_map_unused_to_zeropage(). Otherwise, no one would know >>> device private folios need to be excluded from mapping unused to >>> zero page. >>> >> >> I had that upto v2 and then David asked me to remove it. FYI, this >> is the only call site for RMP_USE_SHARED_ZEROPAGE > > Can you provide a link? 
> Please see https://lore.kernel.org/linux-mm/20250306044239.3874247-3-balbirs@nvidia.com/T/ > Even if this is the only call site, there is no guarantee that > there will be none in the future. I am not sure why we want caller > to handle this special case. Who is going to tell the next user > of RMP_USE_SHARED_ZEROPAGE or caller to try_to_map_unused_to_zeropage() > that device private is incompatible with them? > I don't disagree, but the question was why are device private pages even making it to try_to_map_unused_to_zeropage()>> >>>> remap_page(folio, 1 << order, remap_flags); >>>> >>>> /* >>>> -- >>>> 2.50.1 >>> >>> >> >> Thanks for the review >> Balbir Thanks, Balbir
On 23 Sep 2025, at 0:04, Balbir Singh wrote: > On 9/23/25 12:09, Zi Yan wrote: >> On 22 Sep 2025, at 21:50, Balbir Singh wrote: >> >>> On 9/23/25 07:09, Zi Yan wrote: >>>> On 16 Sep 2025, at 8:21, Balbir Singh wrote: >>>> >>>>> Add support for splitting device-private THP folios, enabling fallback >>>>> to smaller page sizes when large page allocation or migration fails. >>>>> >>>>> Key changes: >>>>> - split_huge_pmd(): Handle device-private PMD entries during splitting >>>>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios >>>>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they >>>>> don't support shared zero page semantics >>>>> >>>>> Signed-off-by: Balbir Singh <balbirs@nvidia.com> >>>>> Cc: David Hildenbrand <david@redhat.com> >>>>> Cc: Zi Yan <ziy@nvidia.com> >>>>> Cc: Joshua Hahn <joshua.hahnjy@gmail.com> >>>>> Cc: Rakie Kim <rakie.kim@sk.com> >>>>> Cc: Byungchul Park <byungchul@sk.com> >>>>> Cc: Gregory Price <gourry@gourry.net> >>>>> Cc: Ying Huang <ying.huang@linux.alibaba.com> >>>>> Cc: Alistair Popple <apopple@nvidia.com> >>>>> Cc: Oscar Salvador <osalvador@suse.de> >>>>> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> >>>>> Cc: Baolin Wang <baolin.wang@linux.alibaba.com> >>>>> Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com> >>>>> Cc: Nico Pache <npache@redhat.com> >>>>> Cc: Ryan Roberts <ryan.roberts@arm.com> >>>>> Cc: Dev Jain <dev.jain@arm.com> >>>>> Cc: Barry Song <baohua@kernel.org> >>>>> Cc: Lyude Paul <lyude@redhat.com> >>>>> Cc: Danilo Krummrich <dakr@kernel.org> >>>>> Cc: David Airlie <airlied@gmail.com> >>>>> Cc: Simona Vetter <simona@ffwll.ch> >>>>> Cc: Ralph Campbell <rcampbell@nvidia.com> >>>>> Cc: Mika Penttilä <mpenttil@redhat.com> >>>>> Cc: Matthew Brost <matthew.brost@intel.com> >>>>> Cc: Francois Dugast <francois.dugast@intel.com> >>>>> --- >>>>> mm/huge_memory.c | 138 +++++++++++++++++++++++++++++++++-------------- >>>>> 1 file changed, 98 insertions(+), 40 deletions(-) >>>>> >>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c >>>>> index 78166db72f4d..5291ee155a02 100644 >>>>> --- a/mm/huge_memory.c >>>>> +++ b/mm/huge_memory.c >>>>> @@ -2872,16 +2872,18 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>>> struct page *page; >>>>> pgtable_t pgtable; >>>>> pmd_t old_pmd, _pmd; >>>>> - bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false; >>>>> - bool anon_exclusive = false, dirty = false; >>>>> + bool soft_dirty, uffd_wp = false, young = false, write = false; >>>>> + bool anon_exclusive = false, dirty = false, present = false; >>>>> unsigned long addr; >>>>> pte_t *pte; >>>>> int i; >>>>> + swp_entry_t swp_entry; >>>>> >>>>> VM_BUG_ON(haddr & ~HPAGE_PMD_MASK); >>>>> VM_BUG_ON_VMA(vma->vm_start > haddr, vma); >>>>> VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma); >>>>> - VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd)); >>>>> + >>>>> + VM_WARN_ON(!is_pmd_non_present_folio_entry(*pmd) && !pmd_trans_huge(*pmd)); >>>>> >>>>> count_vm_event(THP_SPLIT_PMD); >>>>> >>>>> @@ -2929,20 +2931,47 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>>> return __split_huge_zero_page_pmd(vma, haddr, pmd); >>>>> } >>>>> >>>>> - pmd_migration = is_pmd_migration_entry(*pmd); >>>>> - if (unlikely(pmd_migration)) { >>>>> - swp_entry_t entry; >>>>> >>>>> + present = pmd_present(*pmd); >>>>> + if (is_pmd_migration_entry(*pmd)) { >>>>> old_pmd = *pmd; >>>>> - entry = pmd_to_swp_entry(old_pmd); >>>>> - page = pfn_swap_entry_to_page(entry); 
>>>>> - write = is_writable_migration_entry(entry); >>>>> + swp_entry = pmd_to_swp_entry(old_pmd); >>>>> + page = pfn_swap_entry_to_page(swp_entry); >>>>> + folio = page_folio(page); >>>>> + >>>>> + soft_dirty = pmd_swp_soft_dirty(old_pmd); >>>>> + uffd_wp = pmd_swp_uffd_wp(old_pmd); >>>>> + >>>>> + write = is_writable_migration_entry(swp_entry); >>>>> if (PageAnon(page)) >>>>> - anon_exclusive = is_readable_exclusive_migration_entry(entry); >>>>> - young = is_migration_entry_young(entry); >>>>> - dirty = is_migration_entry_dirty(entry); >>>>> + anon_exclusive = is_readable_exclusive_migration_entry(swp_entry); >>>>> + young = is_migration_entry_young(swp_entry); >>>>> + dirty = is_migration_entry_dirty(swp_entry); >>>>> + } else if (is_pmd_device_private_entry(*pmd)) { >>>>> + old_pmd = *pmd; >>>>> + swp_entry = pmd_to_swp_entry(old_pmd); >>>>> + page = pfn_swap_entry_to_page(swp_entry); >>>>> + folio = page_folio(page); >>>>> + >>>>> soft_dirty = pmd_swp_soft_dirty(old_pmd); >>>>> uffd_wp = pmd_swp_uffd_wp(old_pmd); >>>>> + >>>>> + write = is_writable_device_private_entry(swp_entry); >>>>> + anon_exclusive = PageAnonExclusive(page); >>>>> + >>>>> + if (freeze && anon_exclusive && >>>>> + folio_try_share_anon_rmap_pmd(folio, page)) >>>>> + freeze = false; >>>> >>>> Why is it OK to change the freeze request? OK, it is replicating >>>> the code for present PMD folios. Either add a comment to point >>>> to the explanation in the comment below, or move >>>> “if (is_pmd_device_private_entry(*pmd))“ branch in the else below >>>> to deduplicate this code. >>> >>> Similar to the code for present pages, ideally folio_try_share_anon_rmap_pmd() >>> should never fail. >> >> anon_exclusive = PageAnonExclusive(page); >> if (freeze && anon_exclusive && >> folio_try_share_anon_rmap_pmd(folio, page)) >> freeze = false; >> if (!freeze) { >> rmap_t rmap_flags = RMAP_NONE; >> >> folio_ref_add(folio, HPAGE_PMD_NR - 1); >> if (anon_exclusive) >> rmap_flags |= RMAP_EXCLUSIVE; >> folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >> vma, haddr, rmap_flags); >> } >> >> are the same for both device private and present. Can it be deduplicated >> by doing below? >> >> if (is_pmd_migration_entry(*pmd)) { >> ... >> } else { >> if (is_pmd_device_private_entry(*pmd)) { >> ... >> } else if (pmd_present()) { >> ... >> } >> >> /* the above code */ >> } >> >> If not, at least adding a comment in the device private copy of the code >> pointing to the present copy's comment. >> >>> >>>> >>>>> + if (!freeze) { >>>>> + rmap_t rmap_flags = RMAP_NONE; >>>>> + >>>>> + folio_ref_add(folio, HPAGE_PMD_NR - 1); >>>>> + if (anon_exclusive) >>>>> + rmap_flags |= RMAP_EXCLUSIVE; >>>>> + >>>>> + folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR, >>>>> + vma, haddr, rmap_flags); >>>>> + } >>>>> } else { >>>>> /* >>>>> * Up to this point the pmd is present and huge and userland has >>>>> @@ -3026,32 +3055,57 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>>> * Note that NUMA hinting access restrictions are not transferred to >>>>> * avoid any possibility of altering permissions across VMAs. 
>>>>> */ >>>>> - if (freeze || pmd_migration) { >>>>> - for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) { >>>>> - pte_t entry; >>>>> - swp_entry_t swp_entry; >>>>> - >>>>> - if (write) >>>>> - swp_entry = make_writable_migration_entry( >>>>> - page_to_pfn(page + i)); >>>>> - else if (anon_exclusive) >>>>> - swp_entry = make_readable_exclusive_migration_entry( >>>>> - page_to_pfn(page + i)); >>>>> - else >>>>> - swp_entry = make_readable_migration_entry( >>>>> - page_to_pfn(page + i)); >>>>> - if (young) >>>>> - swp_entry = make_migration_entry_young(swp_entry); >>>>> - if (dirty) >>>>> - swp_entry = make_migration_entry_dirty(swp_entry); >>>>> - entry = swp_entry_to_pte(swp_entry); >>>>> - if (soft_dirty) >>>>> - entry = pte_swp_mksoft_dirty(entry); >>>>> - if (uffd_wp) >>>>> - entry = pte_swp_mkuffd_wp(entry); >>>>> + if (freeze || !present) { >>>>> + pte_t entry; >>>>> >>>>> - VM_WARN_ON(!pte_none(ptep_get(pte + i))); >>>>> - set_pte_at(mm, addr, pte + i, entry); >>>>> + if (freeze || is_migration_entry(swp_entry)) { >>>>> >>>> <snip> >>>>> + } else { >>>> <snip> >>>>> } >>>>> } else { >>>>> pte_t entry; >>>> >>>> David already pointed this out in v5. It can be done such as: >>>> >>>> if (freeze || pmd_migration) { >>>> ... >>>> } else if (is_pmd_device_private_entry(old_pmd)) { >>>> ... >>> >>> No.. freeze can be true for device private entries as well >> >> When freeze is true, migration entry is installed in place of >> device private entry, since the "if (freeze || pmd_migration)" >> branch is taken. This proposal is same as your code. What is >> the difference? >> > > I read the else if incorrectly, I'll simplify > >>> >>>> } else { >>>> /* for present, non freeze case */ >>>> } >>>> >>>>> @@ -3076,7 +3130,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, >>>>> } >>>>> pte_unmap(pte); >>>>> >>>>> - if (!pmd_migration) >>>>> + if (!is_pmd_migration_entry(*pmd)) >>>>> folio_remove_rmap_pmd(folio, page, vma); >>>>> if (freeze) >>>>> put_page(page); >>>>> @@ -3089,7 +3143,7 @@ void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address, >>>>> pmd_t *pmd, bool freeze) >>>>> { >>>>> VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE)); >>>>> - if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd)) >>>>> + if (pmd_trans_huge(*pmd) || is_pmd_non_present_folio_entry(*pmd)) >>>>> __split_huge_pmd_locked(vma, pmd, address, freeze); >>>>> } >>>>> >>>>> @@ -3268,6 +3322,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio, >>>>> VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio); >>>>> lockdep_assert_held(&lruvec->lru_lock); >>>>> >>>>> + if (folio_is_device_private(folio)) >>>>> + return; >>>>> + >>>>> if (list) { >>>>> /* page reclaim is reclaiming a huge page */ >>>>> VM_WARN_ON(folio_test_lru(folio)); >>>>> @@ -3885,8 +3942,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order, >>>>> if (nr_shmem_dropped) >>>>> shmem_uncharge(mapping->host, nr_shmem_dropped); >>>>> >>>>> - if (!ret && is_anon) >>>>> + if (!ret && is_anon && !folio_is_device_private(folio)) >>>>> remap_flags = RMP_USE_SHARED_ZEROPAGE; >>>>> + >>>> >>>> You should remove this and add >>>> >>>> if (folio_is_device_private(folio)) >>>> return false; >>>> >>>> in try_to_map_unused_to_zeropage(). Otherwise, no one would know >>>> device private folios need to be excluded from mapping unused to >>>> zero page. >>>> >>> >>> I had that upto v2 and then David asked me to remove it. 
FYI, this >>> is the only call site for RMP_USE_SHARED_ZEROPAGE >> >> Can you provide a link? >> > > Please see https://lore.kernel.org/linux-mm/20250306044239.3874247-3-balbirs@nvidia.com/T/ I do not see any comment on removing device private folio check in try_to_map_unused_to_zeropage(). Can you try again? > >> Even if this is the only call site, there is no guarantee that >> there will be none in the future. I am not sure why we want caller >> to handle this special case. Who is going to tell the next user >> of RMP_USE_SHARED_ZEROPAGE or caller to try_to_map_unused_to_zeropage() >> that device private is incompatible with them? >> > > I don't disagree, but the question was why are device private pages even making > it to try_to_map_unused_to_zeropage()>> Then, it could be done in remove_migration_pte(): if (rmap_walk_arg->map_unused_to_zeropage && !folio_is_device_private(folio) && try_to_map_unused_to_zeropage(&pvmw, folio, idx)) continue; Maybe I am too hung up on this and someone else could pat on my back and tell me it is OK to just do this at the only caller instead. :) >>>>> remap_page(folio, 1 << order, remap_flags); >>>>> >>>>> /* >>>>> -- >>>>> 2.50.1 >>>> >>>> >>> >>> Thanks for the review >>> Balbir > > Thanks, > Balbir Best Regards, Yan, Zi
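For reference, the alternative spelled out above would land in the page_vma_mapped_walk() loop of remove_migration_pte(); a sketch of just the relevant lines (illustrative, the surrounding loop body is elided):

	while (page_vma_mapped_walk(&pvmw)) {
		unsigned long idx = 0;

		/* ... compute idx for large folios, look up the new page ... */

		/*
		 * Device-private folios cannot be remapped to the shared
		 * zeropage, so skip the optimization for them.
		 */
		if (rmap_walk_arg->map_unused_to_zeropage &&
		    !folio_is_device_private(folio) &&
		    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
			continue;

		/* ... install the regular migration-restore PTE ... */
	}

This covers every user of RMP_USE_SHARED_ZEROPAGE without touching __folio_split(), though a future direct caller of try_to_map_unused_to_zeropage() would still need its own check.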