[v5 04/15] mm/huge_memory: implement device-private THP splitting

Posted by Balbir Singh 5 months ago
Add support for splitting device-private THP folios, enabling fallback
to smaller page sizes when large page allocation or migration fails.

Key changes:
- split_huge_pmd(): Handle device-private PMD entries during splitting
- Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
- Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
  don't support shared zero page semantics

Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Hildenbrand <david@redhat.com>
Cc: Zi Yan <ziy@nvidia.com>
Cc: Joshua Hahn <joshua.hahnjy@gmail.com>
Cc: Rakie Kim <rakie.kim@sk.com>
Cc: Byungchul Park <byungchul@sk.com>
Cc: Gregory Price <gourry@gourry.net>
Cc: Ying Huang <ying.huang@linux.alibaba.com>
Cc: Alistair Popple <apopple@nvidia.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: Nico Pache <npache@redhat.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Dev Jain <dev.jain@arm.com>
Cc: Barry Song <baohua@kernel.org>
Cc: Lyude Paul <lyude@redhat.com>
Cc: Danilo Krummrich <dakr@kernel.org>
Cc: David Airlie <airlied@gmail.com>
Cc: Simona Vetter <simona@ffwll.ch>
Cc: Ralph Campbell <rcampbell@nvidia.com>
Cc: Mika Penttilä <mpenttil@redhat.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Francois Dugast <francois.dugast@intel.com>

Signed-off-by: Balbir Singh <balbirs@nvidia.com>
---
 mm/huge_memory.c | 129 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 91 insertions(+), 38 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 337d8e3dd837..b720870c04b2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2880,16 +2880,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *page;
 	pgtable_t pgtable;
 	pmd_t old_pmd, _pmd;
-	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
-	bool anon_exclusive = false, dirty = false;
+	bool young, write, soft_dirty, uffd_wp = false;
+	bool anon_exclusive = false, dirty = false, present = false;
 	unsigned long addr;
 	pte_t *pte;
 	int i;
+	swp_entry_t swp_entry;
 
 	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
 	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
 	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
-	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
+
+	VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
+			!is_pmd_device_private_entry(*pmd));
 
 	count_vm_event(THP_SPLIT_PMD);
 
@@ -2937,18 +2940,43 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 		return __split_huge_zero_page_pmd(vma, haddr, pmd);
 	}
 
-	pmd_migration = is_pmd_migration_entry(*pmd);
-	if (unlikely(pmd_migration)) {
-		swp_entry_t entry;
 
+	present = pmd_present(*pmd);
+	if (unlikely(!present)) {
+		swp_entry = pmd_to_swp_entry(*pmd);
 		old_pmd = *pmd;
-		entry = pmd_to_swp_entry(old_pmd);
-		page = pfn_swap_entry_to_page(entry);
-		write = is_writable_migration_entry(entry);
-		if (PageAnon(page))
-			anon_exclusive = is_readable_exclusive_migration_entry(entry);
-		young = is_migration_entry_young(entry);
-		dirty = is_migration_entry_dirty(entry);
+
+		folio = pfn_swap_entry_folio(swp_entry);
+		VM_WARN_ON(!is_migration_entry(swp_entry) &&
+				!is_device_private_entry(swp_entry));
+		page = pfn_swap_entry_to_page(swp_entry);
+
+		if (is_pmd_migration_entry(old_pmd)) {
+			write = is_writable_migration_entry(swp_entry);
+			if (PageAnon(page))
+				anon_exclusive =
+					is_readable_exclusive_migration_entry(
+								swp_entry);
+			young = is_migration_entry_young(swp_entry);
+			dirty = is_migration_entry_dirty(swp_entry);
+		} else if (is_pmd_device_private_entry(old_pmd)) {
+			write = is_writable_device_private_entry(swp_entry);
+			anon_exclusive = PageAnonExclusive(page);
+			if (freeze && anon_exclusive &&
+			    folio_try_share_anon_rmap_pmd(folio, page))
+				freeze = false;
+			if (!freeze) {
+				rmap_t rmap_flags = RMAP_NONE;
+
+				folio_ref_add(folio, HPAGE_PMD_NR - 1);
+				if (anon_exclusive)
+					rmap_flags |= RMAP_EXCLUSIVE;
+
+				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
+						 vma, haddr, rmap_flags);
+			}
+		}
+
 		soft_dirty = pmd_swp_soft_dirty(old_pmd);
 		uffd_wp = pmd_swp_uffd_wp(old_pmd);
 	} else {
@@ -3034,30 +3062,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	 * Note that NUMA hinting access restrictions are not transferred to
 	 * avoid any possibility of altering permissions across VMAs.
 	 */
-	if (freeze || pmd_migration) {
+	if (freeze || !present) {
 		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
 			pte_t entry;
-			swp_entry_t swp_entry;
-
-			if (write)
-				swp_entry = make_writable_migration_entry(
-							page_to_pfn(page + i));
-			else if (anon_exclusive)
-				swp_entry = make_readable_exclusive_migration_entry(
-							page_to_pfn(page + i));
-			else
-				swp_entry = make_readable_migration_entry(
-							page_to_pfn(page + i));
-			if (young)
-				swp_entry = make_migration_entry_young(swp_entry);
-			if (dirty)
-				swp_entry = make_migration_entry_dirty(swp_entry);
-			entry = swp_entry_to_pte(swp_entry);
-			if (soft_dirty)
-				entry = pte_swp_mksoft_dirty(entry);
-			if (uffd_wp)
-				entry = pte_swp_mkuffd_wp(entry);
-
+			if (freeze || is_migration_entry(swp_entry)) {
+				if (write)
+					swp_entry = make_writable_migration_entry(
+								page_to_pfn(page + i));
+				else if (anon_exclusive)
+					swp_entry = make_readable_exclusive_migration_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_migration_entry(
+								page_to_pfn(page + i));
+				if (young)
+					swp_entry = make_migration_entry_young(swp_entry);
+				if (dirty)
+					swp_entry = make_migration_entry_dirty(swp_entry);
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			} else {
+				/*
+				 * anon_exclusive was already propagated to the relevant
+				 * pages corresponding to the pte entries when freeze
+				 * is false.
+				 */
+				if (write)
+					swp_entry = make_writable_device_private_entry(
+								page_to_pfn(page + i));
+				else
+					swp_entry = make_readable_device_private_entry(
+								page_to_pfn(page + i));
+				/*
+				 * Young and dirty bits are not propagated via swp_entry
+				 */
+				entry = swp_entry_to_pte(swp_entry);
+				if (soft_dirty)
+					entry = pte_swp_mksoft_dirty(entry);
+				if (uffd_wp)
+					entry = pte_swp_mkuffd_wp(entry);
+			}
 			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
 			set_pte_at(mm, addr, pte + i, entry);
 		}
@@ -3084,7 +3131,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	}
 	pte_unmap(pte);
 
-	if (!pmd_migration)
+	if (!is_pmd_migration_entry(*pmd))
 		folio_remove_rmap_pmd(folio, page, vma);
 	if (freeze)
 		put_page(page);
@@ -3096,8 +3143,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
 			   pmd_t *pmd, bool freeze)
 {
+
 	VM_WARN_ON_ONCE(!IS_ALIGNED(address, HPAGE_PMD_SIZE));
-	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd))
+	if (pmd_trans_huge(*pmd) || is_pmd_migration_entry(*pmd) ||
+		is_pmd_device_private_entry(*pmd))
 		__split_huge_pmd_locked(vma, pmd, address, freeze);
 }
 
@@ -3276,6 +3325,9 @@ static void lru_add_split_folio(struct folio *folio, struct folio *new_folio,
 	VM_BUG_ON_FOLIO(folio_test_lru(new_folio), folio);
 	lockdep_assert_held(&lruvec->lru_lock);
 
+	if (folio_is_device_private(folio))
+		return;
+
 	if (list) {
 		/* page reclaim is reclaiming a huge page */
 		VM_WARN_ON(folio_test_lru(folio));
@@ -3896,8 +3948,9 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
 	if (nr_shmem_dropped)
 		shmem_uncharge(mapping->host, nr_shmem_dropped);
 
-	if (!ret && is_anon)
+	if (!ret && is_anon && !folio_is_device_private(folio))
 		remap_flags = RMP_USE_SHARED_ZEROPAGE;
+
 	remap_page(folio, 1 << order, remap_flags);
 
 	/*
-- 
2.50.1

Re: [v5 04/15] mm/huge_memory: implement device-private THP splitting
Posted by David Hildenbrand 5 months ago
On 08.09.25 02:04, Balbir Singh wrote:
> Add support for splitting device-private THP folios, enabling fallback
> to smaller page sizes when large page allocation or migration fails.
> 
> Key changes:
> - split_huge_pmd(): Handle device-private PMD entries during splitting
> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>    don't support shared zero page semantics
> 
> [...]
> 
> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
> ---
>   mm/huge_memory.c | 129 +++++++++++++++++++++++++++++++++--------------
>   1 file changed, 91 insertions(+), 38 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 337d8e3dd837..b720870c04b2 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -2880,16 +2880,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   	struct page *page;
>   	pgtable_t pgtable;
>   	pmd_t old_pmd, _pmd;
> -	bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
> -	bool anon_exclusive = false, dirty = false;
> +	bool young, write, soft_dirty, uffd_wp = false;
> +	bool anon_exclusive = false, dirty = false, present = false;
>   	unsigned long addr;
>   	pte_t *pte;
>   	int i;
> +	swp_entry_t swp_entry;
>   
>   	VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>   	VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>   	VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
> -	VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
> +
> +	VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
> +			!is_pmd_device_private_entry(*pmd));
>   

Indentation. But I do wonder if we want a helper to do a more
efficient

is_pmd_migration_entry() || is_pmd_device_private_entry()

If only I could come up with a good name ... any ideas?

is_non_present_folio_entry()

maybe?

Well, there is device-exclusive .... but that would not be reachable on 
these paths yet, ever.
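
A rough sketch of what such a helper could look like (the name just riffs on
the above, and placing it next to is_pmd_migration_entry() in
include/linux/swapops.h is only an assumption):

/*
 * Sketch only: covers migration and device-private PMD swap entries;
 * device-exclusive is deliberately left out, as noted above.
 */
static inline bool is_pmd_non_present_folio_entry(pmd_t pmd)
{
	swp_entry_t entry;

	if (!is_swap_pmd(pmd))
		return false;

	entry = pmd_to_swp_entry(pmd);
	return is_migration_entry(entry) || is_device_private_entry(entry);
}

Both the VM_WARN_ON() above and the check in split_huge_pmd_locked() could
then be written in terms of pmd_trans_huge(*pmd) ||
is_pmd_non_present_folio_entry(*pmd).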


>   	count_vm_event(THP_SPLIT_PMD);
>   
> @@ -2937,18 +2940,43 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   		return __split_huge_zero_page_pmd(vma, haddr, pmd);
>   	}
>   
> -	pmd_migration = is_pmd_migration_entry(*pmd);
> -	if (unlikely(pmd_migration)) {
> -		swp_entry_t entry;
>   
> +	present = pmd_present(*pmd);
> +	if (unlikely(!present)) {

I hate this whole function. But maybe in this case it's better
to just have here

if (is_pmd_migration_entry(old_pmd)) {

} else if (is_pmd_device_private_entry(old_pmd)) {

There is not much shared code and this helps reduce the indentation level.
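
I.e., roughly this shape (untested sketch; the per-case decode and the
anon-exclusive/freeze handling stay as in the hunk below):

	old_pmd = *pmd;
	if (is_pmd_migration_entry(old_pmd)) {
		swp_entry = pmd_to_swp_entry(old_pmd);
		page = pfn_swap_entry_to_page(swp_entry);
		/* write/anon_exclusive/young/dirty decode as in this patch */
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else if (is_pmd_device_private_entry(old_pmd)) {
		swp_entry = pmd_to_swp_entry(old_pmd);
		folio = pfn_swap_entry_folio(swp_entry);
		page = pfn_swap_entry_to_page(swp_entry);
		/* device-private decode + anon_exclusive/freeze handling */
		soft_dirty = pmd_swp_soft_dirty(old_pmd);
		uffd_wp = pmd_swp_uffd_wp(old_pmd);
	} else {
		/* pmd_present(): existing pmd_trans_huge() path, unchanged */
	}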

> +		swp_entry = pmd_to_swp_entry(*pmd);
>   		old_pmd = *pmd;
> -		entry = pmd_to_swp_entry(old_pmd);
> -		page = pfn_swap_entry_to_page(entry);
> -		write = is_writable_migration_entry(entry);
> -		if (PageAnon(page))
> -			anon_exclusive = is_readable_exclusive_migration_entry(entry);
> -		young = is_migration_entry_young(entry);
> -		dirty = is_migration_entry_dirty(entry);
> +
> +		folio = pfn_swap_entry_folio(swp_entry);
> +		VM_WARN_ON(!is_migration_entry(swp_entry) &&
> +				!is_device_private_entry(swp_entry));

Indentation.

> +		page = pfn_swap_entry_to_page(swp_entry);
> +
> +		if (is_pmd_migration_entry(old_pmd)) {
> +			write = is_writable_migration_entry(swp_entry);
> +			if (PageAnon(page))
> +				anon_exclusive =
> +					is_readable_exclusive_migration_entry(
> +								swp_entry);

Single line please, this is unreadable.

> +			young = is_migration_entry_young(swp_entry);
> +			dirty = is_migration_entry_dirty(swp_entry);
> +		} else if (is_pmd_device_private_entry(old_pmd)) {
> +			write = is_writable_device_private_entry(swp_entry);
> +			anon_exclusive = PageAnonExclusive(page);
> +			if (freeze && anon_exclusive &&
> +			    folio_try_share_anon_rmap_pmd(folio, page))
> +				freeze = false;
> +			if (!freeze) {
> +				rmap_t rmap_flags = RMAP_NONE;
> +
> +				folio_ref_add(folio, HPAGE_PMD_NR - 1);
> +				if (anon_exclusive)
> +					rmap_flags |= RMAP_EXCLUSIVE;
> +
> +				folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
> +						 vma, haddr, rmap_flags);
> +			}
> +		}
> +
>   		soft_dirty = pmd_swp_soft_dirty(old_pmd);
>   		uffd_wp = pmd_swp_uffd_wp(old_pmd);
>   	} else {
> @@ -3034,30 +3062,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   	 * Note that NUMA hinting access restrictions are not transferred to
>   	 * avoid any possibility of altering permissions across VMAs.
>   	 */
> -	if (freeze || pmd_migration) {
> +	if (freeze || !present) {

Here too, I wonder if we should just handle device-private completely 
separately for now.

>   		for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>   			pte_t entry;
> -			swp_entry_t swp_entry;
> -
> -			if (write)
> -				swp_entry = make_writable_migration_entry(
> -							page_to_pfn(page + i));
> -			else if (anon_exclusive)
> -				swp_entry = make_readable_exclusive_migration_entry(
> -							page_to_pfn(page + i));
> -			else
> -				swp_entry = make_readable_migration_entry(
> -							page_to_pfn(page + i));
> -			if (young)
> -				swp_entry = make_migration_entry_young(swp_entry);
> -			if (dirty)
> -				swp_entry = make_migration_entry_dirty(swp_entry);
> -			entry = swp_entry_to_pte(swp_entry);
> -			if (soft_dirty)
> -				entry = pte_swp_mksoft_dirty(entry);
> -			if (uffd_wp)
> -				entry = pte_swp_mkuffd_wp(entry);
> -
> +			if (freeze || is_migration_entry(swp_entry)) {
> +				if (write)
> +					swp_entry = make_writable_migration_entry(
> +								page_to_pfn(page + i));
> +				else if (anon_exclusive)
> +					swp_entry = make_readable_exclusive_migration_entry(
> +								page_to_pfn(page + i));
> +				else
> +					swp_entry = make_readable_migration_entry(
> +								page_to_pfn(page + i));
> +				if (young)
> +					swp_entry = make_migration_entry_young(swp_entry);
> +				if (dirty)
> +					swp_entry = make_migration_entry_dirty(swp_entry);
> +				entry = swp_entry_to_pte(swp_entry);
> +				if (soft_dirty)
> +					entry = pte_swp_mksoft_dirty(entry);
> +				if (uffd_wp)
> +					entry = pte_swp_mkuffd_wp(entry);
> +			} else {
> +				/*
> +				 * anon_exclusive was already propagated to the relevant
> +				 * pages corresponding to the pte entries when freeze
> +				 * is false.
> +				 */
> +				if (write)
> +					swp_entry = make_writable_device_private_entry(
> +								page_to_pfn(page + i));
> +				else
> +					swp_entry = make_readable_device_private_entry(
> +								page_to_pfn(page + i));
> +				/*
> +				 * Young and dirty bits are not propagated via swp_entry
> +				 */
> +				entry = swp_entry_to_pte(swp_entry);
> +				if (soft_dirty)
> +					entry = pte_swp_mksoft_dirty(entry);
> +				if (uffd_wp)
> +					entry = pte_swp_mkuffd_wp(entry);
> +			}
>   			VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>   			set_pte_at(mm, addr, pte + i, entry);
>   		}
> @@ -3084,7 +3131,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   	}
>   	pte_unmap(pte);
>   
> -	if (!pmd_migration)
> +	if (!is_pmd_migration_entry(*pmd))
>   		folio_remove_rmap_pmd(folio, page, vma);
>   	if (freeze)
>   		put_page(page);
> @@ -3096,8 +3143,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>   void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>   			   pmd_t *pmd, bool freeze)
>   {
> +

Unrelated change.



-- 
Cheers

David / dhildenb

Re: [v5 04/15] mm/huge_memory: implement device-private THP splitting
Posted by Balbir Singh 4 months, 3 weeks ago
On 9/11/25 22:31, David Hildenbrand wrote:
> On 08.09.25 02:04, Balbir Singh wrote:
>> Add support for splitting device-private THP folios, enabling fallback
>> to smaller page sizes when large page allocation or migration fails.
>>
>> Key changes:
>> - split_huge_pmd(): Handle device-private PMD entries during splitting
>> - Preserve RMAP_EXCLUSIVE semantics for anonymous exclusive folios
>> - Skip RMP_USE_SHARED_ZEROPAGE for device-private entries as they
>>    don't support shared zero page semantics
>>
>> [...]
>>
>> Signed-off-by: Balbir Singh <balbirs@nvidia.com>
>> ---
>>   mm/huge_memory.c | 129 +++++++++++++++++++++++++++++++++--------------
>>   1 file changed, 91 insertions(+), 38 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 337d8e3dd837..b720870c04b2 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -2880,16 +2880,19 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>       struct page *page;
>>       pgtable_t pgtable;
>>       pmd_t old_pmd, _pmd;
>> -    bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
>> -    bool anon_exclusive = false, dirty = false;
>> +    bool young, write, soft_dirty, uffd_wp = false;
>> +    bool anon_exclusive = false, dirty = false, present = false;
>>       unsigned long addr;
>>       pte_t *pte;
>>       int i;
>> +    swp_entry_t swp_entry;
>>         VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>       VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>       VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>> -    VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>> +
>> +    VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
>> +            !is_pmd_device_private_entry(*pmd));
>>   
> 
> Indentation. But I do wonder if we want a helper to do a more
> efficient
> 

The indentation at my end is fine; do you mean you want to see the conditions aligned?

> is_pmd_migration_entry() || is_pmd_device_private_entry()
> 
> If only I could come up with a good name ... any ideas?
> 
> is_non_present_folio_entry()
> 
> maybe?
> 

There is is_pfn_swap_entry(), but that includes hwpoison entries as well.


> Well, there is device-exclusive .... but that would not be reachable on these paths yet, ever.
> 
> 
>>       count_vm_event(THP_SPLIT_PMD);
>>   @@ -2937,18 +2940,43 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>           return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>       }
>>   -    pmd_migration = is_pmd_migration_entry(*pmd);
>> -    if (unlikely(pmd_migration)) {
>> -        swp_entry_t entry;
>>   +    present = pmd_present(*pmd);
>> +    if (unlikely(!present)) {
> 
> I hate this whole function. But maybe in this case it's better
> to just have here
> 
> if (is_pmd_migration_entry(old_pmd)) {
> 
> } else if (is_pmd_device_private_entry(old_pmd)) {
> 
> There is not much shared code and this helps reduce the indentation level.
> 

We can definitely try this

>> +        swp_entry = pmd_to_swp_entry(*pmd);
>>           old_pmd = *pmd;
>> -        entry = pmd_to_swp_entry(old_pmd);
>> -        page = pfn_swap_entry_to_page(entry);
>> -        write = is_writable_migration_entry(entry);
>> -        if (PageAnon(page))
>> -            anon_exclusive = is_readable_exclusive_migration_entry(entry);
>> -        young = is_migration_entry_young(entry);
>> -        dirty = is_migration_entry_dirty(entry);
>> +
>> +        folio = pfn_swap_entry_folio(swp_entry);
>> +        VM_WARN_ON(!is_migration_entry(swp_entry) &&
>> +                !is_device_private_entry(swp_entry));
> 
> Indentation.

Same as above; checkpatch.pl does not seem to complain.

> 
>> +        page = pfn_swap_entry_to_page(swp_entry);
>> +
>> +        if (is_pmd_migration_entry(old_pmd)) {
>> +            write = is_writable_migration_entry(swp_entry);
>> +            if (PageAnon(page))
>> +                anon_exclusive =
>> +                    is_readable_exclusive_migration_entry(
>> +                                swp_entry);
> 
> Single line please, this is unreadable.
> 

Sure, will do. I double-checked and it seems like 100 columns is fine as per checkpatch.pl.

>> +            young = is_migration_entry_young(swp_entry);
>> +            dirty = is_migration_entry_dirty(swp_entry);
>> +        } else if (is_pmd_device_private_entry(old_pmd)) {
>> +            write = is_writable_device_private_entry(swp_entry);
>> +            anon_exclusive = PageAnonExclusive(page);
>> +            if (freeze && anon_exclusive &&
>> +                folio_try_share_anon_rmap_pmd(folio, page))
>> +                freeze = false;
>> +            if (!freeze) {
>> +                rmap_t rmap_flags = RMAP_NONE;
>> +
>> +                folio_ref_add(folio, HPAGE_PMD_NR - 1);
>> +                if (anon_exclusive)
>> +                    rmap_flags |= RMAP_EXCLUSIVE;
>> +
>> +                folio_add_anon_rmap_ptes(folio, page, HPAGE_PMD_NR,
>> +                         vma, haddr, rmap_flags);
>> +            }
>> +        }
>> +
>>           soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>           uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>       } else {
>> @@ -3034,30 +3062,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>        * Note that NUMA hinting access restrictions are not transferred to
>>        * avoid any possibility of altering permissions across VMAs.
>>        */
>> -    if (freeze || pmd_migration) {
>> +    if (freeze || !present) {
> 
> Here too, I wonder if we should just handle device-private completely separately for now.
> 

For all practical purposes they are inside the if statement. freeze and is_migration need to go together.


>>           for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
>>               pte_t entry;
>> -            swp_entry_t swp_entry;
>> -
>> -            if (write)
>> -                swp_entry = make_writable_migration_entry(
>> -                            page_to_pfn(page + i));
>> -            else if (anon_exclusive)
>> -                swp_entry = make_readable_exclusive_migration_entry(
>> -                            page_to_pfn(page + i));
>> -            else
>> -                swp_entry = make_readable_migration_entry(
>> -                            page_to_pfn(page + i));
>> -            if (young)
>> -                swp_entry = make_migration_entry_young(swp_entry);
>> -            if (dirty)
>> -                swp_entry = make_migration_entry_dirty(swp_entry);
>> -            entry = swp_entry_to_pte(swp_entry);
>> -            if (soft_dirty)
>> -                entry = pte_swp_mksoft_dirty(entry);
>> -            if (uffd_wp)
>> -                entry = pte_swp_mkuffd_wp(entry);
>> -
>> +            if (freeze || is_migration_entry(swp_entry)) {
>> +                if (write)
>> +                    swp_entry = make_writable_migration_entry(
>> +                                page_to_pfn(page + i));
>> +                else if (anon_exclusive)
>> +                    swp_entry = make_readable_exclusive_migration_entry(
>> +                                page_to_pfn(page + i));
>> +                else
>> +                    swp_entry = make_readable_migration_entry(
>> +                                page_to_pfn(page + i));
>> +                if (young)
>> +                    swp_entry = make_migration_entry_young(swp_entry);
>> +                if (dirty)
>> +                    swp_entry = make_migration_entry_dirty(swp_entry);
>> +                entry = swp_entry_to_pte(swp_entry);
>> +                if (soft_dirty)
>> +                    entry = pte_swp_mksoft_dirty(entry);
>> +                if (uffd_wp)
>> +                    entry = pte_swp_mkuffd_wp(entry);
>> +            } else {
>> +                /*
>> +                 * anon_exclusive was already propagated to the relevant
>> +                 * pages corresponding to the pte entries when freeze
>> +                 * is false.
>> +                 */
>> +                if (write)
>> +                    swp_entry = make_writable_device_private_entry(
>> +                                page_to_pfn(page + i));
>> +                else
>> +                    swp_entry = make_readable_device_private_entry(
>> +                                page_to_pfn(page + i));
>> +                /*
>> +                 * Young and dirty bits are not propagated via swp_entry
>> +                 */
>> +                entry = swp_entry_to_pte(swp_entry);
>> +                if (soft_dirty)
>> +                    entry = pte_swp_mksoft_dirty(entry);
>> +                if (uffd_wp)
>> +                    entry = pte_swp_mkuffd_wp(entry);
>> +            }
>>               VM_WARN_ON(!pte_none(ptep_get(pte + i)));
>>               set_pte_at(mm, addr, pte + i, entry);
>>           }
>> @@ -3084,7 +3131,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>       }
>>       pte_unmap(pte);
>>   -    if (!pmd_migration)
>> +    if (!is_pmd_migration_entry(*pmd))
>>           folio_remove_rmap_pmd(folio, page, vma);
>>       if (freeze)
>>           put_page(page);
>> @@ -3096,8 +3143,10 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>   void split_huge_pmd_locked(struct vm_area_struct *vma, unsigned long address,
>>                  pmd_t *pmd, bool freeze)
>>   {
>> +
> 
> Unrelated change.
> 

Will do

Thanks!
Balbir
Re: [v5 04/15] mm/huge_memory: implement device-private THP splitting
Posted by David Hildenbrand 4 months, 3 weeks ago
[...]

>>>          VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
>>>        VM_BUG_ON_VMA(vma->vm_start > haddr, vma);
>>>        VM_BUG_ON_VMA(vma->vm_end < haddr + HPAGE_PMD_SIZE, vma);
>>> -    VM_BUG_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd));
>>> +
>>> +    VM_WARN_ON(!is_pmd_migration_entry(*pmd) && !pmd_trans_huge(*pmd) &&
>>> +            !is_pmd_device_private_entry(*pmd));
>>>    
>>
>> Indentation. But I do wonder if we want a helper to do a more
>> efficient
>>
> 
> The indentation at my end is fine; do you mean you want to see the conditions aligned?
> 

If that's the case then all good (sometimes the added +/-/" " in the 
diff mess it up and confuse me)

>> is_pmd_migration_entry() || is_pmd_device_private_entry()
>>
>> If only I could come up with a good name ... any ideas?
>>
>> is_non_present_folio_entry()
>>
>> maybe?
>>
> 
> There is is_pfn_swap_entry(), but that includes hwpoison entries as well.

Yes, we could just use that I guess. Or alternatively add a

is_non_present_folio_entry() that does not include hwpoison (because 
they are special).
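
For the record, that alternative would be tiny: basically is_pfn_swap_entry()
without the hwpoison (and device-exclusive) cases. Name and exact form are of
course still open:

static inline bool is_non_present_folio_entry(swp_entry_t entry)
{
	return is_migration_entry(entry) || is_device_private_entry(entry);
}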

> 
> 
>> Well, there is device-exclusive .... but that would not be reachable on these paths yet, ever.
>>
>>
>>>        count_vm_event(THP_SPLIT_PMD);
>>>    @@ -2937,18 +2940,43 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>            return __split_huge_zero_page_pmd(vma, haddr, pmd);
>>>        }
>>>    -    pmd_migration = is_pmd_migration_entry(*pmd);
>>> -    if (unlikely(pmd_migration)) {
>>> -        swp_entry_t entry;
>>>    +    present = pmd_present(*pmd);
>>> +    if (unlikely(!present)) {
>>
>> I hate this whole function. But maybe in this case it's better
>> to just have here
>>
>> if (is_pmd_migration_entry(old_pmd)) {
>>
>> } else if (is_pmd_device_private_entry(old_pmd)) {
>>
>> There is not much shared code and this helps reduce the indentation level.
>>
> 
> We can definitely try this
> 
>>> +        swp_entry = pmd_to_swp_entry(*pmd);
>>>            old_pmd = *pmd;
>>> -        entry = pmd_to_swp_entry(old_pmd);
>>> -        page = pfn_swap_entry_to_page(entry);
>>> -        write = is_writable_migration_entry(entry);
>>> -        if (PageAnon(page))
>>> -            anon_exclusive = is_readable_exclusive_migration_entry(entry);
>>> -        young = is_migration_entry_young(entry);
>>> -        dirty = is_migration_entry_dirty(entry);
>>> +
>>> +        folio = pfn_swap_entry_folio(swp_entry);
>>> +        VM_WARN_ON(!is_migration_entry(swp_entry) &&
>>> +                !is_device_private_entry(swp_entry));
>>
>> Indentation.
> 
> Same as above; checkpatch.pl does not seem to complain.

Make sure that both "!" are equally indented. checkpatch does not catch 
what we usually do in MM (if you read other code).

[...]

>>>            soft_dirty = pmd_swp_soft_dirty(old_pmd);
>>>            uffd_wp = pmd_swp_uffd_wp(old_pmd);
>>>        } else {
>>> @@ -3034,30 +3062,49 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
>>>         * Note that NUMA hinting access restrictions are not transferred to
>>>         * avoid any possibility of altering permissions across VMAs.
>>>         */
>>> -    if (freeze || pmd_migration) {
>>> +    if (freeze || !present) {
>>
>> Here too, I wonder if we should just handle device-private completely separately for now.
>>
> 
> For all practical purposes they are inside the if statement. freeze and is_migration need to go together.
> 

Right, but there is only the loop part shared between

freeze || is_migration_entry(swp_entry)

and device_private entries.

So we can just avoid that churn and handle device_private entries 
separately, right?
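
Untested sketch of the separate device-private leg, which is essentially the
else branch of the PTE loop in the patch pulled out on its own:

	/* device-private PMD: one device-private PTE per subpage */
	for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
		pte_t entry;

		if (write)
			swp_entry = make_writable_device_private_entry(page_to_pfn(page + i));
		else
			swp_entry = make_readable_device_private_entry(page_to_pfn(page + i));
		/* young/dirty are not carried by device-private swap entries */
		entry = swp_entry_to_pte(swp_entry);
		if (soft_dirty)
			entry = pte_swp_mksoft_dirty(entry);
		if (uffd_wp)
			entry = pte_swp_mkuffd_wp(entry);
		VM_WARN_ON(!pte_none(ptep_get(pte + i)));
		set_pte_at(mm, addr, pte + i, entry);
	}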


-- 
Cheers

David / dhildenb