[PATCH v5 1/2] mm: Abstract THP allocation

Posted by Dev Jain 2 months ago
In preparation for the second patch, abstract away the THP allocation
logic present in the create_huge_pmd() path, which corresponds to the
faulting case when no page is present.

There should be no functional change as a result of applying this patch,
except that, as David notes at [1], a PMD-aligned address should
be passed to update_mmu_cache_pmd().

[1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/

Acked-by: David Hildenbrand <david@redhat.com> 
Signed-off-by: Dev Jain <dev.jain@arm.com>
---
 mm/huge_memory.c | 98 ++++++++++++++++++++++++++++--------------------
 1 file changed, 57 insertions(+), 41 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 4e34b7f89daf..bdbf67c18f6c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1148,47 +1148,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
 
-static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
-			struct page *page, gfp_t gfp)
+static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
+					      unsigned long addr)
 {
-	struct vm_area_struct *vma = vmf->vma;
-	struct folio *folio = page_folio(page);
-	pgtable_t pgtable;
-	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-	vm_fault_t ret = 0;
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	gfp_t gfp = vma_thp_gfp_mask(vma);
+	const int order = HPAGE_PMD_ORDER;
+	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
 
-	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+	if (unlikely(!folio)) {
+		count_vm_event(THP_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		goto out;
+	}
 
+	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
 	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
 		folio_put(folio);
 		count_vm_event(THP_FAULT_FALLBACK);
 		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
-		return VM_FAULT_FALLBACK;
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
+		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
+		return NULL;
 	}
 	folio_throttle_swaprate(folio, gfp);
 
-	pgtable = pte_alloc_one(vma->vm_mm);
-	if (unlikely(!pgtable)) {
-		ret = VM_FAULT_OOM;
-		goto release;
-	}
-
-	folio_zero_user(folio, vmf->address);
+	folio_zero_user(folio, addr);
 	/*
 	 * The memory barrier inside __folio_mark_uptodate makes sure that
 	 * folio_zero_user writes become visible before the set_pmd_at()
 	 * write.
 	 */
 	__folio_mark_uptodate(folio);
+out:
+	return folio;
+}
+
+static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
+			       struct vm_area_struct *vma, unsigned long haddr)
+{
+	pmd_t entry;
+
+	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
+	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
+	folio_add_lru_vma(folio, vma);
+	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, haddr, pmd);
+	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+	count_vm_event(THP_FAULT_ALLOC);
+	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
+	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
+}
+
+static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
+{
+	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	struct vm_area_struct *vma = vmf->vma;
+	struct folio *folio;
+	pgtable_t pgtable;
+	vm_fault_t ret = 0;
+
+	folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
+	if (unlikely(!folio))
+		return VM_FAULT_FALLBACK;
+
+	pgtable = pte_alloc_one(vma->vm_mm);
+	if (unlikely(!pgtable)) {
+		ret = VM_FAULT_OOM;
+		goto release;
+	}
 
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_none(*vmf->pmd))) {
 		goto unlock_release;
 	} else {
-		pmd_t entry;
-
 		ret = check_stable_address_space(vma->vm_mm);
 		if (ret)
 			goto unlock_release;
@@ -1202,21 +1236,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
 			return ret;
 		}
-
-		entry = mk_huge_pmd(page, vma->vm_page_prot);
-		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
-		folio_add_lru_vma(folio, vma);
 		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+		map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
 		mm_inc_nr_ptes(vma->vm_mm);
 		deferred_split_folio(folio, false);
 		spin_unlock(vmf->ptl);
-		count_vm_event(THP_FAULT_ALLOC);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
-		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
 	}
 
 	return 0;
@@ -1283,8 +1307,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
 vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
-	gfp_t gfp;
-	struct folio *folio;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 	vm_fault_t ret;
 
@@ -1335,14 +1357,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		}
 		return ret;
 	}
-	gfp = vma_thp_gfp_mask(vma);
-	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
-	if (unlikely(!folio)) {
-		count_vm_event(THP_FAULT_FALLBACK);
-		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
-		return VM_FAULT_FALLBACK;
-	}
-	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
+
+	return __do_huge_pmd_anonymous_page(vmf);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
-- 
2.30.2
Re: [PATCH v5 1/2] mm: Abstract THP allocation
Posted by Kefeng Wang 2 months ago

On 2024/9/24 18:16, Dev Jain wrote:
> In preparation for the second patch, abstract away the THP allocation
> logic present in the create_huge_pmd() path, which corresponds to the
> faulting case when no page is present.
> 
> There should be no functional change as a result of applying this patch,
> except that, as David notes at [1], a PMD-aligned address should
> be passed to update_mmu_cache_pmd().
> 
> [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/
> 
> Acked-by: David Hildenbrand <david@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> ---
>   mm/huge_memory.c | 98 ++++++++++++++++++++++++++++--------------------
>   1 file changed, 57 insertions(+), 41 deletions(-)
> 
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index 4e34b7f89daf..bdbf67c18f6c 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1148,47 +1148,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
>   }
>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>   
> -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
> -			struct page *page, gfp_t gfp)
> +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
> +					      unsigned long addr)
>   {
> -	struct vm_area_struct *vma = vmf->vma;
> -	struct folio *folio = page_folio(page);
> -	pgtable_t pgtable;
> -	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> -	vm_fault_t ret = 0;
> +	unsigned long haddr = addr & HPAGE_PMD_MASK;
> +	gfp_t gfp = vma_thp_gfp_mask(vma);
> +	const int order = HPAGE_PMD_ORDER;
> +	struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);

There is a warning without NUMA,

../mm/huge_memory.c: In function ‘vma_alloc_anon_folio_pmd’:
../mm/huge_memory.c:1154:16: warning: unused variable ‘haddr’ [-Wunused-variable]
  1154 |  unsigned long haddr = addr & HPAGE_PMD_MASK;
       |                ^~~~~


diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c584e77efe10..147a6e069c71 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1151,11 +1151,11 @@ EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
  static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma,
                                               unsigned long addr)
  {
-       unsigned long haddr = addr & HPAGE_PMD_MASK;
         gfp_t gfp = vma_thp_gfp_mask(vma);
         const int order = HPAGE_PMD_ORDER;
-       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, true);
+       struct folio *folio;

+       folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true);
         if (unlikely(!folio)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);

>   
> -	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
> +	if (unlikely(!folio)) {
> +		count_vm_event(THP_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		goto out;

Maybe return NULL to omit the out?
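
I.e. something like this (untested, just to illustrate the early return):

	if (unlikely(!folio)) {
		count_vm_event(THP_FAULT_FALLBACK);
		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
		return NULL;
	}
	...
	__folio_mark_uptodate(folio);
	return folio;
}

Then the out label and the goto above it are no longer needed.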


Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>


> +	}
>   
> +	VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>   	if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>   		folio_put(folio);
>   		count_vm_event(THP_FAULT_FALLBACK);
>   		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> -		return VM_FAULT_FALLBACK;
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
> +		count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
> +		return NULL;
>   	}
>   	folio_throttle_swaprate(folio, gfp);
>   
> -	pgtable = pte_alloc_one(vma->vm_mm);
> -	if (unlikely(!pgtable)) {
> -		ret = VM_FAULT_OOM;
> -		goto release;
> -	}
> -
> -	folio_zero_user(folio, vmf->address);
> +	folio_zero_user(folio, addr);
>   	/*
>   	 * The memory barrier inside __folio_mark_uptodate makes sure that
>   	 * folio_zero_user writes become visible before the set_pmd_at()
>   	 * write.
>   	 */
>   	__folio_mark_uptodate(folio);
> +out:
> +	return folio;
> +}
> +
> +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
> +			       struct vm_area_struct *vma, unsigned long haddr)
> +{
> +	pmd_t entry;
> +
> +	entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
> +	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> +	folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> +	folio_add_lru_vma(folio, vma);
> +	set_pmd_at(vma->vm_mm, haddr, pmd, entry);
> +	update_mmu_cache_pmd(vma, haddr, pmd);
> +	add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +	count_vm_event(THP_FAULT_ALLOC);
> +	count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> +	count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
> +}
> +
> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
> +{
> +	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
> +	struct vm_area_struct *vma = vmf->vma;
> +	struct folio *folio;
> +	pgtable_t pgtable;
> +	vm_fault_t ret = 0;
> +
> +	folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
> +	if (unlikely(!folio))
> +		return VM_FAULT_FALLBACK;
> +
> +	pgtable = pte_alloc_one(vma->vm_mm);
> +	if (unlikely(!pgtable)) {
> +		ret = VM_FAULT_OOM;
> +		goto release;
> +	}
>   
>   	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>   	if (unlikely(!pmd_none(*vmf->pmd))) {
>   		goto unlock_release;
>   	} else {
> -		pmd_t entry;
> -
>   		ret = check_stable_address_space(vma->vm_mm);
>   		if (ret)
>   			goto unlock_release;
> @@ -1202,21 +1236,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>   			VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>   			return ret;
>   		}
> -
> -		entry = mk_huge_pmd(page, vma->vm_page_prot);
> -		entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
> -		folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
> -		folio_add_lru_vma(folio, vma);
>   		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> -		set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
> -		update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
> -		add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
> +		map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
>   		mm_inc_nr_ptes(vma->vm_mm);
>   		deferred_split_folio(folio, false);
>   		spin_unlock(vmf->ptl);
> -		count_vm_event(THP_FAULT_ALLOC);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
> -		count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>   	}
>   
>   	return 0;
> @@ -1283,8 +1307,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm,
>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>   {
>   	struct vm_area_struct *vma = vmf->vma;
> -	gfp_t gfp;
> -	struct folio *folio;
>   	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>   	vm_fault_t ret;
>   
> @@ -1335,14 +1357,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>   		}
>   		return ret;
>   	}
> -	gfp = vma_thp_gfp_mask(vma);
> -	folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
> -	if (unlikely(!folio)) {
> -		count_vm_event(THP_FAULT_FALLBACK);
> -		count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK);
> -		return VM_FAULT_FALLBACK;
> -	}
> -	return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
> +
> +	return __do_huge_pmd_anonymous_page(vmf);
>   }
>   
>   static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,

Re: [PATCH v5 1/2] mm: Abstract THP allocation
Posted by Dev Jain 2 months ago
On 9/24/24 16:50, Kefeng Wang wrote:
>
>
> On 2024/9/24 18:16, Dev Jain wrote:
>> In preparation for the second patch, abstract away the THP allocation
>> logic present in the create_huge_pmd() path, which corresponds to the
>> faulting case when no page is present.
>>
>> There should be no functional change as a result of applying this patch,
>> except that, as David notes at [1], a PMD-aligned address should
>> be passed to update_mmu_cache_pmd().
>>
>> [1]: 
>> https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/
>>
>> Acked-by: David Hildenbrand <david@redhat.com>
>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>> ---
>>   mm/huge_memory.c | 98 ++++++++++++++++++++++++++++--------------------
>>   1 file changed, 57 insertions(+), 41 deletions(-)
>>
>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>> index 4e34b7f89daf..bdbf67c18f6c 100644
>> --- a/mm/huge_memory.c
>> +++ b/mm/huge_memory.c
>> @@ -1148,47 +1148,81 @@ unsigned long thp_get_unmapped_area(struct 
>> file *filp, unsigned long addr,
>>   }
>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>   -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>> -            struct page *page, gfp_t gfp)
>> +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct 
>> *vma,
>> +                          unsigned long addr)
>>   {
>> -    struct vm_area_struct *vma = vmf->vma;
>> -    struct folio *folio = page_folio(page);
>> -    pgtable_t pgtable;
>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> -    vm_fault_t ret = 0;
>> +    unsigned long haddr = addr & HPAGE_PMD_MASK;
>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>> +    const int order = HPAGE_PMD_ORDER;
>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
>> true);
>
> There is a warning without NUMA,
>
> ../mm/huge_memory.c: In function ‘vma_alloc_anon_folio_pmd’:
> ../mm/huge_memory.c:1154:16: warning: unused variable ‘haddr’ 
> [-Wunused-variable]
>  1154 |  unsigned long haddr = addr & HPAGE_PMD_MASK;
>       |                ^~~~~
>

But why is this happening?

>
> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
> index c584e77efe10..147a6e069c71 100644
> --- a/mm/huge_memory.c
> +++ b/mm/huge_memory.c
> @@ -1151,11 +1151,11 @@ EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>  static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct 
> *vma,
>                                               unsigned long addr)
>  {
> -       unsigned long haddr = addr & HPAGE_PMD_MASK;
>         gfp_t gfp = vma_thp_gfp_mask(vma);
>         const int order = HPAGE_PMD_ORDER;
> -       struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
> true);
> +       struct folio *folio;
>
> +       folio = vma_alloc_folio(gfp, order, vma, addr & 
> HPAGE_PMD_MASK, true);
>         if (unlikely(!folio)) {
>                 count_vm_event(THP_FAULT_FALLBACK);
>                 count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>
>>   - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>> +    if (unlikely(!folio)) {
>> +        count_vm_event(THP_FAULT_FALLBACK);
>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +        goto out;
>
> Maybe return NULL to omit the out?

Ah sorry, I have made a mess of unnecessary delayed returns :)
>
>
> Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>

Thanks!
>
>
>> +    }
>>   +    VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
>>       if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
>>           folio_put(folio);
>>           count_vm_event(THP_FAULT_FALLBACK);
>>           count_vm_event(THP_FAULT_FALLBACK_CHARGE);
>> -        count_mthp_stat(HPAGE_PMD_ORDER, 
>> MTHP_STAT_ANON_FAULT_FALLBACK);
>> -        count_mthp_stat(HPAGE_PMD_ORDER, 
>> MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> -        return VM_FAULT_FALLBACK;
>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK);
>> +        count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
>> +        return NULL;
>>       }
>>       folio_throttle_swaprate(folio, gfp);
>>   -    pgtable = pte_alloc_one(vma->vm_mm);
>> -    if (unlikely(!pgtable)) {
>> -        ret = VM_FAULT_OOM;
>> -        goto release;
>> -    }
>> -
>> -    folio_zero_user(folio, vmf->address);
>> +    folio_zero_user(folio, addr);
>>       /*
>>        * The memory barrier inside __folio_mark_uptodate makes sure that
>>        * folio_zero_user writes become visible before the set_pmd_at()
>>        * write.
>>        */
>>       __folio_mark_uptodate(folio);
>> +out:
>> +    return folio;
>> +}
>> +
>> +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd,
>> +                   struct vm_area_struct *vma, unsigned long haddr)
>> +{
>> +    pmd_t entry;
>> +
>> +    entry = mk_huge_pmd(&folio->page, vma->vm_page_prot);
>> +    entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> +    folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>> +    folio_add_lru_vma(folio, vma);
>> +    set_pmd_at(vma->vm_mm, haddr, pmd, entry);
>> +    update_mmu_cache_pmd(vma, haddr, pmd);
>> +    add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> +    count_vm_event(THP_FAULT_ALLOC);
>> +    count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>> +    count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>> +}
>> +
>> +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>> +{
>> +    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>> +    struct vm_area_struct *vma = vmf->vma;
>> +    struct folio *folio;
>> +    pgtable_t pgtable;
>> +    vm_fault_t ret = 0;
>> +
>> +    folio = vma_alloc_anon_folio_pmd(vma, vmf->address);
>> +    if (unlikely(!folio))
>> +        return VM_FAULT_FALLBACK;
>> +
>> +    pgtable = pte_alloc_one(vma->vm_mm);
>> +    if (unlikely(!pgtable)) {
>> +        ret = VM_FAULT_OOM;
>> +        goto release;
>> +    }
>>         vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
>>       if (unlikely(!pmd_none(*vmf->pmd))) {
>>           goto unlock_release;
>>       } else {
>> -        pmd_t entry;
>> -
>>           ret = check_stable_address_space(vma->vm_mm);
>>           if (ret)
>>               goto unlock_release;
>> @@ -1202,21 +1236,11 @@ static vm_fault_t 
>> __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>               VM_BUG_ON(ret & VM_FAULT_FALLBACK);
>>               return ret;
>>           }
>> -
>> -        entry = mk_huge_pmd(page, vma->vm_page_prot);
>> -        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
>> -        folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE);
>> -        folio_add_lru_vma(folio, vma);
>>           pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
>> -        set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
>> -        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
>> -        add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
>> +        map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);
>>           mm_inc_nr_ptes(vma->vm_mm);
>>           deferred_split_folio(folio, false);
>>           spin_unlock(vmf->ptl);
>> -        count_vm_event(THP_FAULT_ALLOC);
>> -        count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
>> -        count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
>>       }
>>         return 0;
>> @@ -1283,8 +1307,6 @@ static void set_huge_zero_folio(pgtable_t 
>> pgtable, struct mm_struct *mm,
>>   vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
>>   {
>>       struct vm_area_struct *vma = vmf->vma;
>> -    gfp_t gfp;
>> -    struct folio *folio;
>>       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>       vm_fault_t ret;
>>   @@ -1335,14 +1357,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct 
>> vm_fault *vmf)
>>           }
>>           return ret;
>>       }
>> -    gfp = vma_thp_gfp_mask(vma);
>> -    folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
>> -    if (unlikely(!folio)) {
>> -        count_vm_event(THP_FAULT_FALLBACK);
>> -        count_mthp_stat(HPAGE_PMD_ORDER, 
>> MTHP_STAT_ANON_FAULT_FALLBACK);
>> -        return VM_FAULT_FALLBACK;
>> -    }
>> -    return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
>> +
>> +    return __do_huge_pmd_anonymous_page(vmf);
>>   }
>>     static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned 
>> long addr,
>
Re: [PATCH v5 1/2] mm: Abstract THP allocation
Posted by Kefeng Wang 2 months ago

On 2024/9/24 20:17, Dev Jain wrote:
> 
> On 9/24/24 16:50, Kefeng Wang wrote:
>>
>>
>> On 2024/9/24 18:16, Dev Jain wrote:
>>> In preparation for the second patch, abstract away the THP allocation
>>> logic present in the create_huge_pmd() path, which corresponds to the
>>> faulting case when no page is present.
>>>
>>> There should be no functional change as a result of applying this patch,
>>> except that, as David notes at [1], a PMD-aligned address should
>>> be passed to update_mmu_cache_pmd().
>>>
>>> [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170- 
>>> bcaa-2fe66e093f43@redhat.com/
>>>
>>> Acked-by: David Hildenbrand <david@redhat.com>
>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>> ---
>>>   mm/huge_memory.c | 98 ++++++++++++++++++++++++++++--------------------
>>>   1 file changed, 57 insertions(+), 41 deletions(-)
>>>
>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>> index 4e34b7f89daf..bdbf67c18f6c 100644
>>> --- a/mm/huge_memory.c
>>> +++ b/mm/huge_memory.c
>>> @@ -1148,47 +1148,81 @@ unsigned long thp_get_unmapped_area(struct 
>>> file *filp, unsigned long addr,
>>>   }
>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>   -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
>>> -            struct page *page, gfp_t gfp)
>>> +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct 
>>> *vma,
>>> +                          unsigned long addr)
>>>   {
>>> -    struct vm_area_struct *vma = vmf->vma;
>>> -    struct folio *folio = page_folio(page);
>>> -    pgtable_t pgtable;
>>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>> -    vm_fault_t ret = 0;
>>> +    unsigned long haddr = addr & HPAGE_PMD_MASK;
>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>> +    const int order = HPAGE_PMD_ORDER;
>>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
>>> true);
>>
>> There is a warning without NUMA,
>>
>> ../mm/huge_memory.c: In function ‘vma_alloc_anon_folio_pmd’:
>> ../mm/huge_memory.c:1154:16: warning: unused variable ‘haddr’ [- 
>> Wunused-variable]
>>  1154 |  unsigned long haddr = addr & HPAGE_PMD_MASK;
>>       |                ^~~~~
>>
> 
> But why is this happening?

If CONFIG_NUMA is not set, vma_alloc_folio(...) = folio_alloc_noprof(gfp, order),
so it won't use haddr.
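
Roughly, the ifdeffery in include/linux/gfp.h looks like this (paraphrased
from memory, not copied verbatim):

	#ifdef CONFIG_NUMA
	struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order,
			struct vm_area_struct *vma, unsigned long addr, bool hugepage);
	#else
	/* !NUMA: the vma/addr/hugepage arguments are dropped by the preprocessor */
	#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage)	\
		folio_alloc_noprof(gfp, order)
	#endif

	#define vma_alloc_folio(...)	alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))

So with !CONFIG_NUMA the addr argument vanishes at preprocessing time, and
the local haddr ends up unused.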


Re: [PATCH v5 1/2] mm: Abstract THP allocation
Posted by Dev Jain 2 months ago
On 9/24/24 18:24, Kefeng Wang wrote:
>
>
> On 2024/9/24 20:17, Dev Jain wrote:
>>
>> On 9/24/24 16:50, Kefeng Wang wrote:
>>>
>>>
>>> On 2024/9/24 18:16, Dev Jain wrote:
>>>> In preparation for the second patch, abstract away the THP allocation
>>>> logic present in the create_huge_pmd() path, which corresponds to the
>>>> faulting case when no page is present.
>>>>
>>>> There should be no functional change as a result of applying this 
>>>> patch,
>>>> except that, as David notes at [1], a PMD-aligned address should
>>>> be passed to update_mmu_cache_pmd().
>>>>
>>>> [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170- 
>>>> bcaa-2fe66e093f43@redhat.com/
>>>>
>>>> Acked-by: David Hildenbrand <david@redhat.com>
>>>> Signed-off-by: Dev Jain <dev.jain@arm.com>
>>>> ---
>>>>   mm/huge_memory.c | 98 
>>>> ++++++++++++++++++++++++++++--------------------
>>>>   1 file changed, 57 insertions(+), 41 deletions(-)
>>>>
>>>> diff --git a/mm/huge_memory.c b/mm/huge_memory.c
>>>> index 4e34b7f89daf..bdbf67c18f6c 100644
>>>> --- a/mm/huge_memory.c
>>>> +++ b/mm/huge_memory.c
>>>> @@ -1148,47 +1148,81 @@ unsigned long thp_get_unmapped_area(struct 
>>>> file *filp, unsigned long addr,
>>>>   }
>>>>   EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
>>>>   -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault 
>>>> *vmf,
>>>> -            struct page *page, gfp_t gfp)
>>>> +static struct folio *vma_alloc_anon_folio_pmd(struct 
>>>> vm_area_struct *vma,
>>>> +                          unsigned long addr)
>>>>   {
>>>> -    struct vm_area_struct *vma = vmf->vma;
>>>> -    struct folio *folio = page_folio(page);
>>>> -    pgtable_t pgtable;
>>>> -    unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
>>>> -    vm_fault_t ret = 0;
>>>> +    unsigned long haddr = addr & HPAGE_PMD_MASK;
>>>> +    gfp_t gfp = vma_thp_gfp_mask(vma);
>>>> +    const int order = HPAGE_PMD_ORDER;
>>>> +    struct folio *folio = vma_alloc_folio(gfp, order, vma, haddr, 
>>>> true);
>>>
>>> There is a warning without NUMA,
>>>
>>> ../mm/huge_memory.c: In function ‘vma_alloc_anon_folio_pmd’:
>>> ../mm/huge_memory.c:1154:16: warning: unused variable ‘haddr’ [- 
>>> Wunused-variable]
>>>  1154 |  unsigned long haddr = addr & HPAGE_PMD_MASK;
>>>       |                ^~~~~
>>>
>>
>> But why is this happening?
>
> If CONFIG_NUMA is not set, vma_alloc_folio(...) = folio_alloc_noprof(gfp, order),
> so it won't use haddr.

Ah got it, thanks, I missed the ifdeffery in include/linux/gfp.h.
>
>