[v2] 1G page support for guest_memfd

[RFC PATCH v2 22/51] mm: hugetlb: Refactor hugetlb allocation functions

Posted by Ackerley Tng 8 months, 4 weeks ago

Refactor dequeue_hugetlb_folio() and alloc_surplus_hugetlb_folio() to
take mpol, nid and nodemask. This decouples allocation of a folio from
a vma.

Signed-off-by: Ackerley Tng <ackerleytng@google.com>
Change-Id: I890fb46fe8c6349383d8cf89befc68a4994eb416
---
 mm/hugetlb.c | 64 ++++++++++++++++++++++++----------------------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 5cc261b90e39..29d1a3fb10df 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1364,34 +1364,22 @@ static unsigned long available_huge_pages(struct hstate *h)
 	return h->free_huge_pages - h->resv_huge_pages;
 }
 
-static struct folio *dequeue_hugetlb_folio(struct hstate *h,
-					   struct vm_area_struct *vma,
-					   unsigned long address)
+static struct folio *dequeue_hugetlb_folio(struct hstate *h, gfp_t gfp_mask,
+					   struct mempolicy *mpol,
+					   int nid, nodemask_t *nodemask)
 {
 	struct folio *folio = NULL;
-	struct mempolicy *mpol;
-	gfp_t gfp_mask;
-	nodemask_t *nodemask;
-	pgoff_t ilx;
-	int nid;
-
-	gfp_mask = htlb_alloc_mask(h);
-	mpol = get_vma_policy(vma, address, h->order, &ilx);
-	nid = policy_node_nodemask(mpol, gfp_mask, ilx, &nodemask);
 
 	if (mpol_is_preferred_many(mpol)) {
-		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
-							nid, nodemask);
+		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask);
 
 		/* Fallback to all nodes if page==NULL */
 		nodemask = NULL;
 	}
 
 	if (!folio)
-		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask,
-							nid, nodemask);
+		folio = dequeue_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask);
 
-	mpol_cond_put(mpol);
 	return folio;
 }
 
@@ -2312,21 +2300,14 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
 }
 
 /*
- * Use the VMA's mpolicy to allocate a huge page from the buddy.
+ * Allocate a huge page from the buddy allocator given memory policy and node information.
  */
 static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
-						 struct vm_area_struct *vma,
-						 unsigned long addr)
+						 gfp_t gfp_mask,
+						 struct mempolicy *mpol,
+						 int nid, nodemask_t *nodemask)
 {
 	struct folio *folio = NULL;
-	struct mempolicy *mpol;
-	gfp_t gfp_mask = htlb_alloc_mask(h);
-	int nid;
-	nodemask_t *nodemask;
-	pgoff_t ilx;
-
-	mpol = get_vma_policy(vma, addr, h->order, &ilx);
-	nid = policy_node_nodemask(mpol, gfp_mask, ilx, &nodemask);
 
 	if (mpol_is_preferred_many(mpol)) {
 		gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
@@ -2339,7 +2320,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
 
 	if (!folio)
 		folio = alloc_surplus_hugetlb_folio_nodemask(h, gfp_mask, nid, nodemask);
-	mpol_cond_put(mpol);
+
 	return folio;
 }
 
@@ -2993,6 +2974,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 	int ret, idx;
 	struct hugetlb_cgroup *h_cg = NULL;
 	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
+	struct mempolicy *mpol;
+	nodemask_t *nodemask;
+	gfp_t gfp_mask;
+	pgoff_t ilx;
+	int nid;
 
 	idx = hstate_index(h);
 
@@ -3032,7 +3018,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 
 		subpool_reservation_exists = npages_req == 0;
 	}
-
 	reservation_exists = vma_reservation_exists || subpool_reservation_exists;
 
 	/*
@@ -3048,21 +3033,30 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 			goto out_subpool_put;
 	}
 
+	mpol = get_vma_policy(vma, addr, h->order, &ilx);
+
 	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
-	if (ret)
+	if (ret) {
+		mpol_cond_put(mpol);
 		goto out_uncharge_cgroup_reservation;
+	}
+
+	gfp_mask = htlb_alloc_mask(h);
+	nid = policy_node_nodemask(mpol, gfp_mask, ilx, &nodemask);
 
 	spin_lock_irq(&hugetlb_lock);
 
 	folio = NULL;
 	if (reservation_exists || available_huge_pages(h))
-		folio = dequeue_hugetlb_folio(h, vma, addr);
+		folio = dequeue_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
 
 	if (!folio) {
 		spin_unlock_irq(&hugetlb_lock);
-		folio = alloc_surplus_hugetlb_folio(h, vma, addr);
-		if (!folio)
+		folio = alloc_surplus_hugetlb_folio(h, gfp_mask, mpol, nid, nodemask);
+		if (!folio) {
+			mpol_cond_put(mpol);
 			goto out_uncharge_cgroup;
+		}
 		spin_lock_irq(&hugetlb_lock);
 		list_add(&folio->lru, &h->hugepage_activelist);
 		folio_ref_unfreeze(folio, 1);
@@ -3087,6 +3081,8 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
 
 	spin_unlock_irq(&hugetlb_lock);
 
+	mpol_cond_put(mpol);
+
 	hugetlb_set_folio_subpool(folio, spool);
 
 	/* If vma accounting wasn't bypassed earlier, follow up with commit. */
-- 
2.49.0.1045.g170613ef41-goog

Re: [RFC PATCH v2 22/51] mm: hugetlb: Refactor hugetlb allocation functions

Posted by Ira Weiny 8 months, 1 week ago

Ackerley Tng wrote:
> Refactor dequeue_hugetlb_folio() and alloc_surplus_hugetlb_folio() to
> take mpol, nid and nodemask. This decouples allocation of a folio from
> a vma.
> 
> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
> Change-Id: I890fb46fe8c6349383d8cf89befc68a4994eb416
> ---
>  mm/hugetlb.c | 64 ++++++++++++++++++++++++----------------------------
>  1 file changed, 30 insertions(+), 34 deletions(-)
> 

[snip]

>  
> @@ -2993,6 +2974,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  	int ret, idx;
>  	struct hugetlb_cgroup *h_cg = NULL;
>  	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
> +	struct mempolicy *mpol;
> +	nodemask_t *nodemask;
> +	gfp_t gfp_mask;
> +	pgoff_t ilx;
> +	int nid;
>  
>  	idx = hstate_index(h);
>  
> @@ -3032,7 +3018,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  
>  		subpool_reservation_exists = npages_req == 0;
>  	}
> -
>  	reservation_exists = vma_reservation_exists || subpool_reservation_exists;
>  
>  	/*
> @@ -3048,21 +3033,30 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>  			goto out_subpool_put;
>  	}
>  
> +	mpol = get_vma_policy(vma, addr, h->order, &ilx);

Why does the memory policy need to be acquired here instead of after the
cgroup charge?  AFAICT acquiring it this early is not needed, and moving it
after the charge would eliminate at least one of the error-path puts.

> +
>  	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
> -	if (ret)
> +	if (ret) {
> +		mpol_cond_put(mpol);
                ^^^^
		here

All that said, I think the use of some of the new cleanup macros could
really help a lot of this code.

What do folks in this area of the kernel think of those?

Ira

[snip]

Re: [RFC PATCH v2 22/51] mm: hugetlb: Refactor hugetlb allocation functions

Posted by Ackerley Tng 7 months, 4 weeks ago

Ira Weiny <ira.weiny@intel.com> writes:

> Ackerley Tng wrote:
>> Refactor dequeue_hugetlb_folio() and alloc_surplus_hugetlb_folio() to
>> take mpol, nid and nodemask. This decouples allocation of a folio from
>> a vma.
>> 
>> Signed-off-by: Ackerley Tng <ackerleytng@google.com>
>> Change-Id: I890fb46fe8c6349383d8cf89befc68a4994eb416
>> ---
>>  mm/hugetlb.c | 64 ++++++++++++++++++++++++----------------------------
>>  1 file changed, 30 insertions(+), 34 deletions(-)
>> 
>
> [snip]
>
>>  
>> @@ -2993,6 +2974,11 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>>  	int ret, idx;
>>  	struct hugetlb_cgroup *h_cg = NULL;
>>  	gfp_t gfp = htlb_alloc_mask(h) | __GFP_RETRY_MAYFAIL;
>> +	struct mempolicy *mpol;
>> +	nodemask_t *nodemask;
>> +	gfp_t gfp_mask;
>> +	pgoff_t ilx;
>> +	int nid;
>>  
>>  	idx = hstate_index(h);
>>  
>> @@ -3032,7 +3018,6 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>>  
>>  		subpool_reservation_exists = npages_req == 0;
>>  	}
>> -
>>  	reservation_exists = vma_reservation_exists || subpool_reservation_exists;
>>  
>>  	/*
>> @@ -3048,21 +3033,30 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma,
>>  			goto out_subpool_put;
>>  	}
>>  
>> +	mpol = get_vma_policy(vma, addr, h->order, &ilx);
>
> Why does the memory policy need to be acquired here instead of after the
> cgroup charge?  AFAICT acquiring it this early is not needed, and moving it
> after the charge would eliminate at least one of the error-path puts.
>

I was hoping that by acquiring the mpol this early, the upcoming
refactoring that splits out hugetlb_alloc_folio() would look like a nice,
clean removal of the middle of this function, leaving the acquisition of
the mpol and the mpol_cond_put() in place.

In the next revision I'm splitting up the refactoring in this patch
further, so if this is still an issue a few revisions from now, I can fix
it then.

>> +
>>  	ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
>> -	if (ret)
>> +	if (ret) {
>> +		mpol_cond_put(mpol);
>                 ^^^^
> 		here
>
> All that said I think the use of some new cleanup macros could really help
> a lot of this code.
>

I'm happy to try that out...

> What do folks in this area of the kernel think of those?
>

not sure though.

> Ira
>
> [snip]