Change hugepage_add_new_anon_rmap() to take in a folio directly. While
adding a folio variable to the four call sites, convert page functions
to their folio equivalents. This removes three callers of the Huge Page
macros which take in a page.
Signed-off-by: Sidhartha Kumar <sidhartha.kumar@oracle.com>
---
include/linux/rmap.h | 2 +-
mm/hugetlb.c | 90 +++++++++++++++++++++++++-------------------
mm/rmap.c | 6 +--
3 files changed, 55 insertions(+), 43 deletions(-)
diff --git a/include/linux/rmap.h b/include/linux/rmap.h
index a6bd1f0a183d..a4570da03e58 100644
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -203,7 +203,7 @@ void page_remove_rmap(struct page *, struct vm_area_struct *,
void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
unsigned long address, rmap_t flags);
-void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+void hugepage_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
unsigned long address);
static inline void __page_dup_rmap(struct page *page, bool compound)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index c37a26c8392c..6f25055c3ba5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4950,7 +4950,7 @@ hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long add
struct folio *new_folio)
{
__folio_mark_uptodate(new_folio);
- hugepage_add_new_anon_rmap(&new_folio->page, vma, addr);
+ hugepage_add_new_anon_rmap(new_folio, vma, addr);
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, &new_folio->page, 1));
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
folio_set_hugetlb_migratable(new_folio);
@@ -5479,6 +5479,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t pte;
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
+ struct folio *new_folio = NULL;
int outside_reserve = 0;
vm_fault_t ret = 0;
unsigned long haddr = address & huge_page_mask(h);
@@ -5590,6 +5591,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_release_old;
}
+ if (new_page)
+ new_folio = page_folio(new_page);
+
/*
* When the original hugepage is shared one, it does not have
* anon_vma prepared.
@@ -5599,9 +5603,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_release_all;
}
- copy_user_huge_page(new_page, old_page, address, vma,
+ copy_user_huge_page(&new_folio->page, old_page, address, vma,
pages_per_huge_page(h));
- __SetPageUptodate(new_page);
+ __folio_mark_uptodate(new_folio);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm, haddr,
haddr + huge_page_size(h));
@@ -5618,10 +5622,10 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
huge_ptep_clear_flush(vma, haddr, ptep);
mmu_notifier_invalidate_range(mm, range.start, range.end);
page_remove_rmap(old_page, vma, true);
- hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ hugepage_add_new_anon_rmap(new_folio, vma, haddr);
set_huge_pte_at(mm, haddr, ptep,
- make_huge_pte(vma, new_page, !unshare));
- SetHPageMigratable(new_page);
+ make_huge_pte(vma, &new_folio->page, !unshare));
+ folio_set_hugetlb_migratable(new_folio);
/* Make the old page be freed below */
new_page = old_page;
}
@@ -5633,8 +5637,8 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
* unshare)
*/
if (new_page != old_page)
- restore_reserve_on_error(h, vma, haddr, new_page);
- put_page(new_page);
+ restore_reserve_on_error(h, vma, haddr, &new_folio->page);
+ folio_put(new_folio);
out_release_old:
put_page(old_page);
@@ -5756,6 +5760,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
int anon_rmap = 0;
unsigned long size;
struct page *page;
+ struct folio *folio = NULL;
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
@@ -5833,12 +5838,16 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
ret = 0;
goto out;
}
- clear_huge_page(page, address, pages_per_huge_page(h));
- __SetPageUptodate(page);
+
+ if (page)
+ folio = page_folio(page);
+
+ clear_huge_page(&folio->page, address, pages_per_huge_page(h));
+ __folio_mark_uptodate(folio);
new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
- int err = hugetlb_add_to_page_cache(page, mapping, idx);
+ int err = hugetlb_add_to_page_cache(&folio->page, mapping, idx);
if (err) {
/*
* err can't be -EEXIST which implies someone
@@ -5847,13 +5856,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* to the page cache. So it's safe to call
* restore_reserve_on_error() here.
*/
- restore_reserve_on_error(h, vma, haddr, page);
- put_page(page);
+ restore_reserve_on_error(h, vma, haddr, &folio->page);
+ folio_put(folio);
goto out;
}
new_pagecache_page = true;
} else {
- lock_page(page);
+ folio_lock(folio);
if (unlikely(anon_vma_prepare(vma))) {
ret = VM_FAULT_OOM;
goto backout_unlocked;
@@ -5861,12 +5870,13 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
anon_rmap = 1;
}
} else {
+ folio = page_folio(page);
/*
* If memory error occurs between mmap() and fault, some process
* don't have hwpoisoned swap entry for errored virtual address.
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
- if (unlikely(PageHWPoison(page))) {
+ if (unlikely(folio_test_hwpoison(folio))) {
ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
@@ -5874,8 +5884,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
/* Check for page in userfault range. */
if (userfaultfd_minor(vma)) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
/* See comment in userfaultfd_missing() block above */
if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
ret = 0;
@@ -5909,10 +5919,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
goto backout;
if (anon_rmap)
- hugepage_add_new_anon_rmap(page, vma, haddr);
+ hugepage_add_new_anon_rmap(folio, vma, haddr);
else
- page_dup_file_rmap(page, true);
- new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
+ page_dup_file_rmap(&folio->page, true);
+ new_pte = make_huge_pte(vma, &folio->page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
/*
* If this pte was previously wr-protected, keep it wr-protected even
@@ -5925,7 +5935,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
hugetlb_count_add(pages_per_huge_page(h), mm);
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_wp(mm, vma, address, ptep, flags, page, ptl);
+ ret = hugetlb_wp(mm, vma, address, ptep, flags, &folio->page, ptl);
}
spin_unlock(ptl);
@@ -5936,9 +5946,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
* been isolated for migration.
*/
if (new_page)
- SetHPageMigratable(page);
+ folio_set_hugetlb_migratable(folio);
- unlock_page(page);
+ folio_unlock(folio);
out:
hugetlb_vma_unlock_read(vma);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -5948,10 +5958,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
spin_unlock(ptl);
backout_unlocked:
if (new_page && !new_pagecache_page)
- restore_reserve_on_error(h, vma, haddr, page);
+ restore_reserve_on_error(h, vma, haddr, &folio->page);
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
goto out;
}
@@ -6176,6 +6186,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spinlock_t *ptl;
int ret = -ENOMEM;
struct page *page;
+ struct folio *folio = NULL;
int writable;
bool page_in_pagecache = false;
@@ -6251,12 +6262,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
*pagep = NULL;
}
+ if (page)
+ folio = page_folio(page);
+
/*
- * The memory barrier inside __SetPageUptodate makes sure that
+ * The memory barrier inside __folio_mark_uptodate makes sure that
* preceding stores to the page contents become visible before
* the set_pte_at() write.
*/
- __SetPageUptodate(page);
+ __folio_mark_uptodate(folio);
/* Add shared, newly allocated pages to the page cache. */
if (vm_shared && !is_continue) {
@@ -6271,7 +6285,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
* hugetlb_fault_mutex_table that here must be hold by
* the caller.
*/
- ret = hugetlb_add_to_page_cache(page, mapping, idx);
+ ret = hugetlb_add_to_page_cache(&folio->page, mapping, idx);
if (ret)
goto out_release_nounlock;
page_in_pagecache = true;
@@ -6280,7 +6294,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
ptl = huge_pte_lock(h, dst_mm, dst_pte);
ret = -EIO;
- if (PageHWPoison(page))
+ if (folio_test_hwpoison(folio))
goto out_release_unlock;
/*
@@ -6293,9 +6307,9 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
goto out_release_unlock;
if (page_in_pagecache)
- page_dup_file_rmap(page, true);
+ page_dup_file_rmap(&folio->page, true);
else
- hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+ hugepage_add_new_anon_rmap(folio, dst_vma, dst_addr);
/*
* For either: (1) CONTINUE on a non-shared VMA, or (2) UFFDIO_COPY
@@ -6306,7 +6320,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
else
writable = dst_vma->vm_flags & VM_WRITE;
- _dst_pte = make_huge_pte(dst_vma, page, writable);
+ _dst_pte = make_huge_pte(dst_vma, &folio->page, writable);
/*
* Always mark UFFDIO_COPY page dirty; note that this may not be
* extremely important for hugetlbfs for now since swapping is not
@@ -6328,20 +6342,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
spin_unlock(ptl);
if (!is_continue)
- SetHPageMigratable(page);
+ folio_set_hugetlb_migratable(folio);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
ret = 0;
out:
return ret;
out_release_unlock:
spin_unlock(ptl);
if (vm_shared || is_continue)
- unlock_page(page);
+ folio_unlock(folio);
out_release_nounlock:
if (!page_in_pagecache)
- restore_reserve_on_error(h, dst_vma, dst_addr, page);
- put_page(page);
+ restore_reserve_on_error(h, dst_vma, dst_addr, &folio->page);
+ folio_put(folio);
goto out;
}
#endif /* CONFIG_USERFAULTFD */
diff --git a/mm/rmap.c b/mm/rmap.c
index 948ca17a96ad..e6d94bd19879 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -2547,15 +2547,13 @@ void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
!!(flags & RMAP_EXCLUSIVE));
}
-void hugepage_add_new_anon_rmap(struct page *page,
+void hugepage_add_new_anon_rmap(struct folio *folio,
struct vm_area_struct *vma, unsigned long address)
{
- struct folio *folio = page_folio(page);
-
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
/* increment count (starts at -1) */
atomic_set(&folio->_entire_mapcount, 0);
folio_clear_hugetlb_restore_reserve(folio);
- __page_set_anon_rmap(folio, page, vma, address, 1);
+ __page_set_anon_rmap(folio, &folio->page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
--
2.39.0
On Thu, Jan 19, 2023 at 01:14:41PM -0800, Sidhartha Kumar wrote: > @@ -5599,9 +5603,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, > goto out_release_all; > } > > - copy_user_huge_page(new_page, old_page, address, vma, > + copy_user_huge_page(&new_folio->page, old_page, address, vma, > pages_per_huge_page(h)); We have a folio_copy(), but it feels to me like we need a folio_copy_user() so that we can use copy_user_page() on machines with virtual caches. > @@ -6176,6 +6186,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > spinlock_t *ptl; > int ret = -ENOMEM; > struct page *page; > + struct folio *folio = NULL; > int writable; > bool page_in_pagecache = false; > > @@ -6251,12 +6262,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > *pagep = NULL; > } > > + if (page) > + folio = page_folio(page); > + > /* > - * The memory barrier inside __SetPageUptodate makes sure that > + * The memory barrier inside __folio_mark_uptodate makes sure that > * preceding stores to the page contents become visible before > * the set_pte_at() write. > */ > - __SetPageUptodate(page); > + __folio_mark_uptodate(folio); I suggest that "page" can never be NULL or __SetPageUptodate() would have crashed.
On 1/19/23 10:00 PM, Matthew Wilcox wrote: > On Thu, Jan 19, 2023 at 01:14:41PM -0800, Sidhartha Kumar wrote: >> @@ -5599,9 +5603,9 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, >> goto out_release_all; >> } >> >> - copy_user_huge_page(new_page, old_page, address, vma, >> + copy_user_huge_page(&new_folio->page, old_page, address, vma, >> pages_per_huge_page(h)); > > We have a folio_copy(), but it feels to me like we need a > folio_copy_user() so that we can use copy_user_page() on machines > with virtual caches. > >> @@ -6176,6 +6186,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, >> spinlock_t *ptl; >> int ret = -ENOMEM; >> struct page *page; >> + struct folio *folio = NULL; >> int writable; >> bool page_in_pagecache = false; >> >> @@ -6251,12 +6262,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, >> *pagep = NULL; >> } >> >> + if (page) >> + folio = page_folio(page); >> + >> /* >> - * The memory barrier inside __SetPageUptodate makes sure that >> + * The memory barrier inside __folio_mark_uptodate makes sure that >> * preceding stores to the page contents become visible before >> * the set_pte_at() write. >> */ >> - __SetPageUptodate(page); >> + __folio_mark_uptodate(folio); > Hi Matthew, In the snippet: page = alloc_huge_page(dst_vma, dst_addr, 0); if (IS_ERR(page)) { put_page(*pagep); ret = -ENOMEM; *pagep = NULL; goto out; } copy_user_huge_page(page, *pagep, dst_addr, dst_vma, pages_per_huge_page(h)); I thought the IS_ERR() call does not handle the NULL case and is a check for high memory addresses, and copy_user_huge_page() path does not seem to handle the NULL case as well but alloc_huge_page() can possibly return NULL so I was unsure about how to handle the folio conversion. > I suggest that "page" can never be NULL or __SetPageUptodate() would > have crashed. >
On Fri, Jan 20, 2023 at 12:45:38PM -0800, Sidhartha Kumar wrote: > > > @@ -6176,6 +6186,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > > spinlock_t *ptl; > > > int ret = -ENOMEM; > > > struct page *page; > > > + struct folio *folio = NULL; > > > int writable; > > > bool page_in_pagecache = false; > > > @@ -6251,12 +6262,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > > > *pagep = NULL; > > > } > > > + if (page) > > > + folio = page_folio(page); > > > + > > > /* > > > - * The memory barrier inside __SetPageUptodate makes sure that > > > + * The memory barrier inside __folio_mark_uptodate makes sure that > > > * preceding stores to the page contents become visible before > > > * the set_pte_at() write. > > > */ > > > - __SetPageUptodate(page); > > > + __folio_mark_uptodate(folio); > > > > Hi Matthew, > > In the snippet: > > page = alloc_huge_page(dst_vma, dst_addr, 0); > if (IS_ERR(page)) { > put_page(*pagep); > ret = -ENOMEM; > *pagep = NULL; > goto out; > } > copy_user_huge_page(page, *pagep, dst_addr, dst_vma, > pages_per_huge_page(h)); > > I thought the IS_ERR() call does not handle the NULL case and is a check for > high memory addresses, and copy_user_huge_page() path does not seem to > handle the NULL case as well but alloc_huge_page() can possibly return NULL > so I was unsure about how to handle the folio conversion. I'm not sure how alloc_huge_page() can return NULL. It seems like it returns ERR_PTR(-ENOSPC) or ERR_PTR(-ENOMEM) if it cannot allocate memory?
On 1/20/23 1:17 PM, Matthew Wilcox wrote: > On Fri, Jan 20, 2023 at 12:45:38PM -0800, Sidhartha Kumar wrote: >>>> @@ -6176,6 +6186,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, >>>> spinlock_t *ptl; >>>> int ret = -ENOMEM; >>>> struct page *page; >>>> + struct folio *folio = NULL; >>>> int writable; >>>> bool page_in_pagecache = false; >>>> @@ -6251,12 +6262,15 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, >>>> *pagep = NULL; >>>> } >>>> + if (page) >>>> + folio = page_folio(page); >>>> + >>>> /* >>>> - * The memory barrier inside __SetPageUptodate makes sure that >>>> + * The memory barrier inside __folio_mark_uptodate makes sure that >>>> * preceding stores to the page contents become visible before >>>> * the set_pte_at() write. >>>> */ >>>> - __SetPageUptodate(page); >>>> + __folio_mark_uptodate(folio); >>> >> >> Hi Matthew, >> >> In the snippet: >> >> page = alloc_huge_page(dst_vma, dst_addr, 0); >> if (IS_ERR(page)) { >> put_page(*pagep); >> ret = -ENOMEM; >> *pagep = NULL; >> goto out; >> } >> copy_user_huge_page(page, *pagep, dst_addr, dst_vma, >> pages_per_huge_page(h)); >> >> I thought the IS_ERR() call does not handle the NULL case and is a check for >> high memory addresses, and copy_user_huge_page() path does not seem to >> handle the NULL case as well but alloc_huge_page() can possibly return NULL >> so I was unsure about how to handle the folio conversion. > > I'm not sure how alloc_huge_page() can return NULL. It seems like it > returns ERR_PTR(-ENOSPC) or ERR_PTR(-ENOMEM) if it cannot allocate memory? > I see now, I agree that page cannot be NULL at the return from alloc_huge_page, I will make that change in v2. Thanks, Sidhartha Kumar
© 2016 - 2025 Red Hat, Inc.