[PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio

David Hildenbrand posted 14 patches 3 months, 3 weeks ago
[PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio
Posted by David Hildenbrand 3 months, 3 weeks ago
Let's convert to vmf_insert_folio_pmd().

In the unlikely case there is already something mapped, we'll now still
call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.

That should probably be fine, no need to add special cases for that.

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 fs/dax.c | 47 ++++++++++-------------------------------------
 1 file changed, 10 insertions(+), 37 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 4229513806bea..ae90706674a3f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
 		const struct iomap_iter *iter, void **entry)
 {
 	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
-	unsigned long pmd_addr = vmf->address & PMD_MASK;
-	struct vm_area_struct *vma = vmf->vma;
 	struct inode *inode = mapping->host;
-	pgtable_t pgtable = NULL;
 	struct folio *zero_folio;
-	spinlock_t *ptl;
-	pmd_t pmd_entry;
-	unsigned long pfn;
+	vm_fault_t ret;
 
 	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
 
-	if (unlikely(!zero_folio))
-		goto fallback;
-
-	pfn = page_to_pfn(&zero_folio->page);
-	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
-				  DAX_PMD | DAX_ZERO_PAGE);
-
-	if (arch_needs_pgtable_deposit()) {
-		pgtable = pte_alloc_one(vma->vm_mm);
-		if (!pgtable)
-			return VM_FAULT_OOM;
-	}
-
-	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
-	if (!pmd_none(*(vmf->pmd))) {
-		spin_unlock(ptl);
-		goto fallback;
+	if (unlikely(!zero_folio)) {
+		trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
+		return VM_FAULT_FALLBACK;
 	}
 
-	if (pgtable) {
-		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
-		mm_inc_nr_ptes(vma->vm_mm);
-	}
-	pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
-	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
-	spin_unlock(ptl);
-	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
-	return VM_FAULT_NOPAGE;
+	*entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
+				  DAX_PMD | DAX_ZERO_PAGE);
 
-fallback:
-	if (pgtable)
-		pte_free(vma->vm_mm, pgtable);
-	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
-	return VM_FAULT_FALLBACK;
+	ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
+	if (ret == VM_FAULT_NOPAGE)
+		trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
+	return ret;
 }
 #else
 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
-- 
2.49.0
Re: [PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio
Posted by Alistair Popple 3 months, 2 weeks ago
On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote:
> Let's convert to vmf_insert_folio_pmd().
> 
> In the unlikely case there is already something mapped, we'll now still
> call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.
> 
> That should probably be fine, no need to add special cases for that.

I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() ->
dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK
which will no longer happen, what makes that ok?

> Signed-off-by: David Hildenbrand <david@redhat.com>
> ---
>  fs/dax.c | 47 ++++++++++-------------------------------------
>  1 file changed, 10 insertions(+), 37 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index 4229513806bea..ae90706674a3f 100644
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
>  		const struct iomap_iter *iter, void **entry)
>  {
>  	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
> -	unsigned long pmd_addr = vmf->address & PMD_MASK;
> -	struct vm_area_struct *vma = vmf->vma;
>  	struct inode *inode = mapping->host;
> -	pgtable_t pgtable = NULL;
>  	struct folio *zero_folio;
> -	spinlock_t *ptl;
> -	pmd_t pmd_entry;
> -	unsigned long pfn;
> +	vm_fault_t ret;
>  
>  	zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
>  
> -	if (unlikely(!zero_folio))
> -		goto fallback;
> -
> -	pfn = page_to_pfn(&zero_folio->page);
> -	*entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
> -				  DAX_PMD | DAX_ZERO_PAGE);
> -
> -	if (arch_needs_pgtable_deposit()) {
> -		pgtable = pte_alloc_one(vma->vm_mm);
> -		if (!pgtable)
> -			return VM_FAULT_OOM;
> -	}
> -
> -	ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
> -	if (!pmd_none(*(vmf->pmd))) {
> -		spin_unlock(ptl);
> -		goto fallback;
> +	if (unlikely(!zero_folio)) {
> +		trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
> +		return VM_FAULT_FALLBACK;
>  	}
>  
> -	if (pgtable) {
> -		pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
> -		mm_inc_nr_ptes(vma->vm_mm);
> -	}
> -	pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
> -	set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
> -	spin_unlock(ptl);
> -	trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
> -	return VM_FAULT_NOPAGE;
> +	*entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
> +				  DAX_PMD | DAX_ZERO_PAGE);
>  
> -fallback:
> -	if (pgtable)
> -		pte_free(vma->vm_mm, pgtable);
> -	trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
> -	return VM_FAULT_FALLBACK;
> +	ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
> +	if (ret == VM_FAULT_NOPAGE)
> +		trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
> +	return ret;
>  }
>  #else
>  static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
> -- 
> 2.49.0
>
Re: [PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio
Posted by David Hildenbrand 3 months, 2 weeks ago
On 24.06.25 03:16, Alistair Popple wrote:
> On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote:
>> Let's convert to vmf_insert_folio_pmd().
>>
>> In the unlikely case there is already something mapped, we'll now still
>> call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.
>>
>> That should probably be fine, no need to add special cases for that.
> 
> I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() ->
> dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK
> which will no longer happen, what makes that ok?

My reasoning was that this is the exact same behavior other 
vmf_insert_folio_pmd() users here would result in.

But let me dig into the details.

-- 
Cheers,

David / dhildenb
Re: [PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio
Posted by David Hildenbrand 3 months ago
On 25.06.25 11:03, David Hildenbrand wrote:
> On 24.06.25 03:16, Alistair Popple wrote:
>> On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote:
>>> Let's convert to vmf_insert_folio_pmd().
>>>
>>> In the unlikely case there is already something mapped, we'll now still
>>> call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.
>>>
>>> That should probably be fine, no need to add special cases for that.
>>
>> I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() ->
>> dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK
>> which will no longer happen, what makes that ok?
> 
> My reasoning was that this is the exact same behavior other
> vmf_insert_folio_pmd() users here would result in.
> 
> But let me dig into the details.

Okay, trying to figure out what to do here.

Assume dax_pmd_load_hole() is called and there is already something. We 
would have returned VM_FAULT_FALLBACK, now we would return VM_FAULT_NOPAGE.

That obviously only happens when we do not have a write fault (otherwise, 
the shared zeropage does not apply).

In dax_iomap_pmd_fault(), we would indeed split_huge_pmd(). In the DAX 
case (!anon vma), that would simply zap whatever is already mapped there.

I guess we would then return VM_FAULT_FALLBACK from huge_fault-> ... -> 
dax_iomap_fault() and core MM code would fallback to handle_pte_fault() 
etc. and ... load a single PTE mapping the shared zeropage.

BUT

why is this case handled differently than everything else?

E.g.,

(1) when we try inserting the shared zeropage through 
dax_load_hole()->vmf_insert_page_mkwrite() and there is already 
something ... we return VM_FAULT_NOPAGE.

(2) when we try inserting a PTE mapping an ordinary folio through 
dax_fault_iter()->vmf_insert_page_mkwrite() and there is already 
something ... we return VM_FAULT_NOPAGE.

(3) when we try inserting a PMD mapping an ordinary folio through 
dax_fault_iter()->vmf_insert_folio_pmd() and there is already something 
... we return VM_FAULT_NOPAGE.


So that makes me think ... the VM_FAULT_FALLBACK right now is probably 
... wrong? And probably cannot be triggered?

If there is already the huge zerofolio mapped, all good.

Anything else is really not expected I would assume?

-- 
Cheers,

David / dhildenb
Re: [PATCH RFC 07/14] fs/dax: use vmf_insert_folio_pmd() to insert the huge zero folio
Posted by Alistair Popple 3 months ago
On Fri, Jul 04, 2025 at 03:22:28PM +0200, David Hildenbrand wrote:
> On 25.06.25 11:03, David Hildenbrand wrote:
> > On 24.06.25 03:16, Alistair Popple wrote:
> > > On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote:
> > > > Let's convert to vmf_insert_folio_pmd().
> > > > 
> > > > In the unlikely case there is already something mapped, we'll now still
> > > > call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.
> > > > 
> > > > That should probably be fine, no need to add special cases for that.
> > > 
> > > I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() ->
> > > dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK
> > > which will no longer happen, what makes that ok?
> > 
> > My reasoning was that this is the exact same behavior other
> > vmf_insert_folio_pmd() users here would result in.
> > 
> > But let me dig into the details.
> 
> Okay, trying to figure out what to do here.
> 
> Assume dax_pmd_load_hole() is called and there is already something. We
> would have returned VM_FAULT_FALLBACK, now we would return VM_FAULT_NOPAGE.
> 
> That obviously only happens when we do not have a write fault (otherwise, the
> shared zeropage does not apply).
> 
> In dax_iomap_pmd_fault(), we would indeed split_huge_pmd(). In the DAX case
> (!anon vma), that would simply zap whatever is already mapped there.
> 
> I guess we would then return VM_FAULT_FALLBACK from huge_fault-> ... ->
> dax_iomap_fault() and core MM code would fallback to handle_pte_fault() etc.
> and ... load a single PTE mapping the shared zeropage.
> 
> BUT
> 
> why is this case handled differently than everything else?

Hmm. Good question, I will have a bit more of a think about it, but your
conclusion below is probably correct.

> E.g.,
> 
> (1) when we try inserting the shared zeropage through
> dax_load_hole()->vmf_insert_page_mkwrite() and there is already something
> ... we return VM_FAULT_NOPAGE.
> 
> (2) when we try inserting a PTE mapping an ordinary folio through
> dax_fault_iter()->vmf_insert_page_mkwrite() and there is already something
> ... we return VM_FAULT_NOPAGE.
> 
> (3) when we try inserting a PMD mapping an ordinary folio through
> dax_fault_iter()->vmf_insert_folio_pmd() and there is already something ...
> we return VM_FAULT_NOPAGE.
> 
> 
> So that makes me think ... the VM_FAULT_FALLBACK right now is probably ...
> wrong? And probably cannot be triggered?

I suspect that's true. At least I just did a full run of xfstest on a XFS DAX
filesystem and was unable to trigger this path, so it's certainly not easy to
trigger.

> If there is already the huge zerofolio mapped, all good.
> 
> Anything else is really not expected I would assume?
> 
> -- 
> Cheers,
> 
> David / dhildenb
>