Let's convert to vmf_insert_folio_pmd().
In the unlikely case there is already something mapped, we'll now still
call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE.
That should probably be fine, no need to add special cases for that.
Signed-off-by: David Hildenbrand <david@redhat.com>
---
fs/dax.c | 47 ++++++++++-------------------------------------
1 file changed, 10 insertions(+), 37 deletions(-)
diff --git a/fs/dax.c b/fs/dax.c
index 4229513806bea..ae90706674a3f 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
const struct iomap_iter *iter, void **entry)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
- unsigned long pmd_addr = vmf->address & PMD_MASK;
- struct vm_area_struct *vma = vmf->vma;
struct inode *inode = mapping->host;
- pgtable_t pgtable = NULL;
struct folio *zero_folio;
- spinlock_t *ptl;
- pmd_t pmd_entry;
- unsigned long pfn;
+ vm_fault_t ret;
zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);
- if (unlikely(!zero_folio))
- goto fallback;
-
- pfn = page_to_pfn(&zero_folio->page);
- *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
- DAX_PMD | DAX_ZERO_PAGE);
-
- if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm);
- if (!pgtable)
- return VM_FAULT_OOM;
- }
-
- ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
- if (!pmd_none(*(vmf->pmd))) {
- spin_unlock(ptl);
- goto fallback;
+ if (unlikely(!zero_folio)) {
+ trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
+ return VM_FAULT_FALLBACK;
}
- if (pgtable) {
- pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
- mm_inc_nr_ptes(vma->vm_mm);
- }
- pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot);
- set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
- spin_unlock(ptl);
- trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
- return VM_FAULT_NOPAGE;
+ *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio),
+ DAX_PMD | DAX_ZERO_PAGE);
-fallback:
- if (pgtable)
- pte_free(vma->vm_mm, pgtable);
- trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry);
- return VM_FAULT_FALLBACK;
+ ret = vmf_insert_folio_pmd(vmf, zero_folio, false);
+ if (ret == VM_FAULT_NOPAGE)
+ trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry);
+ return ret;
}
#else
static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
--
2.49.0
On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote: > Let's convert to vmf_insert_folio_pmd(). > > In the unlikely case there is already something mapped, we'll now still > call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE. > > That should probably be fine, no need to add special cases for that. I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() -> dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK which will no longer happen, what makes that ok? > Signed-off-by: David Hildenbrand <david@redhat.com> > --- > fs/dax.c | 47 ++++++++++------------------------------------- > 1 file changed, 10 insertions(+), 37 deletions(-) > > diff --git a/fs/dax.c b/fs/dax.c > index 4229513806bea..ae90706674a3f 100644 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -1375,51 +1375,24 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, > const struct iomap_iter *iter, void **entry) > { > struct address_space *mapping = vmf->vma->vm_file->f_mapping; > - unsigned long pmd_addr = vmf->address & PMD_MASK; > - struct vm_area_struct *vma = vmf->vma; > struct inode *inode = mapping->host; > - pgtable_t pgtable = NULL; > struct folio *zero_folio; > - spinlock_t *ptl; > - pmd_t pmd_entry; > - unsigned long pfn; > + vm_fault_t ret; > > zero_folio = mm_get_huge_zero_folio(vmf->vma->vm_mm); > > - if (unlikely(!zero_folio)) > - goto fallback; > - > - pfn = page_to_pfn(&zero_folio->page); > - *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, > - DAX_PMD | DAX_ZERO_PAGE); > - > - if (arch_needs_pgtable_deposit()) { > - pgtable = pte_alloc_one(vma->vm_mm); > - if (!pgtable) > - return VM_FAULT_OOM; > - } > - > - ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); > - if (!pmd_none(*(vmf->pmd))) { > - spin_unlock(ptl); > - goto fallback; > + if (unlikely(!zero_folio)) { > + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); > + return VM_FAULT_FALLBACK; > } > > - if (pgtable) { > - pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); > - mm_inc_nr_ptes(vma->vm_mm); > - } > - pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); > - set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); > - spin_unlock(ptl); > - trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); > - return VM_FAULT_NOPAGE; > + *entry = dax_insert_entry(xas, vmf, iter, *entry, folio_pfn(zero_folio), > + DAX_PMD | DAX_ZERO_PAGE); > > -fallback: > - if (pgtable) > - pte_free(vma->vm_mm, pgtable); > - trace_dax_pmd_load_hole_fallback(inode, vmf, zero_folio, *entry); > - return VM_FAULT_FALLBACK; > + ret = vmf_insert_folio_pmd(vmf, zero_folio, false); > + if (ret == VM_FAULT_NOPAGE) > + trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); > + return ret; > } > #else > static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, > -- > 2.49.0 >
On 24.06.25 03:16, Alistair Popple wrote: > On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote: >> Let's convert to vmf_insert_folio_pmd(). >> >> In the unlikely case there is already something mapped, we'll now still >> call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE. >> >> That should probably be fine, no need to add special cases for that. > > I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() -> > dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK > which will no longer happen, what makes that ok? My reasoning was that this is the exact same behavior other vmf_insert_folio_pmd() users here would result in. But let me dig into the details. -- Cheers, David / dhildenb
On 25.06.25 11:03, David Hildenbrand wrote: > On 24.06.25 03:16, Alistair Popple wrote: >> On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote: >>> Let's convert to vmf_insert_folio_pmd(). >>> >>> In the unlikely case there is already something mapped, we'll now still >>> call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE. >>> >>> That should probably be fine, no need to add special cases for that. >> >> I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() -> >> dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK >> which will no longer happen, what makes that ok? > > My reasoning was that this is the exact same behavior other > vmf_insert_folio_pmd() users here would result in. > > But let me dig into the details. Okay, trying to figure out what to do here. Assume dax_pmd_load_hole() is called and there is already something. We would have returned VM_FAULT_FALLBACK, now we would return VM_FAULT_NO_PAGE. That obviously only happens when we have not a write fault (otherwise, the shared zeropage does not apply). In dax_iomap_pmd_fault(), we would indeed split_huge_pmd(). In the DAX case (!anon vma), that would simply zap whatever is already mapped there. I guess we would then return VM_FAULT_FALLBACK from huge_fault-> ... -> dax_iomap_fault() and core MM code would fallback to handle_pte_fault() etc. and ... load a single PTE mapping the shared zeropage. BUT why is this case handled differently than everything else? E.g., (1) when we try inserting the shared zeropage through dax_load_hole()->vmf_insert_page_mkwrite() and there is already something ... we return VM_FAULT_NOPAGE. (2) when we try inserting a PTE mapping an ordinary folio through dax_fault_iter()->vmf_insert_page_mkwrite() and there is already something ... we return VM_FAULT_NOPAGE. (3) when we try inserting a PMD mapping an ordinary folio through dax_fault_iter()->vmf_insert_folio_pmd() and there is already something ... we return VM_FAULT_NOPAGE. So that makes me think ... the VM_FAULT_FALLBACK right now is probably ... wrong? And probably cannot be triggered? If there is already the huge zerofolio mapped, all good. Anything else is really not expected I would assume? -- Cheers, David / dhildenb
On Fri, Jul 04, 2025 at 03:22:28PM +0200, David Hildenbrand wrote: > On 25.06.25 11:03, David Hildenbrand wrote: > > On 24.06.25 03:16, Alistair Popple wrote: > > > On Tue, Jun 17, 2025 at 05:43:38PM +0200, David Hildenbrand wrote: > > > > Let's convert to vmf_insert_folio_pmd(). > > > > > > > > In the unlikely case there is already something mapped, we'll now still > > > > call trace_dax_pmd_load_hole() and return VM_FAULT_NOPAGE. > > > > > > > > That should probably be fine, no need to add special cases for that. > > > > > > I'm not sure about that. Consider dax_iomap_pmd_fault() -> dax_fault_iter() -> > > > dax_pmd_load_hole(). It calls split_huge_pmd() in response to VM_FAULT_FALLBACK > > > which will no longer happen, what makes that ok? > > > > My reasoning was that this is the exact same behavior other > > vmf_insert_folio_pmd() users here would result in. > > > > But let me dig into the details. > > Okay, trying to figure out what to do here. > > Assume dax_pmd_load_hole() is called and there is already something. We > would have returned VM_FAULT_FALLBACK, now we would return VM_FAULT_NO_PAGE. > > That obviously only happens when we have not a write fault (otherwise, the > shared zeropage does not apply). > > In dax_iomap_pmd_fault(), we would indeed split_huge_pmd(). In the DAX case > (!anon vma), that would simply zap whatever is already mapped there. > > I guess we would then return VM_FAULT_FALLBACK from huge_fault-> ... -> > dax_iomap_fault() and core MM code would fallback to handle_pte_fault() etc. > and ... load a single PTE mapping the shared zeropage. > > BUT > > why is this case handled differently than everything else? Hmm. Good question, I will have a bit more of a think about it, but your conclusion below is probably correct. > E.g., > > (1) when we try inserting the shared zeropage through > dax_load_hole()->vmf_insert_page_mkwrite() and there is already something > ... we return VM_FAULT_NOPAGE. > > (2) when we try inserting a PTE mapping an ordinary folio through > dax_fault_iter()->vmf_insert_page_mkwrite() and there is already something > ... we return VM_FAULT_NOPAGE. > > (3) when we try inserting a PMD mapping an ordinary folio through > dax_fault_iter()->vmf_insert_folio_pmd() and there is already something ... > we return VM_FAULT_NOPAGE. > > > So that makes me think ... the VM_FAULT_FALLBACK right now is probably ... > wrong? And probably cannot be triggered? I suspect that's true. At least I just did a full run of xfstest on a XFS DAX filesystem and was unable to trigger this path, so it's certainly not easy to trigger. > If there is already the huge zerofolio mapped, all good. > > Anything else is really not expected I would assume? > > -- > Cheers, > > David / dhildenb >
© 2016 - 2025 Red Hat, Inc.