[PATCH v7 16/20] huge_memory: Add vmf_insert_folio_pmd()

Posted by Alistair Popple 10 months, 1 week ago
Currently DAX folio/page reference counts are managed differently to normal
pages. To allow these to be managed the same as normal pages introduce
vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take
references as it would for a normally mapped page.

This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
simply inserts a special devmap PMD entry into the page table without
holding a reference to the page for the mapping.

It is not currently useful to implement a more generic vmf_insert_folio()
which selects the correct behaviour based on folio_order(). This is because
PTE faults require only a subpage of the folio to be PTE mapped rather than
the entire folio. It would be possible to add this context somewhere but
callers already need to handle PTE faults and PMD faults separately so a
more generic function is not useful.

Signed-off-by: Alistair Popple <apopple@nvidia.com>

---

Changes for v7:

 - Fix bad pgtable handling for PPC64 (Thanks Dan and Dave)
 - Add lockdep_assert() to document locking requirements for insert_pfn_pmd()

Changes for v5:

 - Minor code cleanup suggested by David
---
 include/linux/huge_mm.h |  1 +-
 mm/huge_memory.c        | 61 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 5bd1ff7..3633bd3 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -39,6 +39,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
 vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write);
 vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write);
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write);
 vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio, bool write);
 
 enum transparent_hugepage_flag {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a3845ca..c27048d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1375,20 +1375,20 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 	return __do_huge_pmd_anonymous_page(vmf);
 }
 
-static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
+static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
 		pgtable_t pgtable)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	pmd_t entry;
-	spinlock_t *ptl;
 
-	ptl = pmd_lock(mm, pmd);
+	lockdep_assert_held(pmd_lockptr(mm, pmd));
+
 	if (!pmd_none(*pmd)) {
 		if (write) {
 			if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
 				WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
-				goto out_unlock;
+				return -EEXIST;
 			}
 			entry = pmd_mkyoung(*pmd);
 			entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -1396,7 +1396,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 				update_mmu_cache_pmd(vma, addr, pmd);
 		}
 
-		goto out_unlock;
+		return -EEXIST;
 	}
 
 	entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
@@ -1417,11 +1417,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 	set_pmd_at(mm, addr, pmd, entry);
 	update_mmu_cache_pmd(vma, addr, pmd);
-
-out_unlock:
-	spin_unlock(ptl);
-	if (pgtable)
-		pte_free(mm, pgtable);
+	return 0;
 }
 
 /**
@@ -1440,6 +1436,8 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	struct vm_area_struct *vma = vmf->vma;
 	pgprot_t pgprot = vma->vm_page_prot;
 	pgtable_t pgtable = NULL;
+	spinlock_t *ptl;
+	int error;
 
 	/*
 	 * If we had pmd_special, we could avoid all these restrictions,
@@ -1462,12 +1460,53 @@ vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
 	}
 
 	track_pfn_insert(vma, &pgprot, pfn);
+	ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+	error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
+	spin_unlock(ptl);
+	if (error && pgtable)
+		pte_free(vma->vm_mm, pgtable);
 
-	insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
 
+vm_fault_t vmf_insert_folio_pmd(struct vm_fault *vmf, struct folio *folio, bool write)
+{
+	struct vm_area_struct *vma = vmf->vma;
+	unsigned long addr = vmf->address & PMD_MASK;
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pgtable_t pgtable = NULL;
+	int error;
+
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return VM_FAULT_SIGBUS;
+
+	if (WARN_ON_ONCE(folio_order(folio) != PMD_ORDER))
+		return VM_FAULT_SIGBUS;
+
+	if (arch_needs_pgtable_deposit()) {
+		pgtable = pte_alloc_one(vma->vm_mm);
+		if (!pgtable)
+			return VM_FAULT_OOM;
+	}
+
+	ptl = pmd_lock(mm, vmf->pmd);
+	if (pmd_none(*vmf->pmd)) {
+		folio_get(folio);
+		folio_add_file_rmap_pmd(folio, &folio->page, vma);
+		add_mm_counter(mm, mm_counter_file(folio), HPAGE_PMD_NR);
+	}
+	error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn_to_pfn_t(folio_pfn(folio)),
+			       vma->vm_page_prot, write, pgtable);
+	spin_unlock(ptl);
+	if (error && pgtable)
+		pte_free(mm, pgtable);
+
+	return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(vmf_insert_folio_pmd);
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
 {
-- 
git-series 0.9.1
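
For callers, usage of the new helper looks roughly like the sketch below. The
fault handler and the folio lookup helper are hypothetical placeholders; only
vmf_insert_folio_pmd() itself is introduced by this patch.

	/*
	 * Hypothetical PMD fault handler sketch. A driver would look up the
	 * PMD-sized folio backing the faulting address (driver-specific,
	 * elided here) and let vmf_insert_folio_pmd() map the whole folio and
	 * take references for the mapping, instead of inserting a special
	 * devmap entry with vmf_insert_pfn_pmd().
	 */
	static vm_fault_t example_huge_fault(struct vm_fault *vmf, bool write)
	{
		struct folio *folio;

		folio = example_lookup_pmd_folio(vmf);	/* hypothetical helper */
		if (!folio)
			return VM_FAULT_SIGBUS;

		/*
		 * Returns VM_FAULT_SIGBUS if the folio is not PMD_ORDER or the
		 * PMD-aligned address falls outside the VMA.
		 */
		return vmf_insert_folio_pmd(vmf, folio, write);
	}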
Re: [PATCH v7 16/20] huge_memory: Add vmf_insert_folio_pmd()
Posted by David Hildenbrand 10 months ago
On 04.02.25 23:48, Alistair Popple wrote:
> Currently DAX folio/page reference counts are managed differently to normal
> pages. To allow these to be managed the same as normal pages introduce
> vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take
> references as it would for a normally mapped page.
> 
> This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
> simply inserts a special devmap PMD entry into the page table without
> holding a reference to the page for the mapping.
> 
> It is not currently useful to implement a more generic vmf_insert_folio()
> which selects the correct behaviour based on folio_order(). This is because
> PTE faults require only a subpage of the folio to be PTE mapped rather than
> the entire folio. It would be possible to add this context somewhere but
> callers already need to handle PTE faults and PMD faults separately so a
> more generic function is not useful.
> 
> Signed-off-by: Alistair Popple <apopple@nvidia.com>

Nit: patch subject ;)

> 
> ---
> 
> Changes for v7:
> 
>   - Fix bad pgtable handling for PPC64 (Thanks Dan and Dave)

Is it? ;) insert_pfn_pmd() still doesn't consume a "pgtable_t *"

But maybe I am missing something ...

-- 
Cheers,

David / dhildenb
Re: [PATCH v7 16/20] huge_memory: Add vmf_insert_folio_pmd()
Posted by Alistair Popple 10 months ago
On Mon, Feb 10, 2025 at 07:45:09PM +0100, David Hildenbrand wrote:
> On 04.02.25 23:48, Alistair Popple wrote:
> > Currently DAX folio/page reference counts are managed differently to normal
> > pages. To allow these to be managed the same as normal pages introduce
> > vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take
> > references as it would for a normally mapped page.
> > 
> > This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
> > simply inserts a special devmap PMD entry into the page table without
> > holding a reference to the page for the mapping.
> > 
> > It is not currently useful to implement a more generic vmf_insert_folio()
> > which selects the correct behaviour based on folio_order(). This is because
> > PTE faults require only a subpage of the folio to be PTE mapped rather than
> > the entire folio. It would be possible to add this context somewhere but
> > callers already need to handle PTE faults and PMD faults separately so a
> > more generic function is not useful.
> > 
> > Signed-off-by: Alistair Popple <apopple@nvidia.com>
> 
> Nit: patch subject ;)
> 
> > 
> > ---
> > 
> > Changes for v7:
> > 
> >   - Fix bad pgtable handling for PPC64 (Thanks Dan and Dave)
> 
> Is it? ;) insert_pfn_pmd() still doesn't consume a "pgtable_t *"
> 
> But maybe I am missing something ...

At a high-level all I'm trying to do (perhaps badly) is pull the ptl locking one
level up the callstack.

As far as I can tell the pgtable is consumed here:

static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
		pgtable_t pgtable)

[...]

	if (pgtable) {
		pgtable_trans_huge_deposit(mm, pmd, pgtable);
		mm_inc_nr_ptes(mm);
		pgtable = NULL;
	}

[...]

	return 0;

Now I can see I failed to clean up the useless pgtable = NULL assignment, which
is confusing because I'm not trying to look at pgtable in the caller (i.e.
vmf_insert_pfn_pmd()/vmf_insert_folio_pmd()) to determine if it needs freeing.
So I will remove this assignment.

Instead callers just look at the return code from insert_pfn_pmd() - if there
was an error pgtable_trans_huge_deposit(pgtable) wasn't called and if the caller
passed a pgtable it should be freed. Otherwise if insert_pfn_pmd() succeeded
then callers can assume the pgtable was consumed by pgtable_trans_huge_deposit()
and therefore should not be freed.
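
In caller terms the contract is (sketch, using the names from the v7 diff
above):

	ptl = pmd_lock(mm, vmf->pmd);
	error = insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
	spin_unlock(ptl);
	/* On failure the deposit never happened, so the caller frees. */
	if (error && pgtable)
		pte_free(mm, pgtable);
	/* On success the pgtable was deposited and must not be freed. */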

Hopefully that all makes sense, but maybe I've missed something obvious too...

 - Alistair

> -- 
> Cheers,
> 
> David / dhildenb
>
Re: [PATCH v7 16/20] huge_memory: Add vmf_insert_folio_pmd()
Posted by David Hildenbrand 9 months, 4 weeks ago
On 17.02.25 05:29, Alistair Popple wrote:
> On Mon, Feb 10, 2025 at 07:45:09PM +0100, David Hildenbrand wrote:
>> On 04.02.25 23:48, Alistair Popple wrote:
>>> Currently DAX folio/page reference counts are managed differently to normal
>>> pages. To allow these to be managed the same as normal pages introduce
>>> vmf_insert_folio_pmd. This will map the entire PMD-sized folio and take
>>> references as it would for a normally mapped page.
>>>
>>> This is distinct from the current mechanism, vmf_insert_pfn_pmd, which
>>> simply inserts a special devmap PMD entry into the page table without
>>> holding a reference to the page for the mapping.
>>>
>>> It is not currently useful to implement a more generic vmf_insert_folio()
>>> which selects the correct behaviour based on folio_order(). This is because
>>> PTE faults require only a subpage of the folio to be PTE mapped rather than
>>> the entire folio. It would be possible to add this context somewhere but
>>> callers already need to handle PTE faults and PMD faults separately so a
>>> more generic function is not useful.
>>>
>>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>>
>> Nit: patch subject ;)
>>
>>>
>>> ---
>>>
>>> Changes for v7:
>>>
>>>    - Fix bad pgtable handling for PPC64 (Thanks Dan and Dave)
>>
>> Is it? ;) insert_pfn_pmd() still doesn't consume a "pgtable_t *"
>>
>> But maybe I am missing something ...
> 
> At a high-level all I'm trying to do (perhaps badly) is pull the ptl locking one
> level up the callstack.
> 
> As far as I can tell the pgtable is consumed here:
> 
> static int insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
> 		pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write,
> 		pgtable_t pgtable)
> 
> [...]
> 
> 	if (pgtable) {
> 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> 		mm_inc_nr_ptes(mm);
> 		pgtable = NULL;
> 	}
> 
> [...]
> 
> 	return 0;
> 
> Now I can see I failed to clean up the useless pgtable = NULL assignment, which
> is confusing because I'm not trying to look at pgtable in the caller (i.e.
> vmf_insert_pfn_pmd()/vmf_insert_folio_pmd()) to determine if it needs freeing.
> So I will remove this assignment.

Ahhh, yes, the "pgtable = NULL" confused me, so I was looking for a 
"pgtable_t *pgtable" being passed instead, that we could manipulate.

> 
> Instead callers just look at the return code from insert_pfn_pmd() - if there
> was an error pgtable_trans_huge_deposit(pgtable) wasn't called and if the caller
> passed a pgtable it should be freed. Otherwise if insert_pfn_pmd() succeeded
> then callers can assume the pgtable was consumed by pgtable_trans_huge_deposit()
> and therefore should not be freed.
> 
> Hopefully that all makes sense, but maybe I've missed something obvious too...

Yes, you assume that if insert_pfn_pmd() succeeds, the table was 
consumed, otherwise it must be freed.

Thanks!

Acked-by: David Hildenbrand <david@redhat.com>

-- 
Cheers,

David / dhildenb