Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
creates a special devmap PTE entry for the pfn but does not take a
reference on the underlying struct page for the mapping. This is
because DAX page refcounts are treated specially, as indicated by the
presence of a devmap entry.
To allow DAX page refcounts to be managed the same as normal page
refcounts introduce dax_insert_pfn. This will take a reference on the
underlying page much the same as vmf_insert_page, except it also
permits upgrading an existing mapping to be writable if
requested/possible.
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
include/linux/mm.h | 4 ++-
mm/memory.c | 79 ++++++++++++++++++++++++++++++++++++++++++-----
2 files changed, 76 insertions(+), 7 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9a5652c..b84368b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1080,6 +1080,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
struct mmu_gather;
struct inode;
+extern void prep_compound_page(struct page *page, unsigned int order);
+
/*
* compound_order() can be called without holding a reference, which means
* that niceties like page_folio() don't work. These callers should be
@@ -3624,6 +3626,8 @@ int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
unsigned long num);
+vm_fault_t dax_insert_pfn(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool write);
vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn);
vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
diff --git a/mm/memory.c b/mm/memory.c
index ce48a05..4f26a1f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1989,14 +1989,42 @@ static int validate_page_before_insert(struct page *page)
}
static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
- unsigned long addr, struct page *page, pgprot_t prot)
+ unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite)
{
struct folio *folio = page_folio(page);
+ pte_t entry = ptep_get(pte);
- if (!pte_none(ptep_get(pte)))
+ if (!pte_none(entry)) {
+ if (mkwrite) {
+ /*
+ * For read faults on private mappings the PFN passed
+ * in may not match the PFN we have mapped if the
+ * mapped PFN is a writeable COW page. In the mkwrite
+ * case we are creating a writable PTE for a shared
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
+ */
+ if (pte_pfn(entry) != page_to_pfn(page)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
+ return -EFAULT;
+ }
+ entry = maybe_mkwrite(entry, vma);
+ entry = pte_mkyoung(entry);
+ if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+ update_mmu_cache(vma, addr, pte);
+ return 0;
+ }
return -EBUSY;
+ }
+
/* Ok, finally just insert the thing.. */
folio_get(folio);
+ if (mkwrite)
+ entry = maybe_mkwrite(mk_pte(page, prot), vma);
+ else
+ entry = mk_pte(page, prot);
inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
folio_add_file_rmap_pte(folio, page, vma);
- set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
+ set_pte_at(vma->vm_mm, addr, pte, entry);
@@ -2011,7 +2039,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
* pages reserved for the old functions anyway.
*/
static int insert_page(struct vm_area_struct *vma, unsigned long addr,
- struct page *page, pgprot_t prot)
+ struct page *page, pgprot_t prot, bool mkwrite)
{
int retval;
pte_t *pte;
@@ -2024,7 +2052,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
pte = get_locked_pte(vma->vm_mm, addr, &ptl);
if (!pte)
goto out;
- retval = insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite);
pte_unmap_unlock(pte, ptl);
out:
return retval;
@@ -2040,7 +2068,7 @@ static int insert_page_in_batch_locked(struct vm_area_struct *vma, pte_t *pte,
err = validate_page_before_insert(page);
if (err)
return err;
- return insert_page_into_pte_locked(vma, pte, addr, page, prot);
+ return insert_page_into_pte_locked(vma, pte, addr, page, prot, false);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -2177,7 +2205,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
BUG_ON(vma->vm_flags & VM_PFNMAP);
vm_flags_set(vma, VM_MIXEDMAP);
}
- return insert_page(vma, addr, page, vma->vm_page_prot);
+ return insert_page(vma, addr, page, vma->vm_page_prot, false);
}
EXPORT_SYMBOL(vm_insert_page);
@@ -2451,7 +2479,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- err = insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot, mkwrite);
} else {
return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
@@ -2464,6 +2492,43 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
return VM_FAULT_NOPAGE;
}
+vm_fault_t dax_insert_pfn(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn_t, bool write)
+{
+ pgprot_t pgprot = vma->vm_page_prot;
+ unsigned long pfn = pfn_t_to_pfn(pfn_t);
+ struct page *page = pfn_to_page(pfn);
+ int err;
+
+ if (addr < vma->vm_start || addr >= vma->vm_end)
+ return VM_FAULT_SIGBUS;
+
+ track_pfn_insert(vma, &pgprot, pfn_t);
+
+ if (!pfn_modify_allowed(pfn, pgprot))
+ return VM_FAULT_SIGBUS;
+
+ /*
+ * We refcount the page normally so make sure pfn_valid is true.
+ */
+ if (!pfn_t_valid(pfn_t))
+ return VM_FAULT_SIGBUS;
+
+ WARN_ON_ONCE(pfn_t_devmap(pfn_t));
+
+ if (WARN_ON(is_zero_pfn(pfn) && write))
+ return VM_FAULT_SIGBUS;
+
+ err = insert_page(vma, addr, page, pgprot, write);
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+EXPORT_SYMBOL_GPL(dax_insert_pfn);
+
vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn)
{
--
git-series 0.9.1
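For orientation, a minimal sketch of how a DAX fault handler might use the
new helper in place of vmf_insert_pfn(). The wrapper dax_fault_example()
and the way the pfn is obtained are illustrative only, not part of the
patch:

static vm_fault_t dax_fault_example(struct vm_fault *vmf, pfn_t pfn)
{
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * Previously the handler would end with
	 * vmf_insert_pfn(vmf->vma, vmf->address, pfn_t_to_pfn(pfn)),
	 * which installs a special/devmap PTE without taking a page
	 * reference. dax_insert_pfn() takes a normal reference on the
	 * underlying struct page and, for write faults, may upgrade an
	 * existing read-only mapping of the same pfn to writable.
	 */
	return dax_insert_pfn(vmf->vma, vmf->address, pfn, write);
}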
On 27.06.24 02:54, Alistair Popple wrote:
> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
> creates a special devmap PTE entry for the pfn but does not take a
> reference on the underlying struct page for the mapping. This is
> because DAX page refcounts are treated specially, as indicated by the
> presence of a devmap entry.
>
> To allow DAX page refcounts to be managed the same as normal page
> refcounts introduce dax_insert_pfn. This will take a reference on the
> underlying page much the same as vmf_insert_page, except it also
> permits upgrading an existing mapping to be writable if
> requested/possible.

We have this comparably nasty vmf_insert_mixed() that FS dax abused to
insert into !VM_MIXED VMAs. Is that abuse now stopping and are there
maybe ways to get rid of vmf_insert_mixed()?

--
Cheers,
David / dhildenb
On Tue, Jul 02, 2024 at 09:18:31AM +0200, David Hildenbrand wrote:
> We have this comparably nasty vmf_insert_mixed() that FS dax abused to
> insert into !VM_MIXED VMAs. Is that abuse now stopping and are there maybe
> ways to get rid of vmf_insert_mixed()?

Unfortunately it is also used by a few drm drivers and not just DAX.
On 02.07.24 13:46, Christoph Hellwig wrote:
> On Tue, Jul 02, 2024 at 09:18:31AM +0200, David Hildenbrand wrote:
>> We have this comparably nasty vmf_insert_mixed() that FS dax abused to
>> insert into !VM_MIXED VMAs. Is that abuse now stopping and are there maybe
>> ways to get rid of vmf_insert_mixed()?
>
> Unfortunately it is also used by a few drm drivers and not just DAX.
At least they all seem to set VM_MIXED:
* fs/cramfs/inode.c does
* drivers/gpu/drm/gma500/fbdev.c does
* drivers/gpu/drm/omapdrm/omap_gem.c does
Only DAX (including drivers/dax/device.c) doesn't.
VM_MIXEDMAP handling for DAX was changed in
commit e1fb4a0864958fac2fb1b23f9f4562a9f90e3e8f
Author: Dave Jiang <dave.jiang@intel.com>
Date: Fri Aug 17 15:43:40 2018 -0700
dax: remove VM_MIXEDMAP for fsdax and device dax
After prepared by
commit 785a3fab4adbf91b2189c928a59ae219c54ba95e
Author: Dan Williams <dan.j.williams@intel.com>
Date: Mon Oct 23 07:20:00 2017 -0700
mm, dax: introduce pfn_t_special()
In support of removing the VM_MIXEDMAP indication from DAX VMAs,
introduce pfn_t_special() for drivers to indicate that _PAGE_SPECIAL
should be used for DAX ptes.
I wonder if there are ways forward to either remove vmf_insert_mixed()
or at least require it (as the name suggests) to have VM_MIXEDMAP set.
--
Cheers,
David / dhildenb
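A check of the kind being floated here could be fairly small. The
following is purely illustrative and not a posted patch: once fsdax no
longer goes through vmf_insert_mixed(), the remaining callers all run
with VM_MIXEDMAP set, so the helper could start enforcing it:

vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
		pfn_t pfn)
{
	/* Require callers to have marked the VMA as VM_MIXEDMAP. */
	if (WARN_ON_ONCE(!(vma->vm_flags & VM_MIXEDMAP)))
		return VM_FAULT_SIGBUS;

	return __vm_insert_mixed(vma, addr, pfn, false);
}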
David Hildenbrand <david@redhat.com> writes:

> On 27.06.24 02:54, Alistair Popple wrote:
>> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
>> creates a special devmap PTE entry for the pfn but does not take a
>> reference on the underlying struct page for the mapping. This is
>> because DAX page refcounts are treated specially, as indicated by the
>> presence of a devmap entry.
>>
>> To allow DAX page refcounts to be managed the same as normal page
>> refcounts introduce dax_insert_pfn. This will take a reference on the
>> underlying page much the same as vmf_insert_page, except it also
>> permits upgrading an existing mapping to be writable if
>> requested/possible.
>
> We have this comparably nasty vmf_insert_mixed() that FS dax abused to
> insert into !VM_MIXED VMAs. Is that abuse now stopping and are there
> maybe ways to get rid of vmf_insert_mixed()?

It's not something I've looked at but quite possibly - there are a
couple of other users of vmf_insert_mixed() that would need to be
removed though.

I just added this as dax_insert_pfn() because as an API it is really
specific to the DAX use case. For example a more general API would
likely pass a page/folio.
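The more general helper mentioned above might, for example, take a folio
directly rather than a pfn_t. The name and signature below are purely
hypothetical and do not appear in this series:

vm_fault_t vmf_insert_folio(struct vm_area_struct *vma, unsigned long addr,
		struct folio *folio, bool write);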
On Thu 27-06-24 10:54:21, Alistair Popple wrote:
> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
> creates a special devmap PTE entry for the pfn but does not take a
> reference on the underlying struct page for the mapping. This is
> because DAX page refcounts are treated specially, as indicated by the
> presence of a devmap entry.
>
> To allow DAX page refcounts to be managed the same as normal page
> refcounts introduce dax_insert_pfn. This will take a reference on the
> underlying page much the same as vmf_insert_page, except it also
> permits upgrading an existing mapping to be writable if
> requested/possible.
>
> Signed-off-by: Alistair Popple <apopple@nvidia.com>
Overall this looks good to me. Some comments below.
> ---
> include/linux/mm.h | 4 ++-
> mm/memory.c | 79 ++++++++++++++++++++++++++++++++++++++++++-----
> 2 files changed, 76 insertions(+), 7 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 9a5652c..b84368b 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1080,6 +1080,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
> struct mmu_gather;
> struct inode;
>
> +extern void prep_compound_page(struct page *page, unsigned int order);
> +
You don't seem to use this function in this patch?
> diff --git a/mm/memory.c b/mm/memory.c
> index ce48a05..4f26a1f 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -1989,14 +1989,42 @@ static int validate_page_before_insert(struct page *page)
> }
>
> static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
> - unsigned long addr, struct page *page, pgprot_t prot)
> + unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite)
> {
> struct folio *folio = page_folio(page);
> + pte_t entry = ptep_get(pte);
>
> - if (!pte_none(ptep_get(pte)))
> + if (!pte_none(entry)) {
> + if (mkwrite) {
> + /*
> + * For read faults on private mappings the PFN passed
> + * in may not match the PFN we have mapped if the
> + * mapped PFN is a writeable COW page. In the mkwrite
> + * case we are creating a writable PTE for a shared
> + * mapping and we expect the PFNs to match. If they
> + * don't match, we are likely racing with block
> + * allocation and mapping invalidation so just skip the
> + * update.
> + */
> + if (pte_pfn(entry) != page_to_pfn(page)) {
> + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
> + return -EFAULT;
> + }
> + entry = maybe_mkwrite(entry, vma);
> + entry = pte_mkyoung(entry);
> + if (ptep_set_access_flags(vma, addr, pte, entry, 1))
> + update_mmu_cache(vma, addr, pte);
> + return 0;
> + }
> return -EBUSY;
If you do this like:
if (!mkwrite)
return -EBUSY;
You can reduce indentation of the big block and also making the flow more
obvious...
> + }
> +
> /* Ok, finally just insert the thing.. */
> folio_get(folio);
> + if (mkwrite)
> + entry = maybe_mkwrite(mk_pte(page, prot), vma);
> + else
> + entry = mk_pte(page, prot);
I'd prefer:
entry = mk_pte(page, prot);
if (mkwrite)
entry = maybe_mkwrite(entry, vma);
but I don't insist. Also insert_pfn() additionally has pte_mkyoung() and
pte_mkdirty(). Why was it left out here?
Honza
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
Jan Kara <jack@suse.cz> writes:
> On Thu 27-06-24 10:54:21, Alistair Popple wrote:
>> Currently to map a DAX page the DAX driver calls vmf_insert_pfn. This
>> creates a special devmap PTE entry for the pfn but does not take a
>> reference on the underlying struct page for the mapping. This is
>> because DAX page refcounts are treated specially, as indicated by the
>> presence of a devmap entry.
>>
>> To allow DAX page refcounts to be managed the same as normal page
>> refcounts introduce dax_insert_pfn. This will take a reference on the
>> underlying page much the same as vmf_insert_page, except it also
>> permits upgrading an existing mapping to be writable if
>> requested/possible.
>>
>> Signed-off-by: Alistair Popple <apopple@nvidia.com>
>
> Overall this looks good to me. Some comments below.
>
>> ---
>> include/linux/mm.h | 4 ++-
>> mm/memory.c | 79 ++++++++++++++++++++++++++++++++++++++++++-----
>> 2 files changed, 76 insertions(+), 7 deletions(-)
>>
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 9a5652c..b84368b 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1080,6 +1080,8 @@ int vma_is_stack_for_current(struct vm_area_struct *vma);
>> struct mmu_gather;
>> struct inode;
>>
>> +extern void prep_compound_page(struct page *page, unsigned int order);
>> +
>
> You don't seem to use this function in this patch?
Thanks, bad rebase splitting this up. It belongs later in the series.
>> diff --git a/mm/memory.c b/mm/memory.c
>> index ce48a05..4f26a1f 100644
>> --- a/mm/memory.c
>> +++ b/mm/memory.c
>> @@ -1989,14 +1989,42 @@ static int validate_page_before_insert(struct page *page)
>> }
>>
>> static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
>> - unsigned long addr, struct page *page, pgprot_t prot)
>> + unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite)
>> {
>> struct folio *folio = page_folio(page);
>> + pte_t entry = ptep_get(pte);
>>
>> - if (!pte_none(ptep_get(pte)))
>> + if (!pte_none(entry)) {
>> + if (mkwrite) {
>> + /*
>> + * For read faults on private mappings the PFN passed
>> + * in may not match the PFN we have mapped if the
>> + * mapped PFN is a writeable COW page. In the mkwrite
>> + * case we are creating a writable PTE for a shared
>> + * mapping and we expect the PFNs to match. If they
>> + * don't match, we are likely racing with block
>> + * allocation and mapping invalidation so just skip the
>> + * update.
>> + */
>> + if (pte_pfn(entry) != page_to_pfn(page)) {
>> + WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
>> + return -EFAULT;
>> + }
>> + entry = maybe_mkwrite(entry, vma);
>> + entry = pte_mkyoung(entry);
>> + if (ptep_set_access_flags(vma, addr, pte, entry, 1))
>> + update_mmu_cache(vma, addr, pte);
>> + return 0;
>> + }
>> return -EBUSY;
>
> If you do this like:
>
> if (!mkwrite)
> return -EBUSY;
>
> You can reduce indentation of the big block and also making the flow more
> obvious...
Good idea.
>> + }
>> +
>> /* Ok, finally just insert the thing.. */
>> folio_get(folio);
>> + if (mkwrite)
>> + entry = maybe_mkwrite(mk_pte(page, prot), vma);
>> + else
>> + entry = mk_pte(page, prot);
>
> I'd prefer:
>
> entry = mk_pte(page, prot);
> if (mkwrite)
> entry = maybe_mkwrite(entry, vma);
>
> but I don't insist. Also insert_pfn() additionally has pte_mkyoung() and
> pte_mkdirty(). Why was it left out here?
An oversight by me, thanks for pointing it out!
> Honza
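Taken together, the two changes agreed to above (early return for the
non-mkwrite case, plus following insert_pfn() for the young/dirty bits)
would leave the function looking roughly like this. A sketch only, not
the respun patch:

static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
		unsigned long addr, struct page *page, pgprot_t prot,
		bool mkwrite)
{
	struct folio *folio = page_folio(page);
	pte_t entry = ptep_get(pte);

	if (!pte_none(entry)) {
		if (!mkwrite)
			return -EBUSY;
		/* In the mkwrite case the PFNs must match, see above. */
		if (pte_pfn(entry) != page_to_pfn(page)) {
			WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
			return -EFAULT;
		}
		entry = maybe_mkwrite(entry, vma);
		entry = pte_mkyoung(entry);
		if (ptep_set_access_flags(vma, addr, pte, entry, 1))
			update_mmu_cache(vma, addr, pte);
		return 0;
	}

	/* Ok, finally just insert the thing.. */
	folio_get(folio);
	entry = mk_pte(page, prot);
	if (mkwrite) {
		entry = pte_mkyoung(entry);
		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
	}
	inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
	folio_add_file_rmap_pte(folio, page, vma);
	set_pte_at(vma->vm_mm, addr, pte, entry);
	return 0;
}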
On Thu, Jun 27, 2024 at 10:54:21AM +1000, Alistair Popple wrote:
> +extern void prep_compound_page(struct page *page, unsigned int order);

No need for the extern.

> static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
> - unsigned long addr, struct page *page, pgprot_t prot)
> + unsigned long addr, struct page *page, pgprot_t prot, bool mkwrite)

Overly long line.

> + retval = insert_page_into_pte_locked(vma, pte, addr, page, prot, mkwrite);

.. same here.

> +vm_fault_t dax_insert_pfn(struct vm_area_struct *vma,
> + unsigned long addr, pfn_t pfn_t, bool write)

This could probably use a kerneldoc comment.
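A kerneldoc block of the kind being asked for might read something like
the following; the wording is illustrative and not taken from a respin:

/**
 * dax_insert_pfn - insert a pfn into a user vma, taking a page reference
 * @vma: user vma to map into
 * @addr: target user address of this page
 * @pfn_t: pfn of the (DAX) page to insert
 * @write: whether this is a write fault
 *
 * Unlike vmf_insert_pfn() this takes a reference on the underlying
 * struct page, and if @write is true an existing read-only mapping of
 * the same pfn may be upgraded to writable.
 *
 * Return: VM_FAULT_NOPAGE on success, or an appropriate VM_FAULT_ error.
 */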