We need the ability to split PFN remap between updating the VMA and
performing the actual remap, in order to do away with the legacy f_op->mmap
hook.
To do so, update the PFN remap code to provide shared logic, and also make
remap_pfn_range_notrack() static, as its only user, io_mapping_map_user(),
was removed in commit 9a4f90e24661 ("mm: remove mm/io-mapping.c").
Then, introduce remap_pfn_range_prepare(), which accepts a VMA descriptor and
PFN parameters, and remap_pfn_range_complete(), which accepts the same
parameters as remap_pfn_range().
remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so
it must be supplied with a correct PFN to do so. If the caller must hold
locks to be able to do this, those locks should be held across the
operation, and mmap_abort() should be provided to revoke the lock should an
error arise.
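
As a rough illustration (not part of this patch - the mmap_prepare/
mmap_complete hook signatures and the device fields below are assumptions
for illustration only), a driver-side split might look like:

  #include <linux/fs.h>
  #include <linux/mm.h>

  /* Hypothetical driver state holding the base PFN of a contiguous buffer. */
  struct mydev {
          unsigned long base_pfn;
  };

  static int mydev_mmap_prepare(struct vm_area_desc *desc)
  {
          struct mydev *dev = desc->file->private_data;

          /* Sets vm_pgoff for the CoW case and ORs in VM_REMAP_FLAGS. */
          remap_pfn_range_prepare(desc, dev->base_pfn);
          return 0;
  }

  static int mydev_mmap_complete(struct file *file, struct vm_area_struct *vma)
  {
          struct mydev *dev = file->private_data;

          /* The VMA was set up at prepare time; only fill the page tables. */
          return remap_pfn_range_complete(vma, vma->vm_start, dev->base_pfn,
                                          vma->vm_end - vma->vm_start,
                                          vma->vm_page_prot);
  }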
While we're here, also clean up the duplicated #ifdef
__HAVE_PFNMAP_TRACKING check and fold it into a single #ifdef/#else block.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
include/linux/mm.h | 25 +++++++--
mm/memory.c | 128 ++++++++++++++++++++++++++++-----------------
2 files changed, 102 insertions(+), 51 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9d4508b20be3..0f59bf14cac3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -489,6 +489,21 @@ extern unsigned int kobjsize(const void *objp);
*/
#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
+/*
+ * Physically remapped pages are special. Tell the
+ * rest of the world about it:
+ * VM_IO tells people not to look at these pages
+ * (accesses can have side effects).
+ * VM_PFNMAP tells the core MM that the base pages are just
+ * raw PFN mappings, and do not have a "struct page" associated
+ * with them.
+ * VM_DONTEXPAND
+ * Disable vma merging and expanding with mremap().
+ * VM_DONTDUMP
+ * Omit vma from core dump, even when VM_IO turned off.
+ */
+#define VM_REMAP_FLAGS (VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP)
+
/* This mask prevents VMA from being scanned with khugepaged */
#define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
@@ -3611,10 +3626,12 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
struct vm_area_struct *find_extend_vma_locked(struct mm_struct *,
unsigned long addr);
-int remap_pfn_range(struct vm_area_struct *, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t);
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot);
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn);
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t pgprot);
+
int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *);
int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num);
diff --git a/mm/memory.c b/mm/memory.c
index d9de6c056179..f6234c54047f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2900,8 +2900,27 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
return 0;
}
+static int get_remap_pgoff(vm_flags_t vm_flags, unsigned long addr,
+ unsigned long end, unsigned long vm_start, unsigned long vm_end,
+ unsigned long pfn, pgoff_t *vm_pgoff_p)
+{
+ /*
+ * There's a horrible special case to handle copy-on-write
+ * behaviour that some programs depend on. We mark the "original"
+ * un-COW'ed pages by matching them up with "vma->vm_pgoff".
+ * See vm_normal_page() for details.
+ */
+ if (is_cow_mapping(vm_flags)) {
+ if (addr != vm_start || end != vm_end)
+ return -EINVAL;
+ *vm_pgoff_p = pfn;
+ }
+
+ return 0;
+}
+
static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+ unsigned long pfn, unsigned long size, pgprot_t prot, bool set_vma)
{
pgd_t *pgd;
unsigned long next;
@@ -2912,32 +2931,17 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
return -EINVAL;
- /*
- * Physically remapped pages are special. Tell the
- * rest of the world about it:
- * VM_IO tells people not to look at these pages
- * (accesses can have side effects).
- * VM_PFNMAP tells the core MM that the base pages are just
- * raw PFN mappings, and do not have a "struct page" associated
- * with them.
- * VM_DONTEXPAND
- * Disable vma merging and expanding with mremap().
- * VM_DONTDUMP
- * Omit vma from core dump, even when VM_IO turned off.
- *
- * There's a horrible special case to handle copy-on-write
- * behaviour that some programs depend on. We mark the "original"
- * un-COW'ed pages by matching them up with "vma->vm_pgoff".
- * See vm_normal_page() for details.
- */
- if (is_cow_mapping(vma->vm_flags)) {
- if (addr != vma->vm_start || end != vma->vm_end)
- return -EINVAL;
- vma->vm_pgoff = pfn;
+ if (set_vma) {
+ err = get_remap_pgoff(vma->vm_flags, addr, end,
+ vma->vm_start, vma->vm_end,
+ pfn, &vma->vm_pgoff);
+ if (err)
+ return err;
+ vm_flags_set(vma, VM_REMAP_FLAGS);
+ } else {
+ VM_WARN_ON_ONCE((vma->vm_flags & VM_REMAP_FLAGS) == VM_REMAP_FLAGS);
}
- vm_flags_set(vma, VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP);
-
BUG_ON(addr >= end);
pfn -= addr >> PAGE_SHIFT;
pgd = pgd_offset(mm, addr);
@@ -2957,11 +2961,10 @@ static int remap_pfn_range_internal(struct vm_area_struct *vma, unsigned long ad
* Variant of remap_pfn_range that does not call track_pfn_remap. The caller
* must have pre-validated the caching bits of the pgprot_t.
*/
-int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot, bool set_vma)
{
- int error = remap_pfn_range_internal(vma, addr, pfn, size, prot);
-
+ int error = remap_pfn_range_internal(vma, addr, pfn, size, prot, set_vma);
if (!error)
return 0;
@@ -2974,6 +2977,18 @@ int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
return error;
}
+void remap_pfn_range_prepare(struct vm_area_desc *desc, unsigned long pfn)
+{
+ /*
+ * We set addr=VMA start, end=VMA end here, so this won't fail, but we
+ * check it again on complete and will fail there if specified addr is
+ * invalid.
+ */
+ get_remap_pgoff(desc->vm_flags, desc->start, desc->end,
+ desc->start, desc->end, pfn, &desc->pgoff);
+ desc->vm_flags |= VM_REMAP_FLAGS;
+}
+
#ifdef __HAVE_PFNMAP_TRACKING
static inline struct pfnmap_track_ctx *pfnmap_track_ctx_alloc(unsigned long pfn,
unsigned long size, pgprot_t *prot)
@@ -3002,23 +3017,9 @@ void pfnmap_track_ctx_release(struct kref *ref)
pfnmap_untrack(ctx->pfn, ctx->size);
kfree(ctx);
}
-#endif /* __HAVE_PFNMAP_TRACKING */
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-#ifdef __HAVE_PFNMAP_TRACKING
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn, unsigned long size, pgprot_t prot)
+static int remap_pfn_range_track(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot, bool set_vma)
{
struct pfnmap_track_ctx *ctx = NULL;
int err;
@@ -3044,7 +3045,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
return -EINVAL;
}
- err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ err = remap_pfn_range_notrack(vma, addr, pfn, size, prot, set_vma);
if (ctx) {
if (err)
kref_put(&ctx->kref, pfnmap_track_ctx_release);
@@ -3054,11 +3055,44 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
return err;
}
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_track(vma, addr, pfn, size, prot,
+ /* set_vma = */true);
+}
+
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ /* With set_vma = false, the VMA will not be modified. */
+ return remap_pfn_range_track(vma, addr, pfn, size, prot,
+ /* set_vma = */false);
+}
#else
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
- return remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot, /* set_vma = */true);
+}
+
+int remap_pfn_range_complete(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+ return remap_pfn_range_notrack(vma, addr, pfn, size, prot,
+ /* set_vma = */false);
}
#endif
EXPORT_SYMBOL(remap_pfn_range);
--
2.51.0
On Mon, Sep 08, 2025 at 12:10:39PM +0100, Lorenzo Stoakes wrote:
> remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so
> it must be supplied with a correct PFN to do so. If the caller must hold
> locks to be able to do this, those locks should be held across the
> operation, and mmap_abort() should be provided to revoke the lock should an
> error arise.

It seems very strange to me that callers have to provide locks.

Today once mmap is called the vma priv should be allocated and access
to the PFN is allowed - access doesn't stop until the priv is
destroyed.

So whatever refcounting the driver must do to protect PFN must already
be in place and driven by the vma priv.

When split I'd expect the same thing the prepare should obtain the vma
priv and that locks the pfn. On complete the already affiliated PFN is
mapped to PTEs.

Why would any driver need a lock held to complete?

Arguably we should store the remap pfn in the desc and just make
complete a fully generic helper that fills the PTEs from the prepared
desc.

Jason
On Mon, Sep 08, 2025 at 10:00:15AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 12:10:39PM +0100, Lorenzo Stoakes wrote:
> > remap_pfn_range_prepare() will set the cow vma->vm_pgoff if necessary, so
> > it must be supplied with a correct PFN to do so. If the caller must hold
> > locks to be able to do this, those locks should be held across the
> > operation, and mmap_abort() should be provided to revoke the lock should an
> > error arise.
>
> It seems very strange to me that callers have to provide locks.
>
> Today once mmap is called the vma priv should be allocated and access
> to the PFN is allowed - access doesn't stop until the priv is
> destroyed.
>
> So whatever refcounting the driver must do to protect PFN must already
> be in place and driven by the vma priv.
>
> When split I'd expect the same thing the prepare should obtain the vma
> priv and that locks the pfn. On complete the already affiliated PFN is
> mapped to PTEs.
>
> Why would any driver need a lock held to complete?

In general, again we're splitting an operation that didn't use to be
split. A hook implementor may need to hold the lock in order to stabilise
whatever is required to be stabilised across the two (of course, with
careful consideration of the fact we're doing stuff between the two!)

It's not only remap that is a concern here, people do all kinds of weird
and wonderful things in .mmap(), sometimes in combination with remap.

This is what makes this so fun to try to change ;)

An implementor may also update state somehow which would need to be
altered should the operation fail, again something that would not have
needed to be considered previously, as it was all done in one.

> Arguably we should store the remap pfn in the desc and just make
> complete a fully generic helper that fills the PTEs from the prepared
> desc.

That's an interesting take actually. Though I don't think we can _always_
do that, as drivers again do weird and wonderful things and we need to
have maximum flexibility here.

But we could have a generic function that could speed some things up here,
and have that assume desc->mmap_context contains the PFN.

You can see patch 12/16 for an example of mmap_abort in action.

I also wonder if we should add remap_pfn_range_prepare_nocow() - which can
assert !is_cow_mapping(desc->vm_flags) - and then that self-documents the
cases where we don't actually need the PFN on prepare (this is only for
the hideous vm_pgoff hack for arches without a special page table flag).

> Jason

Cheers, Lorenzo
On Mon, Sep 08, 2025 at 02:27:12PM +0100, Lorenzo Stoakes wrote:
> It's not only remap that is a concern here, people do all kinds of weird
> and wonderful things in .mmap(), sometimes in combination with remap.

So it should really not be split this way, complete is a badly named
prepopulate and it should only fill the PTEs, which shouldn't need
more locking.

The only example in this series didn't actually need to hold the lock.

Jason
On Mon, Sep 08, 2025 at 10:35:38AM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 02:27:12PM +0100, Lorenzo Stoakes wrote:
>
> > It's not only remap that is a concern here, people do all kinds of weird
> > and wonderful things in .mmap(), sometimes in combination with remap.
>
> So it should really not be split this way, complete is a badly name

I don't understand, you think we can avoid splitting this in two? If so, I
disagree.

We have two stages, _intentionally_ placed to avoid the issues the
mmap_prepare series in the first instance worked to avoid:

1. 'Hey, how do we configure this VMA we have _not yet set up_'
2. 'OK it's set up, now do you want to do something else?'

I'm sorry but I'm not sure how we could otherwise do this.

Keep in mind re: point 1, we _need_ the VMA to be established enough to
check for merge etc. Another key aim of this change was to eliminate the
need for a merge re-check.

> prepopulate and it should only fill the PTEs, which shouldn't need
> more locking.
>
> The only example in this series didn't actually need to hold the lock.

There's ~250 more mmap callbacks to work through. Do you provide a
guarantee that:

- All 250 absolutely only need access to the VMAs to perform prepopulation
  of this nature.
- That absolutely none will set up state in the prepopulate step that
  might need to be unwound should an error arise?

Keeping in mind I must remain practical re: refactoring each caller.

I mean, let me go check what you say re: the resctl lock, if you're right
I could drop mmap_abort for now and add it later if needed.

But re: calling mmap_complete prepopulate, I don't really think that's
sensible. mmap_prepare is invoked at the point of the preparation of the
mapping, and mmap_complete is invoked once that preparation is complete to
allow further actions.

I'm obviously open to naming suggestions, but I think it's safer to
consistently refer to where we are in the lifecycle rather than presuming
what the caller might do. (I'd _prefer_ they always did just prepopulate,
but I just don't think we necessarily can).

> Jason

Cheers, Lorenzo
On Mon, Sep 08, 2025 at 03:18:46PM +0100, Lorenzo Stoakes wrote:
> On Mon, Sep 08, 2025 at 10:35:38AM -0300, Jason Gunthorpe wrote:
> > On Mon, Sep 08, 2025 at 02:27:12PM +0100, Lorenzo Stoakes wrote:
> >
> > > It's not only remap that is a concern here, people do all kinds of weird
> > > and wonderful things in .mmap(), sometimes in combination with remap.
> >
> > So it should really not be split this way, complete is a badly name
>
> I don't understand, you think we can avoid splitting this in two? If so, I
> disagree.

I'm saying to the greatest extent possible complete should only
populate PTEs.

We should refrain from trying to use it for other things, because it
shouldn't need to be there.

> > The only example in this series didn't actually need to hold the lock.
>
> There's ~250 more mmap callbacks to work through. Do you provide a guarantee
> that:

I'd be happy if only a small few need something weird and everything
else was aligned.

Jason
On Mon, Sep 08, 2025 at 01:03:06PM -0300, Jason Gunthorpe wrote:
> On Mon, Sep 08, 2025 at 03:18:46PM +0100, Lorenzo Stoakes wrote:
> > On Mon, Sep 08, 2025 at 10:35:38AM -0300, Jason Gunthorpe wrote:
> > > On Mon, Sep 08, 2025 at 02:27:12PM +0100, Lorenzo Stoakes wrote:
> > >
> > > > It's not only remap that is a concern here, people do all kinds of weird
> > > > and wonderful things in .mmap(), sometimes in combination with remap.
> > >
> > > So it should really not be split this way, complete is a badly name
> >
> > I don't understand, you think we can avoid splitting this in two? If so, I
> > disagree.
>
> I'm saying to the greatest extent possible complete should only
> populate PTEs.
>
> We should refrain from trying to use it for other things, because it
> shouldn't need to be there.

OK that sounds sensible, I will refactor to try to do only this in the
mmap_complete hook as far as is possible and see if I can use a generic
function also.

> > > The only example in this series didn't actually need to hold the lock.
> >
> > There's ~250 more mmap callbacks to work through. Do you provide a guarantee
> > that:
>
> I'd be happy if only a small few need something weird and everything
> else was aligned.

Ack!

> Jason

Cheers, Lorenzo
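
For reference, a very rough sketch of the "generic complete" idea discussed
above - purely illustrative, assuming the prepare hook stashes the PFN via
the descriptor as suggested in the thread (the mmap_context field, the hook
signatures and the mydev_base_pfn() helper are assumptions, not a final API):

  /*
   * Illustrative only: prepare stashes the base PFN in the descriptor's
   * mmap_context so that a single generic complete step can fill the PTEs.
   */
  static int mydev_mmap_prepare(struct vm_area_desc *desc)
  {
          unsigned long pfn = mydev_base_pfn(desc->file); /* hypothetical helper */

          remap_pfn_range_prepare(desc, pfn);
          desc->mmap_context = (void *)pfn;
          return 0;
  }

  /* One generic complete implementation could then serve any such driver. */
  static int generic_pfn_mmap_complete(struct file *file,
                                       struct vm_area_struct *vma,
                                       const void *context)
  {
          unsigned long pfn = (unsigned long)context;

          return remap_pfn_range_complete(vma, vma->vm_start, pfn,
                                          vma->vm_end - vma->vm_start,
                                          vma->vm_page_prot);
  }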