[PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers

Lorenzo Stoakes (Oracle) posted 16 patches 3 weeks ago
There is a newer version of this series
[PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers
Posted by Lorenzo Stoakes (Oracle) 3 weeks ago
While the conversion of mmap hooks to mmap_prepare is underway, we wil
encounter situations where mmap hooks need to invoke nested mmap_prepare
hooks.

The nesting of mmap hooks is termed 'stacking'.  In order to flexibly
facilitate the conversion of custom mmap hooks in drivers which stack, we
must split up the existing compat_vma_mapped() function into two separate
functions:

* compat_set_desc_from_vma() - This allows the setting of a vm_area_desc
  object's fields to the relevant fields of a VMA.

* __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a
  vm_area_desc object, this function performs any mmap actions specified by
  the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any
  were specified.

In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to
be invoked in a stacked mmap() hook, compat_vma_mmap() can be used.

However some drivers define their own nested hooks, which are invoked in
turn by another hook.

A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked
in turn by bin_attribute->mmap():

vmbus_channel->mmap_ring_buffer() has a signature of:

int (*mmap_ring_buffer)(struct vmbus_channel *channel,
			struct vm_area_struct *vma);

And bin_attribute->mmap() has a signature of:

	int (*mmap)(struct file *, struct kobject *,
		    const struct bin_attribute *attr,
		    struct vm_area_struct *vma);

And so compat_vma_mmap() cannot be used here for incremental conversion of
hooks from mmap() to mmap_prepare().

There are many such instances like this, where conversion to mmap_prepare
would otherwise cascade to a huge change set due to nesting of this kind.

The changes in this patch mean we could now instead convert
vmbus_channel->mmap_ring_buffer() to
vmbus_channel->mmap_prepare_ring_buffer(), and implement something like:

	struct vm_area_desc desc;
	int err;

	compat_set_desc_from_vm(&desc, file, vma);
	err = channel->mmap_prepare_ring_buffer(channel, &desc);
	if (err)
		return err;

	return __compat_vma_mmap(&desc, vma);

Allowing us to incrementally update this logic, and other logic like it.

Unfortunately, as part of this change, we need to be able to flexibly
assign to the VMA descriptor, so have to remove some of the const
declarations within the structure.

Also update the VMA tests to reflect the changes.

Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 include/linux/fs.h              |   3 +
 include/linux/mm_types.h        |   4 +-
 mm/util.c                       | 111 +++++++++++++++++++++++---------
 mm/vma.h                        |   2 +-
 tools/testing/vma/include/dup.h | 111 ++++++++++++++++++++------------
 5 files changed, 157 insertions(+), 74 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c390f5c667e3..0bdccfa70b44 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file)
 	return true;
 }
 
+void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file,
+			      const struct vm_area_struct *vma);
+int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
 int __vma_check_mmap_hook(struct vm_area_struct *vma);
 
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 50685cf29792..7538d64f8848 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(vma_flags_t *flags)
  */
 struct vm_area_desc {
 	/* Immutable state. */
-	const struct mm_struct *const mm;
-	struct file *const file; /* May vary from vm_file in stacked callers. */
+	struct mm_struct *mm;
+	struct file *file; /* May vary from vm_file in stacked callers. */
 	unsigned long start;
 	unsigned long end;
 
diff --git a/mm/util.c b/mm/util.c
index aa92e471afe1..a166c48fe894 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1163,34 +1163,38 @@ void flush_dcache_folio(struct folio *folio)
 EXPORT_SYMBOL(flush_dcache_folio);
 #endif
 
-static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
+/**
+ * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA.
+ * @desc: A VMA descriptor whose fields need to be set.
+ * @file: The file object describing the file being mmap()'d.
+ * @vma: The VMA whose fields we wish to assign to @desc.
+ *
+ * This is a compatibility function to allow an mmap() hook to call
+ * mmap_prepare() hooks when drivers nest these. This function specifically
+ * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for
+ * the purposes of doing this.
+ *
+ * Once the conversion of drivers is complete this function will no longer be
+ * required and will be removed.
+ */
+void compat_set_desc_from_vma(struct vm_area_desc *desc,
+			      const struct file *file,
+			      const struct vm_area_struct *vma)
 {
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
-
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vma_flags = vma->flags,
-		.page_prot = vma->vm_page_prot,
-
-		.action.type = MMAP_NOTHING, /* Default */
-	};
-	int err;
+	desc->mm = vma->vm_mm;
+	desc->file = (struct file *)file;
+	desc->start = vma->vm_start;
+	desc->end = vma->vm_end;
 
-	err = vfs_mmap_prepare(file, &desc);
-	if (err)
-		return err;
+	desc->pgoff = vma->vm_pgoff;
+	desc->vm_file = vma->vm_file;
+	desc->vma_flags = vma->flags;
+	desc->page_prot = vma->vm_page_prot;
 
-	err = mmap_action_prepare(&desc);
-	if (err)
-		return err;
-
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(vma, &desc.action);
+	/* Default. */
+	desc->action.type = MMAP_NOTHING;
 }
+EXPORT_SYMBOL(compat_set_desc_from_vma);
 
 static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
 {
@@ -1211,6 +1215,49 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
 	return err;
 }
 
+/**
+ * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows
+ * flexibility as to how the mmap_prepare callback is invoked, which is useful
+ * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook.
+ * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been
+ * executed.
+ * @vma: The VMA to which @desc should be applied.
+ *
+ * The function assumes that you have obtained a VMA descriptor @desc from
+ * compt_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
+ * it.
+ *
+ * It then performs any specified mmap actions, and invokes the vm_ops->mapped()
+ * hook if one is present.
+ *
+ * See the description of compat_vma_mmap() for more details.
+ *
+ * Once the conversion of drivers is complete this function will no longer be
+ * required and will be removed.
+ *
+ * Returns: 0 on success or error.
+ */
+int __compat_vma_mmap(struct vm_area_desc *desc,
+		      struct vm_area_struct *vma)
+{
+	int err;
+
+	/* Perform any preparatory tasks for mmap action. */
+	err = mmap_action_prepare(desc);
+	if (err)
+		return err;
+	/* Update the VMA from the descriptor. */
+	compat_set_vma_from_desc(vma, desc);
+	/* Complete any specified mmap actions. */
+	err = mmap_action_complete(vma, &desc->action);
+	if (err)
+		return err;
+
+	/* Invoke vm_ops->mapped callback. */
+	return __compat_vma_mapped(desc->file, vma);
+}
+EXPORT_SYMBOL(__compat_vma_mmap);
+
 /**
  * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
  * existing VMA and execute any requested actions.
@@ -1218,10 +1265,10 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
  * @vma: The VMA to apply the .mmap_prepare() hook to.
  *
  * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
- * stacked filesystems invoke a nested mmap hook of an underlying file.
+ * stacked drivers invoke a nested mmap hook of an underlying file.
  *
- * Until all filesystems are converted to use .mmap_prepare(), we must be
- * conservative and continue to invoke these stacked filesystems using the
+ * Until all drivers are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these stacked drivers using the
  * deprecated .mmap() hook.
  *
  * However we have a problem if the underlying file system possesses an
@@ -1232,20 +1279,22 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
  * establishes a struct vm_area_desc descriptor, passes to the underlying
  * .mmap_prepare() hook and applies any changes performed by it.
  *
- * Once the conversion of filesystems is complete this function will no longer
- * be required and will be removed.
+ * Once the conversion of drivers is complete this function will no longer be
+ * required and will be removed.
  *
  * Returns: 0 on success or error.
  */
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct vm_area_desc desc;
 	int err;
 
-	err = __compat_vma_mmap(file, vma);
+	compat_set_desc_from_vma(&desc, file, vma);
+	err = vfs_mmap_prepare(file, &desc);
 	if (err)
 		return err;
 
-	return __compat_vma_mapped(file, vma);
+	return __compat_vma_mmap(&desc, vma);
 }
 EXPORT_SYMBOL(compat_vma_mmap);
 
diff --git a/mm/vma.h b/mm/vma.h
index adc18f7dd9f1..a76046c39b14 100644
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -300,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
  * f_op->mmap() but which might have an underlying file system which implements
  * f_op->mmap_prepare().
  */
-static inline void set_vma_from_desc(struct vm_area_struct *vma,
+static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
 		struct vm_area_desc *desc)
 {
 	/*
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index 114daaef4f73..6658df26698a 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -519,8 +519,8 @@ enum vma_operation {
  */
 struct vm_area_desc {
 	/* Immutable state. */
-	const struct mm_struct *const mm;
-	struct file *const file; /* May vary from vm_file in stacked callers. */
+	struct mm_struct *mm;
+	struct file *file; /* May vary from vm_file in stacked callers. */
 	unsigned long start;
 	unsigned long end;
 
@@ -1272,43 +1272,92 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
 }
 
 /* Declared in vma.h. */
-static inline void set_vma_from_desc(struct vm_area_struct *vma,
+static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
 		struct vm_area_desc *desc);
 
-static inline int __compat_vma_mmap(const struct file_operations *f_op,
-		struct file *file, struct vm_area_struct *vma)
+static inline void compat_set_desc_from_vma(struct vm_area_desc *desc,
+			      const struct file *file,
+			      const struct vm_area_struct *vma)
 {
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
+	desc->mm = vma->vm_mm;
+	desc->file = (struct file *)file;
+	desc->start = vma->vm_start;
+	desc->end = vma->vm_end;
 
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vma_flags = vma->flags,
-		.page_prot = vma->vm_page_prot,
+	desc->pgoff = vma->vm_pgoff;
+	desc->vm_file = vma->vm_file;
+	desc->vma_flags = vma->flags;
+	desc->page_prot = vma->vm_page_prot;
 
-		.action.type = MMAP_NOTHING, /* Default */
-	};
+	/* Default. */
+	desc->action.type = MMAP_NOTHING;
+}
+
+static inline unsigned long vma_pages(const struct vm_area_struct *vma)
+{
+	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static inline void unmap_vma_locked(struct vm_area_struct *vma)
+{
+	const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+	mmap_assert_write_locked(vma->vm_mm);
+	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
+}
+
+static inline int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
+{
+	const struct vm_operations_struct *vm_ops = vma->vm_ops;
 	int err;
 
-	err = f_op->mmap_prepare(&desc);
+	if (!vm_ops->mapped)
+		return 0;
+
+	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
+			     &vma->vm_private_data);
 	if (err)
-		return err;
+		unmap_vma_locked(vma);
+	return err;
+}
 
-	err = mmap_action_prepare(&desc);
+static inline int __compat_vma_mmap(struct vm_area_desc *desc,
+		struct vm_area_struct *vma)
+{
+	int err;
+
+	/* Perform any preparatory tasks for mmap action. */
+	err = mmap_action_prepare(desc);
+	if (err)
+		return err;
+	/* Update the VMA from the descriptor. */
+	compat_set_vma_from_desc(vma, desc);
+	/* Complete any specified mmap actions. */
+	err = mmap_action_complete(vma, &desc->action);
 	if (err)
 		return err;
 
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(vma, &desc.action);
+	/* Invoke vm_ops->mapped callback. */
+	return __compat_vma_mapped(desc->file, vma);
+}
+
+static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
+{
+	return file->f_op->mmap_prepare(desc);
 }
 
 static inline int compat_vma_mmap(struct file *file,
 		struct vm_area_struct *vma)
 {
-	return __compat_vma_mmap(file->f_op, file, vma);
+	struct vm_area_desc desc;
+	int err;
+
+	compat_set_desc_from_vma(&desc, file, vma);
+	err = vfs_mmap_prepare(file, &desc);
+	if (err)
+		return err;
+
+	return __compat_vma_mmap(&desc, vma);
 }
 
 
@@ -1318,11 +1367,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
 	mas_init(&vmi->mas, &mm->mm_mt, addr);
 }
 
-static inline unsigned long vma_pages(struct vm_area_struct *vma)
-{
-	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-}
-
 static inline void mmap_assert_locked(struct mm_struct *);
 static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
 						unsigned long start_addr,
@@ -1492,11 +1536,6 @@ static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 	return file->f_op->mmap(file, vma);
 }
 
-static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
-{
-	return file->f_op->mmap_prepare(desc);
-}
-
 static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
 {
 	/* Changing an anonymous vma with this is illegal */
@@ -1521,11 +1560,3 @@ static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
 
 	return vm_get_page_prot(vm_flags);
 }
-
-static inline void unmap_vma_locked(struct vm_area_struct *vma)
-{
-	const size_t len = vma_pages(vma) << PAGE_SHIFT;
-
-	mmap_assert_write_locked(vma->vm_mm);
-	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
-}
-- 
2.53.0
Re: [PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers
Posted by Joshua Hahn 2 weeks, 5 days ago
On Mon, 16 Mar 2026 21:12:08 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:

> While the conversion of mmap hooks to mmap_prepare is underway, we wil
> encounter situations where mmap hooks need to invoke nested mmap_prepare
> hooks.
> 
> The nesting of mmap hooks is termed 'stacking'.  In order to flexibly
> facilitate the conversion of custom mmap hooks in drivers which stack, we
> must split up the existing compat_vma_mapped() function into two separate
> functions:
> 
> * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc
>   object's fields to the relevant fields of a VMA.

Hello Lorenzo, I hope you are doing well!

Thank you for this patch. I was developing on top of mm-new today and had
an error that I think was caused by this patch. I want to preface this by
saying that I am not at all familiar with this area of the code, so please
do forgive me if I've misinterpreted the crash and mistakenly pointed
at this commit : -)

Here is the crash:

[    1.083795] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[    1.083883] BUG: unable to handle page fault for address: ffa00000048efbb8
[    1.083957] #PF: supervisor instruction fetch in kernel mode
[    1.084030] #PF: error_code(0x0011) - permissions violation
[    1.084086] PGD 100000067 P4D 10035f067 PUD 100364067 PMD 441ed9067 PTE 80000004466a3163
[    1.084162] Oops: Oops: 0011 [#1] SMP
[    1.084218] CPU: 0 UID: 0 PID: 305 Comm: mkdir Tainted: G        W   E       7.0.0-rc4-virtme-00442-ge53de5a0302f-dirty #85 PREEMPTLAZY

As you can see, it's on a QEMU instance. I don't think this makes a difference
in the crash, though.

[    1.084321] Tainted: [W]=WARN, [E]=UNSIGNED_MODULE
[    1.084369] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-5.el9 11/05/2023
[    1.084450] RIP: 0010:0xffa00000048efbb8
[    1.084489] Code: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <40> 12 0e 00 01 00 11 ff d0 fa 8e 04 00 00 a0 ff 80 33 51 02 01 00
[    1.084642] RSP: 0018:ffa00000048ef998 EFLAGS: 00010286
[    1.084692] RAX: ffa00000048efbb8 RBX: ff11000102512cc0 RCX: 000000000000000d
[    1.084766] RDX: ffffffffa06247d0 RSI: ffa00000048efa18 RDI: ff11000102512cc0
[    1.084826] RBP: ffa00000048ef9c8 R08: 0000000000000000 R09: 0000000000000007
[    1.084889] R10: ff110001047d1f08 R11: 00007effdc3d0fff R12: ff110001047d3b00
[    1.084954] R13: ff11000446cae600 R14: ff110001024efe00 R15: ff11000102510a80
[    1.085021] FS:  0000000000000000(0000) GS:ff110004aae72000(0000) knlGS:0000000000000000
[    1.085083] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    1.085136] CR2: ffa00000048efbb8 CR3: 0000000102667001 CR4: 0000000000771ef0
[    1.085201] PKRU: 55555554
[    1.085228] Call Trace:
[    1.085248]  <TASK>
[    1.085274]  ? __compat_vma_mmap+0x8e/0x130
[    1.085318]  ? compat_vma_mmap+0x76/0x80
[    1.085354]  ? mas_alloc_nodes+0xb2/0x110
[    1.085390]  ? backing_file_mmap+0xc3/0xf0
[    1.085426]  ? ovl_mmap+0x41/0x50
[    1.085463]  ? ovl_mmap+0x50/0x50
[    1.085499]  ? __mmap_region+0x7e8/0x1100
[    1.085539]  ? do_mmap+0x49f/0x5e0
[    1.085573]  ? vm_mmap_pgoff+0xef/0x1e0
[    1.085609]  ? ksys_mmap_pgoff+0x15c/0x1f0
[    1.085647]  ? do_syscall_64+0xab/0x980
[    1.085684]  ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
[    1.085730]  </TASK>
[    1.085770] Modules linked in: virtio_mmio(E) 9pnet_virtio(E) 9p(E) 9pnet(E) netfs(E)
[    1.085838] CR2: ffa00000048efbb8
[    1.085874] ---[ end trace 0000000000000000 ]---
[    1.085875] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
[    1.085918] RIP: 0010:0xffa00000048efbb8
[    1.085921] Code: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <40> 12 0e 00 01 00 11 ff d0 fa 8e 04 00 00 a0 ff 80 33 51 02 01 00
[    1.085988] BUG: unable to handle page fault for address: ffa00000048f7bb8
[    1.086026] RSP: 0018:ffa00000048ef998 EFLAGS: 00010286
[    1.086166] #PF: supervisor instruction fetch in kernel mode
[    1.086221]
[    1.086267] #PF: error_code(0x0011) - permissions violation
[    1.086321] RAX: ffa00000048efbb8 RBX: ff11000102512cc0 RCX: 000000000000000d
[    1.086348] PGD 100000067
[    1.086394] RDX: ffffffffa06247d0 RSI: ffa00000048efa18 RDI: ff11000102512cc0
[    1.086459] P4D 10035f067
[    1.086486] RBP: ffa00000048ef9c8 R08: 0000000000000000 R09: 0000000000000007
[    1.086550] PUD 100364067
[    1.086577] R10: ff110001047d1f08 R11: 00007effdc3d0fff R12: ff110001047d3b00
[    1.086641] PMD 441ed9067
[    1.086668] R13: ff11000446cae600 R14: ff110001024efe00 R15: ff11000102510a80
[    1.086731] PTE 80000004433d3163
[    1.086764] FS:  0000000000000000(0000) GS:ff110004aae72000(0000) knlGS:0000000000000000
[    1.086829]
[    1.086868] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    1.086931] Oops: Oops: 0011 [#2] SMP
[    1.086958] CR2: ffa00000048efbb8 CR3: 0000000102667001 CR4: 0000000000771ef0
[    1.087015] CPU: 29 UID: 0 PID: 306 Comm: mount Tainted: G      D W   E       7.0.0-rc4-virtme-00442-ge53de5a0302f-dirty #85 PREEMPTLAZY
[    1.087050] PKRU: 55555554
[    1.087115] Tainted: [D]=DIE, [W]=WARN, [E]=UNSIGNED_MODULE
[    1.087207] Kernel panic - not syncing: Fatal exception
[    2.158392] Shutting down cpus with NMI
[    2.158629] Kernel Offset: disabled
[    2.158668] ---[ end Kernel panic - not syncing: Fatal exception ]---

It crashes at compat_vma_mmap, and here is what I think could be the 
potential crash path:

- compat_vma_mmap() creates struct vm_area_desc desc;
  - compat_set_desc_from_vma Doesn't initialize the struct, but instead
    modifies independent fields. I think this is where the behavior
    diverges, since before we would use the C initializer and uninitialized
    variables would be set to 0 (including ommitted ones, like
    action.success_hook or action.error_hook). But action.type = MMAP_NOTHING
  - desc.action.success_hook remains uninitialized in vfs_mmap_prepare
  - mmap_action_complete()
    - Here, We've set action.type to be MMAP_NOTHING, so we have err = 0
    - mmap_action_finish(action, vma, 0)
      - And here, since err == 0, we check action->success_hook (which has
        garbage, therefore it's nonzero) and call action->success_hook(vma)

And I think action->success_hook(vma) where success_hook is uninitialized
stack garbage gets me to where I am.

Again, I'm not too familiar with this area of the kernel, this is just
based on the quick digging that I did. And aplogies again if I'm missing
something ; -) I do think that the uninitialized members could be a problem
though.

Thank you, I hope you have a great day Lorenzo!
Joshua
Re: [PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers
Posted by Lorenzo Stoakes (Oracle) 2 weeks, 4 days ago
On Wed, Mar 18, 2026 at 02:08:45PM -0700, Joshua Hahn wrote:
> On Mon, 16 Mar 2026 21:12:08 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
>
> > While the conversion of mmap hooks to mmap_prepare is underway, we wil
> > encounter situations where mmap hooks need to invoke nested mmap_prepare
> > hooks.
> >
> > The nesting of mmap hooks is termed 'stacking'.  In order to flexibly
> > facilitate the conversion of custom mmap hooks in drivers which stack, we
> > must split up the existing compat_vma_mapped() function into two separate
> > functions:
> >
> > * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc
> >   object's fields to the relevant fields of a VMA.
>
> Hello Lorenzo, I hope you are doing well!
>
> Thank you for this patch. I was developing on top of mm-new today and had
> an error that I think was caused by this patch. I want to preface this by
> saying that I am not at all familiar with this area of the code, so please
> do forgive me if I've misinterpreted the crash and mistakenly pointed
> at this commit : -)
>
> Here is the crash:
>
> [    1.083795] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
> [    1.083883] BUG: unable to handle page fault for address: ffa00000048efbb8
> [    1.083957] #PF: supervisor instruction fetch in kernel mode
> [    1.084030] #PF: error_code(0x0011) - permissions violation
> [    1.084086] PGD 100000067 P4D 10035f067 PUD 100364067 PMD 441ed9067 PTE 80000004466a3163
> [    1.084162] Oops: Oops: 0011 [#1] SMP
> [    1.084218] CPU: 0 UID: 0 PID: 305 Comm: mkdir Tainted: G        W   E       7.0.0-rc4-virtme-00442-ge53de5a0302f-dirty #85 PREEMPTLAZY
>
> As you can see, it's on a QEMU instance. I don't think this makes a difference
> in the crash, though.
>
> [    1.084321] Tainted: [W]=WARN, [E]=UNSIGNED_MODULE
> [    1.084369] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-5.el9 11/05/2023
> [    1.084450] RIP: 0010:0xffa00000048efbb8
> [    1.084489] Code: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <40> 12 0e 00 01 00 11 ff d0 fa 8e 04 00 00 a0 ff 80 33 51 02 01 00
> [    1.084642] RSP: 0018:ffa00000048ef998 EFLAGS: 00010286
> [    1.084692] RAX: ffa00000048efbb8 RBX: ff11000102512cc0 RCX: 000000000000000d
> [    1.084766] RDX: ffffffffa06247d0 RSI: ffa00000048efa18 RDI: ff11000102512cc0
> [    1.084826] RBP: ffa00000048ef9c8 R08: 0000000000000000 R09: 0000000000000007
> [    1.084889] R10: ff110001047d1f08 R11: 00007effdc3d0fff R12: ff110001047d3b00
> [    1.084954] R13: ff11000446cae600 R14: ff110001024efe00 R15: ff11000102510a80
> [    1.085021] FS:  0000000000000000(0000) GS:ff110004aae72000(0000) knlGS:0000000000000000
> [    1.085083] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    1.085136] CR2: ffa00000048efbb8 CR3: 0000000102667001 CR4: 0000000000771ef0
> [    1.085201] PKRU: 55555554
> [    1.085228] Call Trace:
> [    1.085248]  <TASK>
> [    1.085274]  ? __compat_vma_mmap+0x8e/0x130
> [    1.085318]  ? compat_vma_mmap+0x76/0x80
> [    1.085354]  ? mas_alloc_nodes+0xb2/0x110
> [    1.085390]  ? backing_file_mmap+0xc3/0xf0
> [    1.085426]  ? ovl_mmap+0x41/0x50
> [    1.085463]  ? ovl_mmap+0x50/0x50
> [    1.085499]  ? __mmap_region+0x7e8/0x1100
> [    1.085539]  ? do_mmap+0x49f/0x5e0
> [    1.085573]  ? vm_mmap_pgoff+0xef/0x1e0
> [    1.085609]  ? ksys_mmap_pgoff+0x15c/0x1f0
> [    1.085647]  ? do_syscall_64+0xab/0x980
> [    1.085684]  ? entry_SYSCALL_64_after_hwframe+0x4b/0x53
> [    1.085730]  </TASK>
> [    1.085770] Modules linked in: virtio_mmio(E) 9pnet_virtio(E) 9p(E) 9pnet(E) netfs(E)
> [    1.085838] CR2: ffa00000048efbb8
> [    1.085874] ---[ end trace 0000000000000000 ]---
> [    1.085875] kernel tried to execute NX-protected page - exploit attempt? (uid: 0)
> [    1.085918] RIP: 0010:0xffa00000048efbb8
> [    1.085921] Code: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 <40> 12 0e 00 01 00 11 ff d0 fa 8e 04 00 00 a0 ff 80 33 51 02 01 00
> [    1.085988] BUG: unable to handle page fault for address: ffa00000048f7bb8
> [    1.086026] RSP: 0018:ffa00000048ef998 EFLAGS: 00010286
> [    1.086166] #PF: supervisor instruction fetch in kernel mode
> [    1.086221]
> [    1.086267] #PF: error_code(0x0011) - permissions violation
> [    1.086321] RAX: ffa00000048efbb8 RBX: ff11000102512cc0 RCX: 000000000000000d
> [    1.086348] PGD 100000067
> [    1.086394] RDX: ffffffffa06247d0 RSI: ffa00000048efa18 RDI: ff11000102512cc0
> [    1.086459] P4D 10035f067
> [    1.086486] RBP: ffa00000048ef9c8 R08: 0000000000000000 R09: 0000000000000007
> [    1.086550] PUD 100364067
> [    1.086577] R10: ff110001047d1f08 R11: 00007effdc3d0fff R12: ff110001047d3b00
> [    1.086641] PMD 441ed9067
> [    1.086668] R13: ff11000446cae600 R14: ff110001024efe00 R15: ff11000102510a80
> [    1.086731] PTE 80000004433d3163
> [    1.086764] FS:  0000000000000000(0000) GS:ff110004aae72000(0000) knlGS:0000000000000000
> [    1.086829]
> [    1.086868] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [    1.086931] Oops: Oops: 0011 [#2] SMP
> [    1.086958] CR2: ffa00000048efbb8 CR3: 0000000102667001 CR4: 0000000000771ef0
> [    1.087015] CPU: 29 UID: 0 PID: 306 Comm: mount Tainted: G      D W   E       7.0.0-rc4-virtme-00442-ge53de5a0302f-dirty #85 PREEMPTLAZY
> [    1.087050] PKRU: 55555554
> [    1.087115] Tainted: [D]=DIE, [W]=WARN, [E]=UNSIGNED_MODULE
> [    1.087207] Kernel panic - not syncing: Fatal exception
> [    2.158392] Shutting down cpus with NMI
> [    2.158629] Kernel Offset: disabled
> [    2.158668] ---[ end Kernel panic - not syncing: Fatal exception ]---
>
> It crashes at compat_vma_mmap, and here is what I think could be the
> potential crash path:
>
> - compat_vma_mmap() creates struct vm_area_desc desc;
>   - compat_set_desc_from_vma Doesn't initialize the struct, but instead
>     modifies independent fields. I think this is where the behavior
>     diverges, since before we would use the C initializer and uninitialized

Ah yeah you're right I'll fix that up!

>     variables would be set to 0 (including ommitted ones, like
>     action.success_hook or action.error_hook). But action.type = MMAP_NOTHING
>   - desc.action.success_hook remains uninitialized in vfs_mmap_prepare
>   - mmap_action_complete()
>     - Here, We've set action.type to be MMAP_NOTHING, so we have err = 0
>     - mmap_action_finish(action, vma, 0)
>       - And here, since err == 0, we check action->success_hook (which has
>         garbage, therefore it's nonzero) and call action->success_hook(vma)
>
> And I think action->success_hook(vma) where success_hook is uninitialized
> stack garbage gets me to where I am.
>
> Again, I'm not too familiar with this area of the kernel, this is just
> based on the quick digging that I did. And aplogies again if I'm missing
> something ; -) I do think that the uninitialized members could be a problem
> though.
>
> Thank you, I hope you have a great day Lorenzo!
> Joshua

Thanks for the report and analysis, much appreciated, hope you have a great
day too :)

Cheers, Lorenzo
Re: [PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers
Posted by Suren Baghdasaryan 2 weeks, 5 days ago
On Mon, Mar 16, 2026 at 2:14 PM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
>
> While the conversion of mmap hooks to mmap_prepare is underway, we wil

nit: s/wil/will

> encounter situations where mmap hooks need to invoke nested mmap_prepare
> hooks.
>
> The nesting of mmap hooks is termed 'stacking'.  In order to flexibly
> facilitate the conversion of custom mmap hooks in drivers which stack, we
> must split up the existing compat_vma_mapped() function into two separate
> functions:
>
> * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc
>   object's fields to the relevant fields of a VMA.
>
> * __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a
>   vm_area_desc object, this function performs any mmap actions specified by
>   the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any
>   were specified.
>
> In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to
> be invoked in a stacked mmap() hook, compat_vma_mmap() can be used.
>
> However some drivers define their own nested hooks, which are invoked in
> turn by another hook.
>
> A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked
> in turn by bin_attribute->mmap():
>
> vmbus_channel->mmap_ring_buffer() has a signature of:
>
> int (*mmap_ring_buffer)(struct vmbus_channel *channel,
>                         struct vm_area_struct *vma);
>
> And bin_attribute->mmap() has a signature of:
>
>         int (*mmap)(struct file *, struct kobject *,
>                     const struct bin_attribute *attr,
>                     struct vm_area_struct *vma);
>
> And so compat_vma_mmap() cannot be used here for incremental conversion of
> hooks from mmap() to mmap_prepare().
>
> There are many such instances like this, where conversion to mmap_prepare
> would otherwise cascade to a huge change set due to nesting of this kind.
>
> The changes in this patch mean we could now instead convert
> vmbus_channel->mmap_ring_buffer() to
> vmbus_channel->mmap_prepare_ring_buffer(), and implement something like:
>
>         struct vm_area_desc desc;
>         int err;
>
>         compat_set_desc_from_vm(&desc, file, vma);
>         err = channel->mmap_prepare_ring_buffer(channel, &desc);
>         if (err)
>                 return err;
>
>         return __compat_vma_mmap(&desc, vma);
>
> Allowing us to incrementally update this logic, and other logic like it.

The way I understand this and the next 2 patches is that they are
preperations for later replacement of mmap() with mmap_prepare() but
they don't yet do that completely. Is that right?
To clarify what I mean, in [1] for example, you are replacing struct
uio_info.mmap with uio_info.mmap_prepare but it's still being called
from uio_mmap(). IOW, you are not replacing uio_mmap with
uio_mmap_prepare. Is that the next step that's not yet implemented?

[1] https://lore.kernel.org/all/892a8b32e5ef64c69239ccc2d1bd364716fd7fdf.1773695307.git.ljs@kernel.org/

>
> Unfortunately, as part of this change, we need to be able to flexibly
> assign to the VMA descriptor, so have to remove some of the const
> declarations within the structure.
>
> Also update the VMA tests to reflect the changes.
>
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ---
>  include/linux/fs.h              |   3 +
>  include/linux/mm_types.h        |   4 +-
>  mm/util.c                       | 111 +++++++++++++++++++++++---------
>  mm/vma.h                        |   2 +-
>  tools/testing/vma/include/dup.h | 111 ++++++++++++++++++++------------
>  5 files changed, 157 insertions(+), 74 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index c390f5c667e3..0bdccfa70b44 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file)
>         return true;
>  }
>
> +void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file,
> +                             const struct vm_area_struct *vma);
> +int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
>  int __vma_check_mmap_hook(struct vm_area_struct *vma);
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 50685cf29792..7538d64f8848 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(vma_flags_t *flags)
>   */
>  struct vm_area_desc {
>         /* Immutable state. */
> -       const struct mm_struct *const mm;
> -       struct file *const file; /* May vary from vm_file in stacked callers. */
> +       struct mm_struct *mm;
> +       struct file *file; /* May vary from vm_file in stacked callers. */
>         unsigned long start;
>         unsigned long end;
>
> diff --git a/mm/util.c b/mm/util.c
> index aa92e471afe1..a166c48fe894 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1163,34 +1163,38 @@ void flush_dcache_folio(struct folio *folio)
>  EXPORT_SYMBOL(flush_dcache_folio);
>  #endif
>
> -static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> +/**
> + * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA.
> + * @desc: A VMA descriptor whose fields need to be set.
> + * @file: The file object describing the file being mmap()'d.
> + * @vma: The VMA whose fields we wish to assign to @desc.
> + *
> + * This is a compatibility function to allow an mmap() hook to call
> + * mmap_prepare() hooks when drivers nest these. This function specifically
> + * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for
> + * the purposes of doing this.
> + *
> + * Once the conversion of drivers is complete this function will no longer be
> + * required and will be removed.
> + */
> +void compat_set_desc_from_vma(struct vm_area_desc *desc,
> +                             const struct file *file,
> +                             const struct vm_area_struct *vma)
>  {
> -       struct vm_area_desc desc = {
> -               .mm = vma->vm_mm,
> -               .file = file,
> -               .start = vma->vm_start,
> -               .end = vma->vm_end,
> -
> -               .pgoff = vma->vm_pgoff,
> -               .vm_file = vma->vm_file,
> -               .vma_flags = vma->flags,
> -               .page_prot = vma->vm_page_prot,
> -
> -               .action.type = MMAP_NOTHING, /* Default */
> -       };
> -       int err;
> +       desc->mm = vma->vm_mm;
> +       desc->file = (struct file *)file;
> +       desc->start = vma->vm_start;
> +       desc->end = vma->vm_end;
>
> -       err = vfs_mmap_prepare(file, &desc);
> -       if (err)
> -               return err;
> +       desc->pgoff = vma->vm_pgoff;
> +       desc->vm_file = vma->vm_file;
> +       desc->vma_flags = vma->flags;
> +       desc->page_prot = vma->vm_page_prot;
>
> -       err = mmap_action_prepare(&desc);
> -       if (err)
> -               return err;
> -
> -       set_vma_from_desc(vma, &desc);
> -       return mmap_action_complete(vma, &desc.action);
> +       /* Default. */
> +       desc->action.type = MMAP_NOTHING;
>  }
> +EXPORT_SYMBOL(compat_set_desc_from_vma);
>
>  static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
>  {
> @@ -1211,6 +1215,49 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
>         return err;
>  }
>
> +/**
> + * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows
> + * flexibility as to how the mmap_prepare callback is invoked, which is useful
> + * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook.
> + * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been
> + * executed.
> + * @vma: The VMA to which @desc should be applied.
> + *
> + * The function assumes that you have obtained a VMA descriptor @desc from
> + * compt_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
> + * it.
> + *
> + * It then performs any specified mmap actions, and invokes the vm_ops->mapped()
> + * hook if one is present.
> + *
> + * See the description of compat_vma_mmap() for more details.
> + *
> + * Once the conversion of drivers is complete this function will no longer be
> + * required and will be removed.
> + *
> + * Returns: 0 on success or error.
> + */
> +int __compat_vma_mmap(struct vm_area_desc *desc,
> +                     struct vm_area_struct *vma)
> +{
> +       int err;
> +
> +       /* Perform any preparatory tasks for mmap action. */
> +       err = mmap_action_prepare(desc);
> +       if (err)
> +               return err;
> +       /* Update the VMA from the descriptor. */
> +       compat_set_vma_from_desc(vma, desc);
> +       /* Complete any specified mmap actions. */
> +       err = mmap_action_complete(vma, &desc->action);
> +       if (err)
> +               return err;
> +
> +       /* Invoke vm_ops->mapped callback. */
> +       return __compat_vma_mapped(desc->file, vma);
> +}
> +EXPORT_SYMBOL(__compat_vma_mmap);
> +
>  /**
>   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
>   * existing VMA and execute any requested actions.
> @@ -1218,10 +1265,10 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
>   * @vma: The VMA to apply the .mmap_prepare() hook to.
>   *
>   * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
> - * stacked filesystems invoke a nested mmap hook of an underlying file.
> + * stacked drivers invoke a nested mmap hook of an underlying file.
>   *
> - * Until all filesystems are converted to use .mmap_prepare(), we must be
> - * conservative and continue to invoke these stacked filesystems using the
> + * Until all drivers are converted to use .mmap_prepare(), we must be
> + * conservative and continue to invoke these stacked drivers using the
>   * deprecated .mmap() hook.
>   *
>   * However we have a problem if the underlying file system possesses an
> @@ -1232,20 +1279,22 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
>   * establishes a struct vm_area_desc descriptor, passes to the underlying
>   * .mmap_prepare() hook and applies any changes performed by it.
>   *
> - * Once the conversion of filesystems is complete this function will no longer
> - * be required and will be removed.
> + * Once the conversion of drivers is complete this function will no longer be
> + * required and will be removed.
>   *
>   * Returns: 0 on success or error.
>   */
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +       struct vm_area_desc desc;
>         int err;
>
> -       err = __compat_vma_mmap(file, vma);
> +       compat_set_desc_from_vma(&desc, file, vma);
> +       err = vfs_mmap_prepare(file, &desc);
>         if (err)
>                 return err;
>
> -       return __compat_vma_mapped(file, vma);
> +       return __compat_vma_mmap(&desc, vma);
>  }
>  EXPORT_SYMBOL(compat_vma_mmap);
>
> diff --git a/mm/vma.h b/mm/vma.h
> index adc18f7dd9f1..a76046c39b14 100644
> --- a/mm/vma.h
> +++ b/mm/vma.h
> @@ -300,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
>   * f_op->mmap() but which might have an underlying file system which implements
>   * f_op->mmap_prepare().
>   */
> -static inline void set_vma_from_desc(struct vm_area_struct *vma,
> +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
>                 struct vm_area_desc *desc)
>  {
>         /*
> diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> index 114daaef4f73..6658df26698a 100644
> --- a/tools/testing/vma/include/dup.h
> +++ b/tools/testing/vma/include/dup.h
> @@ -519,8 +519,8 @@ enum vma_operation {
>   */
>  struct vm_area_desc {
>         /* Immutable state. */
> -       const struct mm_struct *const mm;
> -       struct file *const file; /* May vary from vm_file in stacked callers. */
> +       struct mm_struct *mm;
> +       struct file *file; /* May vary from vm_file in stacked callers. */
>         unsigned long start;
>         unsigned long end;
>
> @@ -1272,43 +1272,92 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
>  }
>
>  /* Declared in vma.h. */
> -static inline void set_vma_from_desc(struct vm_area_struct *vma,
> +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
>                 struct vm_area_desc *desc);
>
> -static inline int __compat_vma_mmap(const struct file_operations *f_op,
> -               struct file *file, struct vm_area_struct *vma)
> +static inline void compat_set_desc_from_vma(struct vm_area_desc *desc,
> +                             const struct file *file,
> +                             const struct vm_area_struct *vma)
>  {
> -       struct vm_area_desc desc = {
> -               .mm = vma->vm_mm,
> -               .file = file,
> -               .start = vma->vm_start,
> -               .end = vma->vm_end,
> +       desc->mm = vma->vm_mm;
> +       desc->file = (struct file *)file;
> +       desc->start = vma->vm_start;
> +       desc->end = vma->vm_end;
>
> -               .pgoff = vma->vm_pgoff,
> -               .vm_file = vma->vm_file,
> -               .vma_flags = vma->flags,
> -               .page_prot = vma->vm_page_prot,
> +       desc->pgoff = vma->vm_pgoff;
> +       desc->vm_file = vma->vm_file;
> +       desc->vma_flags = vma->flags;
> +       desc->page_prot = vma->vm_page_prot;
>
> -               .action.type = MMAP_NOTHING, /* Default */
> -       };
> +       /* Default. */
> +       desc->action.type = MMAP_NOTHING;
> +}
> +
> +static inline unsigned long vma_pages(const struct vm_area_struct *vma)
> +{
> +       return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> +}
> +
> +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> +{
> +       const size_t len = vma_pages(vma) << PAGE_SHIFT;
> +
> +       mmap_assert_write_locked(vma->vm_mm);
> +       do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> +}
> +
> +static inline int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> +{
> +       const struct vm_operations_struct *vm_ops = vma->vm_ops;
>         int err;
>
> -       err = f_op->mmap_prepare(&desc);
> +       if (!vm_ops->mapped)
> +               return 0;
> +
> +       err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> +                            &vma->vm_private_data);
>         if (err)
> -               return err;
> +               unmap_vma_locked(vma);
> +       return err;
> +}
>
> -       err = mmap_action_prepare(&desc);
> +static inline int __compat_vma_mmap(struct vm_area_desc *desc,
> +               struct vm_area_struct *vma)
> +{
> +       int err;
> +
> +       /* Perform any preparatory tasks for mmap action. */
> +       err = mmap_action_prepare(desc);
> +       if (err)
> +               return err;
> +       /* Update the VMA from the descriptor. */
> +       compat_set_vma_from_desc(vma, desc);
> +       /* Complete any specified mmap actions. */
> +       err = mmap_action_complete(vma, &desc->action);
>         if (err)
>                 return err;
>
> -       set_vma_from_desc(vma, &desc);
> -       return mmap_action_complete(vma, &desc.action);
> +       /* Invoke vm_ops->mapped callback. */
> +       return __compat_vma_mapped(desc->file, vma);
> +}
> +
> +static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> +{
> +       return file->f_op->mmap_prepare(desc);
>  }
>
>  static inline int compat_vma_mmap(struct file *file,
>                 struct vm_area_struct *vma)
>  {
> -       return __compat_vma_mmap(file->f_op, file, vma);
> +       struct vm_area_desc desc;
> +       int err;
> +
> +       compat_set_desc_from_vma(&desc, file, vma);
> +       err = vfs_mmap_prepare(file, &desc);
> +       if (err)
> +               return err;
> +
> +       return __compat_vma_mmap(&desc, vma);
>  }
>
>
> @@ -1318,11 +1367,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
>         mas_init(&vmi->mas, &mm->mm_mt, addr);
>  }
>
> -static inline unsigned long vma_pages(struct vm_area_struct *vma)
> -{
> -       return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> -}
> -
>  static inline void mmap_assert_locked(struct mm_struct *);
>  static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
>                                                 unsigned long start_addr,
> @@ -1492,11 +1536,6 @@ static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
>         return file->f_op->mmap(file, vma);
>  }
>
> -static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> -{
> -       return file->f_op->mmap_prepare(desc);
> -}
> -
>  static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
>  {
>         /* Changing an anonymous vma with this is illegal */
> @@ -1521,11 +1560,3 @@ static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
>
>         return vm_get_page_prot(vm_flags);
>  }
> -
> -static inline void unmap_vma_locked(struct vm_area_struct *vma)
> -{
> -       const size_t len = vma_pages(vma) << PAGE_SHIFT;
> -
> -       mmap_assert_write_locked(vma->vm_mm);
> -       do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> -}
> --
> 2.53.0
>
Re: [PATCH v2 12/16] mm: allow handling of stacked mmap_prepare hooks in more drivers
Posted by Lorenzo Stoakes (Oracle) 2 weeks, 4 days ago
On Wed, Mar 18, 2026 at 08:33:28AM -0700, Suren Baghdasaryan wrote:
> On Mon, Mar 16, 2026 at 2:14 PM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
> >
> > While the conversion of mmap hooks to mmap_prepare is underway, we wil
>
> nit: s/wil/will

Thanks, fixed.

>
> > encounter situations where mmap hooks need to invoke nested mmap_prepare
> > hooks.
> >
> > The nesting of mmap hooks is termed 'stacking'.  In order to flexibly
> > facilitate the conversion of custom mmap hooks in drivers which stack, we
> > must split up the existing compat_vma_mapped() function into two separate
> > functions:
> >
> > * compat_set_desc_from_vma() - This allows the setting of a vm_area_desc
> >   object's fields to the relevant fields of a VMA.
> >
> > * __compat_vma_mmap() - Once an mmap_prepare hook has been executed upon a
> >   vm_area_desc object, this function performs any mmap actions specified by
> >   the mmap_prepare hook and then invokes its vm_ops->mapped() hook if any
> >   were specified.
> >
> > In ordinary cases, where a file's f_op->mmap_prepare() hook simply needs to
> > be invoked in a stacked mmap() hook, compat_vma_mmap() can be used.
> >
> > However some drivers define their own nested hooks, which are invoked in
> > turn by another hook.
> >
> > A concrete example is vmbus_channel->mmap_ring_buffer(), which is invoked
> > in turn by bin_attribute->mmap():
> >
> > vmbus_channel->mmap_ring_buffer() has a signature of:
> >
> > int (*mmap_ring_buffer)(struct vmbus_channel *channel,
> >                         struct vm_area_struct *vma);
> >
> > And bin_attribute->mmap() has a signature of:
> >
> >         int (*mmap)(struct file *, struct kobject *,
> >                     const struct bin_attribute *attr,
> >                     struct vm_area_struct *vma);
> >
> > And so compat_vma_mmap() cannot be used here for incremental conversion of
> > hooks from mmap() to mmap_prepare().
> >
> > There are many such instances like this, where conversion to mmap_prepare
> > would otherwise cascade to a huge change set due to nesting of this kind.
> >
> > The changes in this patch mean we could now instead convert
> > vmbus_channel->mmap_ring_buffer() to
> > vmbus_channel->mmap_prepare_ring_buffer(), and implement something like:
> >
> >         struct vm_area_desc desc;
> >         int err;
> >
> >         compat_set_desc_from_vm(&desc, file, vma);
> >         err = channel->mmap_prepare_ring_buffer(channel, &desc);
> >         if (err)
> >                 return err;
> >
> >         return __compat_vma_mmap(&desc, vma);
> >
> > Allowing us to incrementally update this logic, and other logic like it.
>
> The way I understand this and the next 2 patches is that they are
> preperations for later replacement of mmap() with mmap_prepare() but
> they don't yet do that completely. Is that right?
> To clarify what I mean, in [1] for example, you are replacing struct
> uio_info.mmap with uio_info.mmap_prepare but it's still being called
> from uio_mmap(). IOW, you are not replacing uio_mmap with
> uio_mmap_prepare. Is that the next step that's not yet implemented?

Yeah, there were 12 more patches I didn't send :) because I feel they'd
make more sense separate and wanted to test/develop them more.

This is all laying the groundwork for having mmap_prepare DMA cache
mappings ultimately, while expanding functionality as we go.

The intent here though isn't _just_ that, it's more - in general - when we
have an e.g.:

int some_special_mmap(struct some_type *blah, struct file *filp /* sometimes */,
		      struct vm_area_struct *vma)
{
	...
}

Or some say custom ops for something like this, where in the existing code
callers hook .mmap(), grab some specific struct (like a device pointer or a
state pointer) from the file private data and then delegate to another
helper.

In this situation, we are able to use the compatibility layer to change say
the ops to be .mmap_prepare instead while the overarching caller is .mmap.

This allows for iterative conversion to .mmap_prepare without having to
amend 100 files at once in a multi-thousand line patch touching dozens of
drivers or some hellish notion like that.

This is vital to sensibly being able to implement these changes bit-by-bit.

Cheers, Lorenzo

>
> [1] https://lore.kernel.org/all/892a8b32e5ef64c69239ccc2d1bd364716fd7fdf.1773695307.git.ljs@kernel.org/
>
> >
> > Unfortunately, as part of this change, we need to be able to flexibly
> > assign to the VMA descriptor, so have to remove some of the const
> > declarations within the structure.
> >
> > Also update the VMA tests to reflect the changes.
> >
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > ---
> >  include/linux/fs.h              |   3 +
> >  include/linux/mm_types.h        |   4 +-
> >  mm/util.c                       | 111 +++++++++++++++++++++++---------
> >  mm/vma.h                        |   2 +-
> >  tools/testing/vma/include/dup.h | 111 ++++++++++++++++++++------------
> >  5 files changed, 157 insertions(+), 74 deletions(-)
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index c390f5c667e3..0bdccfa70b44 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2058,6 +2058,9 @@ static inline bool can_mmap_file(struct file *file)
> >         return true;
> >  }
> >
> > +void compat_set_desc_from_vma(struct vm_area_desc *desc, const struct file *file,
> > +                             const struct vm_area_struct *vma);
> > +int __compat_vma_mmap(struct vm_area_desc *desc, struct vm_area_struct *vma);
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> >  int __vma_check_mmap_hook(struct vm_area_struct *vma);
> >
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 50685cf29792..7538d64f8848 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -891,8 +891,8 @@ static __always_inline bool vma_flags_empty(vma_flags_t *flags)
> >   */
> >  struct vm_area_desc {
> >         /* Immutable state. */
> > -       const struct mm_struct *const mm;
> > -       struct file *const file; /* May vary from vm_file in stacked callers. */
> > +       struct mm_struct *mm;
> > +       struct file *file; /* May vary from vm_file in stacked callers. */
> >         unsigned long start;
> >         unsigned long end;
> >
> > diff --git a/mm/util.c b/mm/util.c
> > index aa92e471afe1..a166c48fe894 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -1163,34 +1163,38 @@ void flush_dcache_folio(struct folio *folio)
> >  EXPORT_SYMBOL(flush_dcache_folio);
> >  #endif
> >
> > -static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > +/**
> > + * compat_set_desc_from_vma() - assigns VMA descriptor @desc fields from a VMA.
> > + * @desc: A VMA descriptor whose fields need to be set.
> > + * @file: The file object describing the file being mmap()'d.
> > + * @vma: The VMA whose fields we wish to assign to @desc.
> > + *
> > + * This is a compatibility function to allow an mmap() hook to call
> > + * mmap_prepare() hooks when drivers nest these. This function specifically
> > + * allows the construction of a vm_area_desc value, @desc, from a VMA @vma for
> > + * the purposes of doing this.
> > + *
> > + * Once the conversion of drivers is complete this function will no longer be
> > + * required and will be removed.
> > + */
> > +void compat_set_desc_from_vma(struct vm_area_desc *desc,
> > +                             const struct file *file,
> > +                             const struct vm_area_struct *vma)
> >  {
> > -       struct vm_area_desc desc = {
> > -               .mm = vma->vm_mm,
> > -               .file = file,
> > -               .start = vma->vm_start,
> > -               .end = vma->vm_end,
> > -
> > -               .pgoff = vma->vm_pgoff,
> > -               .vm_file = vma->vm_file,
> > -               .vma_flags = vma->flags,
> > -               .page_prot = vma->vm_page_prot,
> > -
> > -               .action.type = MMAP_NOTHING, /* Default */
> > -       };
> > -       int err;
> > +       desc->mm = vma->vm_mm;
> > +       desc->file = (struct file *)file;
> > +       desc->start = vma->vm_start;
> > +       desc->end = vma->vm_end;
> >
> > -       err = vfs_mmap_prepare(file, &desc);
> > -       if (err)
> > -               return err;
> > +       desc->pgoff = vma->vm_pgoff;
> > +       desc->vm_file = vma->vm_file;
> > +       desc->vma_flags = vma->flags;
> > +       desc->page_prot = vma->vm_page_prot;
> >
> > -       err = mmap_action_prepare(&desc);
> > -       if (err)
> > -               return err;
> > -
> > -       set_vma_from_desc(vma, &desc);
> > -       return mmap_action_complete(vma, &desc.action);
> > +       /* Default. */
> > +       desc->action.type = MMAP_NOTHING;
> >  }
> > +EXPORT_SYMBOL(compat_set_desc_from_vma);
> >
> >  static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> >  {
> > @@ -1211,6 +1215,49 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> >         return err;
> >  }
> >
> > +/**
> > + * __compat_vma_mmap() - Similar to compat_vma_mmap(), only it allows
> > + * flexibility as to how the mmap_prepare callback is invoked, which is useful
> > + * for drivers which invoke nested mmap_prepare callbacks in an mmap() hook.
> > + * @desc: A VMA descriptor upon which an mmap_prepare() hook has already been
> > + * executed.
> > + * @vma: The VMA to which @desc should be applied.
> > + *
> > + * The function assumes that you have obtained a VMA descriptor @desc from
> > + * compt_set_desc_from_vma(), and already executed the mmap_prepare() hook upon
> > + * it.
> > + *
> > + * It then performs any specified mmap actions, and invokes the vm_ops->mapped()
> > + * hook if one is present.
> > + *
> > + * See the description of compat_vma_mmap() for more details.
> > + *
> > + * Once the conversion of drivers is complete this function will no longer be
> > + * required and will be removed.
> > + *
> > + * Returns: 0 on success or error.
> > + */
> > +int __compat_vma_mmap(struct vm_area_desc *desc,
> > +                     struct vm_area_struct *vma)
> > +{
> > +       int err;
> > +
> > +       /* Perform any preparatory tasks for mmap action. */
> > +       err = mmap_action_prepare(desc);
> > +       if (err)
> > +               return err;
> > +       /* Update the VMA from the descriptor. */
> > +       compat_set_vma_from_desc(vma, desc);
> > +       /* Complete any specified mmap actions. */
> > +       err = mmap_action_complete(vma, &desc->action);
> > +       if (err)
> > +               return err;
> > +
> > +       /* Invoke vm_ops->mapped callback. */
> > +       return __compat_vma_mapped(desc->file, vma);
> > +}
> > +EXPORT_SYMBOL(__compat_vma_mmap);
> > +
> >  /**
> >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> >   * existing VMA and execute any requested actions.
> > @@ -1218,10 +1265,10 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> >   * @vma: The VMA to apply the .mmap_prepare() hook to.
> >   *
> >   * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
> > - * stacked filesystems invoke a nested mmap hook of an underlying file.
> > + * stacked drivers invoke a nested mmap hook of an underlying file.
> >   *
> > - * Until all filesystems are converted to use .mmap_prepare(), we must be
> > - * conservative and continue to invoke these stacked filesystems using the
> > + * Until all drivers are converted to use .mmap_prepare(), we must be
> > + * conservative and continue to invoke these stacked drivers using the
> >   * deprecated .mmap() hook.
> >   *
> >   * However we have a problem if the underlying file system possesses an
> > @@ -1232,20 +1279,22 @@ static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> >   * establishes a struct vm_area_desc descriptor, passes to the underlying
> >   * .mmap_prepare() hook and applies any changes performed by it.
> >   *
> > - * Once the conversion of filesystems is complete this function will no longer
> > - * be required and will be removed.
> > + * Once the conversion of drivers is complete this function will no longer be
> > + * required and will be removed.
> >   *
> >   * Returns: 0 on success or error.
> >   */
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> >  {
> > +       struct vm_area_desc desc;
> >         int err;
> >
> > -       err = __compat_vma_mmap(file, vma);
> > +       compat_set_desc_from_vma(&desc, file, vma);
> > +       err = vfs_mmap_prepare(file, &desc);
> >         if (err)
> >                 return err;
> >
> > -       return __compat_vma_mapped(file, vma);
> > +       return __compat_vma_mmap(&desc, vma);
> >  }
> >  EXPORT_SYMBOL(compat_vma_mmap);
> >
> > diff --git a/mm/vma.h b/mm/vma.h
> > index adc18f7dd9f1..a76046c39b14 100644
> > --- a/mm/vma.h
> > +++ b/mm/vma.h
> > @@ -300,7 +300,7 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
> >   * f_op->mmap() but which might have an underlying file system which implements
> >   * f_op->mmap_prepare().
> >   */
> > -static inline void set_vma_from_desc(struct vm_area_struct *vma,
> > +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
> >                 struct vm_area_desc *desc)
> >  {
> >         /*
> > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > index 114daaef4f73..6658df26698a 100644
> > --- a/tools/testing/vma/include/dup.h
> > +++ b/tools/testing/vma/include/dup.h
> > @@ -519,8 +519,8 @@ enum vma_operation {
> >   */
> >  struct vm_area_desc {
> >         /* Immutable state. */
> > -       const struct mm_struct *const mm;
> > -       struct file *const file; /* May vary from vm_file in stacked callers. */
> > +       struct mm_struct *mm;
> > +       struct file *file; /* May vary from vm_file in stacked callers. */
> >         unsigned long start;
> >         unsigned long end;
> >
> > @@ -1272,43 +1272,92 @@ static inline void vma_set_anonymous(struct vm_area_struct *vma)
> >  }
> >
> >  /* Declared in vma.h. */
> > -static inline void set_vma_from_desc(struct vm_area_struct *vma,
> > +static inline void compat_set_vma_from_desc(struct vm_area_struct *vma,
> >                 struct vm_area_desc *desc);
> >
> > -static inline int __compat_vma_mmap(const struct file_operations *f_op,
> > -               struct file *file, struct vm_area_struct *vma)
> > +static inline void compat_set_desc_from_vma(struct vm_area_desc *desc,
> > +                             const struct file *file,
> > +                             const struct vm_area_struct *vma)
> >  {
> > -       struct vm_area_desc desc = {
> > -               .mm = vma->vm_mm,
> > -               .file = file,
> > -               .start = vma->vm_start,
> > -               .end = vma->vm_end,
> > +       desc->mm = vma->vm_mm;
> > +       desc->file = (struct file *)file;
> > +       desc->start = vma->vm_start;
> > +       desc->end = vma->vm_end;
> >
> > -               .pgoff = vma->vm_pgoff,
> > -               .vm_file = vma->vm_file,
> > -               .vma_flags = vma->flags,
> > -               .page_prot = vma->vm_page_prot,
> > +       desc->pgoff = vma->vm_pgoff;
> > +       desc->vm_file = vma->vm_file;
> > +       desc->vma_flags = vma->flags;
> > +       desc->page_prot = vma->vm_page_prot;
> >
> > -               .action.type = MMAP_NOTHING, /* Default */
> > -       };
> > +       /* Default. */
> > +       desc->action.type = MMAP_NOTHING;
> > +}
> > +
> > +static inline unsigned long vma_pages(const struct vm_area_struct *vma)
> > +{
> > +       return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> > +}
> > +
> > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > +{
> > +       const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > +
> > +       mmap_assert_write_locked(vma->vm_mm);
> > +       do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > +}
> > +
> > +static inline int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > +{
> > +       const struct vm_operations_struct *vm_ops = vma->vm_ops;
> >         int err;
> >
> > -       err = f_op->mmap_prepare(&desc);
> > +       if (!vm_ops->mapped)
> > +               return 0;
> > +
> > +       err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > +                            &vma->vm_private_data);
> >         if (err)
> > -               return err;
> > +               unmap_vma_locked(vma);
> > +       return err;
> > +}
> >
> > -       err = mmap_action_prepare(&desc);
> > +static inline int __compat_vma_mmap(struct vm_area_desc *desc,
> > +               struct vm_area_struct *vma)
> > +{
> > +       int err;
> > +
> > +       /* Perform any preparatory tasks for mmap action. */
> > +       err = mmap_action_prepare(desc);
> > +       if (err)
> > +               return err;
> > +       /* Update the VMA from the descriptor. */
> > +       compat_set_vma_from_desc(vma, desc);
> > +       /* Complete any specified mmap actions. */
> > +       err = mmap_action_complete(vma, &desc->action);
> >         if (err)
> >                 return err;
> >
> > -       set_vma_from_desc(vma, &desc);
> > -       return mmap_action_complete(vma, &desc.action);
> > +       /* Invoke vm_ops->mapped callback. */
> > +       return __compat_vma_mapped(desc->file, vma);
> > +}
> > +
> > +static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > +{
> > +       return file->f_op->mmap_prepare(desc);
> >  }
> >
> >  static inline int compat_vma_mmap(struct file *file,
> >                 struct vm_area_struct *vma)
> >  {
> > -       return __compat_vma_mmap(file->f_op, file, vma);
> > +       struct vm_area_desc desc;
> > +       int err;
> > +
> > +       compat_set_desc_from_vma(&desc, file, vma);
> > +       err = vfs_mmap_prepare(file, &desc);
> > +       if (err)
> > +               return err;
> > +
> > +       return __compat_vma_mmap(&desc, vma);
> >  }
> >
> >
> > @@ -1318,11 +1367,6 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
> >         mas_init(&vmi->mas, &mm->mm_mt, addr);
> >  }
> >
> > -static inline unsigned long vma_pages(struct vm_area_struct *vma)
> > -{
> > -       return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
> > -}
> > -
> >  static inline void mmap_assert_locked(struct mm_struct *);
> >  static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm,
> >                                                 unsigned long start_addr,
> > @@ -1492,11 +1536,6 @@ static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> >         return file->f_op->mmap(file, vma);
> >  }
> >
> > -static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > -{
> > -       return file->f_op->mmap_prepare(desc);
> > -}
> > -
> >  static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> >  {
> >         /* Changing an anonymous vma with this is illegal */
> > @@ -1521,11 +1560,3 @@ static inline pgprot_t vma_get_page_prot(vma_flags_t vma_flags)
> >
> >         return vm_get_page_prot(vm_flags);
> >  }
> > -
> > -static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > -{
> > -       const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > -
> > -       mmap_assert_write_locked(vma->vm_mm);
> > -       do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > -}
> > --
> > 2.53.0
> >