[PATCH 04/15] mm: add vm_ops->mapped hook

Lorenzo Stoakes (Oracle) posted 15 patches 3 weeks, 4 days ago
There is a newer version of this series
[PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Lorenzo Stoakes (Oracle) 3 weeks, 4 days ago
Previously, when a driver needed to do something like establish a reference
count, it could do so in the mmap hook in the knowledge that the mapping
would succeed.

With the introduction of f_op->mmap_prepare this is no longer the case, as
it is invoked prior to actually establishing the mapping.

To take this into account, introduce a new vm_ops->mapped callback which is
invoked when the VMA is first mapped (though notably - not when it is
merged - which is correct and mirrors existing mmap/open/close behaviour).

We do better than vm_ops->open() here, as this callback can return an
error, at which point the VMA will be unmapped.

Note that vm_ops->mapped() is invoked after any mmap action is
complete (such as I/O remapping).

We intentionally do not expose the VMA at this point, exposing only the
fields that could be used, and an output parameter in case the operation
needs to update the vma->vm_private_data field.

In order to deal with stacked filesystems which invoke inner filesystem's
mmap() invocations, add __compat_vma_mapped() and invoke it on
vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
callback.

We can now also remove call_action_complete() and invoke
mmap_action_complete() directly, as we separate out the rmap lock logic to
be called in __mmap_region() instead via maybe_drop_file_rmap_lock().

We also abstract unmapping of a VMA on mmap action completion into its own
helper function, unmap_vma_locked().

Additionally, update VMA userland test headers to reflect the change.

Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
---
 include/linux/fs.h              |  9 +++-
 include/linux/mm.h              | 17 +++++++
 mm/internal.h                   | 10 ++++
 mm/util.c                       | 86 ++++++++++++++++++++++++---------
 mm/vma.c                        | 41 +++++++++++-----
 tools/testing/vma/include/dup.h | 34 ++++++++++++-
 6 files changed, 158 insertions(+), 39 deletions(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index a2628a12bd2b..c390f5c667e3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
 }
 
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
+int __vma_check_mmap_hook(struct vm_area_struct *vma);
 
 static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	int err;
+
 	if (file->f_op->mmap_prepare)
 		return compat_vma_mmap(file, vma);
 
-	return file->f_op->mmap(file, vma);
+	err = file->f_op->mmap(file, vma);
+	if (err)
+		return err;
+
+	return __vma_check_mmap_hook(vma);
 }
 
 static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 12a0b4c63736..7333d5db1221 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -759,6 +759,23 @@ struct vm_operations_struct {
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
 	 */
 	void (*close)(struct vm_area_struct *vma);
+	/**
+	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
+	 * the new VMA is merged with an adjacent VMA.
+	 *
+	 * The @vm_private_data field is an output field allowing the user to
+	 * modify vma->vm_private_data as necessary.
+	 *
+	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
+	 * set from f_op->mmap.
+	 *
+	 * Returns %0 on success, or an error otherwise. On error, the VMA will
+	 * be unmapped.
+	 *
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
+		      const struct file *file, void **vm_private_data);
 	/* Called any time before splitting to check if it's allowed */
 	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
 	int (*mremap)(struct vm_area_struct *vma);
diff --git a/mm/internal.h b/mm/internal.h
index 7bfa85b5e78b..f0f2cf1caa36 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
  * mmap hook and safely handle error conditions. On error, VMA hooks will be
  * mutated.
  *
+ * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
+ *
  * @file: File which backs the mapping.
  * @vma:  VMA which we are mapping.
  *
@@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
 /* unmap_vmas is in mm/memory.c */
 void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
 
+static inline void unmap_vma_locked(struct vm_area_struct *vma)
+{
+	const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+	mmap_assert_locked(vma->vm_mm);
+	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
+}
+
 #ifdef CONFIG_MMU
 
 static inline void get_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/util.c b/mm/util.c
index dba1191725b6..2b0ed54008d6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
 EXPORT_SYMBOL(flush_dcache_folio);
 #endif
 
+static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct vm_area_desc desc = {
+		.mm = vma->vm_mm,
+		.file = file,
+		.start = vma->vm_start,
+		.end = vma->vm_end,
+
+		.pgoff = vma->vm_pgoff,
+		.vm_file = vma->vm_file,
+		.vma_flags = vma->flags,
+		.page_prot = vma->vm_page_prot,
+
+		.action.type = MMAP_NOTHING, /* Default */
+	};
+	int err;
+
+	err = vfs_mmap_prepare(file, &desc);
+	if (err)
+		return err;
+
+	err = mmap_action_prepare(&desc, &desc.action);
+	if (err)
+		return err;
+
+	set_vma_from_desc(vma, &desc);
+	return mmap_action_complete(vma, &desc.action);
+}
+
+static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
+{
+	const struct vm_operations_struct *vm_ops = vma->vm_ops;
+	void *vm_private_data = vma->vm_private_data;
+	int err;
+
+	if (!vm_ops || !vm_ops->mapped)
+		return 0;
+
+	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
+			     &vm_private_data);
+	if (err)
+		unmap_vma_locked(vma); /* VMA may now be freed. */
+	/* Update private data if changed. */
+	else if (vm_private_data != vma->vm_private_data)
+		vma->vm_private_data = vm_private_data;
+
+	return err;
+}
+
 /**
  * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
  * existing VMA and execute any requested actions.
@@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
  */
 int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	struct vm_area_desc desc = {
-		.mm = vma->vm_mm,
-		.file = file,
-		.start = vma->vm_start,
-		.end = vma->vm_end,
-
-		.pgoff = vma->vm_pgoff,
-		.vm_file = vma->vm_file,
-		.vma_flags = vma->flags,
-		.page_prot = vma->vm_page_prot,
-
-		.action.type = MMAP_NOTHING, /* Default */
-	};
 	int err;
 
-	err = vfs_mmap_prepare(file, &desc);
-	if (err)
-		return err;
-
-	err = mmap_action_prepare(&desc, &desc.action);
+	err = __compat_vma_mmap(file, vma);
 	if (err)
 		return err;
 
-	set_vma_from_desc(vma, &desc);
-	return mmap_action_complete(vma, &desc.action);
+	return __compat_vma_mapped(file, vma);
 }
 EXPORT_SYMBOL(compat_vma_mmap);
 
+int __vma_check_mmap_hook(struct vm_area_struct *vma)
+{
+	/* vm_ops->mapped is not valid if mmap() is specified. */
+	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
+		return -EINVAL;
+
+	return 0;
+}
+EXPORT_SYMBOL(__vma_check_mmap_hook);
+
 static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
 			 const struct page *page)
 {
@@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
 	 * invoked if we do NOT merge, so we only clean up the VMA we created.
 	 */
 	if (err) {
-		const size_t len = vma_pages(vma) << PAGE_SHIFT;
-
-		do_munmap(current->mm, vma->vm_start, len, NULL);
-
+		unmap_vma_locked(vma);
 		if (action->error_hook) {
 			/* We may want to filter the error. */
 			err = action->error_hook(err);
diff --git a/mm/vma.c b/mm/vma.c
index 054cf1d262fb..ef9f5a5365d1 100644
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
 	return false;
 }
 
-static int call_action_complete(struct mmap_state *map,
-				struct mmap_action *action,
-				struct vm_area_struct *vma)
+static int call_mapped_hook(struct vm_area_struct *vma)
 {
-	int ret;
+	const struct vm_operations_struct *vm_ops = vma->vm_ops;
+	void *vm_private_data = vma->vm_private_data;
+	int err;
 
-	ret = mmap_action_complete(vma, action);
+	if (!vm_ops || !vm_ops->mapped)
+		return 0;
+	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
+			     vma->vm_file, &vm_private_data);
+	if (err) {
+		unmap_vma_locked(vma);
+		return err;
+	}
+	/* Update private data if changed. */
+	if (vm_private_data != vma->vm_private_data)
+		vma->vm_private_data = vm_private_data;
+	return 0;
+}
 
-	/* If we held the file rmap we need to release it. */
-	if (map->hold_file_rmap_lock) {
-		struct file *file = vma->vm_file;
+static void maybe_drop_file_rmap_lock(struct mmap_state *map,
+				      struct vm_area_struct *vma)
+{
+	struct file *file;
 
-		i_mmap_unlock_write(file->f_mapping);
-	}
-	return ret;
+	if (!map->hold_file_rmap_lock)
+		return;
+	file = vma->vm_file;
+	i_mmap_unlock_write(file->f_mapping);
 }
 
 static unsigned long __mmap_region(struct file *file, unsigned long addr,
@@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
 	__mmap_complete(&map, vma);
 
 	if (have_mmap_prepare && allocated_new) {
-		error = call_action_complete(&map, &desc.action, vma);
+		error = mmap_action_complete(vma, &desc.action);
+		if (!error)
+			error = call_mapped_hook(vma);
 
+		maybe_drop_file_rmap_lock(&map, vma);
 		if (error)
 			return error;
 	}
diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
index 908beb263307..47d8db809f31 100644
--- a/tools/testing/vma/include/dup.h
+++ b/tools/testing/vma/include/dup.h
@@ -606,12 +606,34 @@ struct vm_area_struct {
 } __randomize_layout;
 
 struct vm_operations_struct {
-	void (*open)(struct vm_area_struct * area);
+	/**
+	 * @open: Called when a VMA is remapped or split. Not called upon first
+	 * mapping a VMA.
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	void (*open)(struct vm_area_struct *vma);
 	/**
 	 * @close: Called when the VMA is being removed from the MM.
 	 * Context: User context.  May sleep.  Caller holds mmap_lock.
 	 */
-	void (*close)(struct vm_area_struct * area);
+	void (*close)(struct vm_area_struct *vma);
+	/**
+	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
+	 * the new VMA is merged with an adjacent VMA.
+	 *
+	 * The @vm_private_data field is an output field allowing the user to
+	 * modify vma->vm_private_data as necessary.
+	 *
+	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
+	 * set from f_op->mmap.
+	 *
+	 * Returns %0 on success, or an error otherwise. On error, the VMA will
+	 * be unmapped.
+	 *
+	 * Context: User context.  May sleep.  Caller holds mmap_lock.
+	 */
+	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
+		      const struct file *file, void **vm_private_data);
 	/* Called any time before splitting to check if it's allowed */
 	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
 	int (*mremap)(struct vm_area_struct *area);
@@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
 	swap(vma->vm_file, file);
 	fput(file);
 }
+
+static inline void unmap_vma_locked(struct vm_area_struct *vma)
+{
+	const size_t len = vma_pages(vma) << PAGE_SHIFT;
+
+	mmap_assert_locked(vma->vm_mm);
+	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
+}
-- 
2.53.0
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Usama Arif 3 weeks, 4 days ago
On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:

> Previously, when a driver needed to do something like establish a reference
> count, it could do so in the mmap hook in the knowledge that the mapping
> would succeed.
> 
> With the introduction of f_op->mmap_prepare this is no longer the case, as
> it is invoked prior to actually establishing the mapping.
> 
> To take this into account, introduce a new vm_ops->mapped callback which is
> invoked when the VMA is first mapped (though notably - not when it is
> merged - which is correct and mirrors existing mmap/open/close behaviour).
> 
> We do better that vm_ops->open() here, as this callback can return an
> error, at which point the VMA will be unmapped.
> 
> Note that vm_ops->mapped() is invoked after any mmap action is
> complete (such as I/O remapping).
> 
> We intentionally do not expose the VMA at this point, exposing only the
> fields that could be used, and an output parameter in case the operation
> needs to update the vma->vm_private_data field.
> 
> In order to deal with stacked filesystems which invoke inner filesystem's
> mmap() invocations, add __compat_vma_mapped() and invoke it on
> vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> callback.
> 
> We can now also remove call_action_complete() and invoke
> mmap_action_complete() directly, as we separate out the rmap lock logic to
> be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> 
> We also abstract unmapping of a VMA on mmap action completion into its own
> helper function, unmap_vma_locked().
> 
> Additionally, update VMA userland test headers to reflect the change.
> 
> Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> ---
>  include/linux/fs.h              |  9 +++-
>  include/linux/mm.h              | 17 +++++++
>  mm/internal.h                   | 10 ++++
>  mm/util.c                       | 86 ++++++++++++++++++++++++---------
>  mm/vma.c                        | 41 +++++++++++-----
>  tools/testing/vma/include/dup.h | 34 ++++++++++++-
>  6 files changed, 158 insertions(+), 39 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index a2628a12bd2b..c390f5c667e3 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
>  }
>  
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> +int __vma_check_mmap_hook(struct vm_area_struct *vma);
>  
>  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> +	int err;
> +
>  	if (file->f_op->mmap_prepare)
>  		return compat_vma_mmap(file, vma);
>  
> -	return file->f_op->mmap(file, vma);
> +	err = file->f_op->mmap(file, vma);
> +	if (err)
> +		return err;
> +
> +	return __vma_check_mmap_hook(vma);
>  }
>  
>  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 12a0b4c63736..7333d5db1221 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -759,6 +759,23 @@ struct vm_operations_struct {
>  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
>  	 */
>  	void (*close)(struct vm_area_struct *vma);
> +	/**
> +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> +	 * the new VMA is merged with an adjacent VMA.
> +	 *
> +	 * The @vm_private_data field is an output field allowing the user to
> +	 * modify vma->vm_private_data as necessary.
> +	 *
> +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> +	 * set from f_op->mmap.
> +	 *
> +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> +	 * be unmapped.
> +	 *
> +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> +	 */
> +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data);
>  	/* Called any time before splitting to check if it's allowed */
>  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
>  	int (*mremap)(struct vm_area_struct *vma);
> diff --git a/mm/internal.h b/mm/internal.h
> index 7bfa85b5e78b..f0f2cf1caa36 100644
> --- a/mm/internal.h
> +++ b/mm/internal.h
> @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
>   * mmap hook and safely handle error conditions. On error, VMA hooks will be
>   * mutated.
>   *
> + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> + *
>   * @file: File which backs the mapping.
>   * @vma:  VMA which we are mapping.
>   *
> @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
>  /* unmap_vmas is in mm/memory.c */
>  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
>  
> +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> +{
> +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> +
> +	mmap_assert_locked(vma->vm_mm);
> +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> +}
> +
>  #ifdef CONFIG_MMU
>  
>  static inline void get_anon_vma(struct anon_vma *anon_vma)
> diff --git a/mm/util.c b/mm/util.c
> index dba1191725b6..2b0ed54008d6 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
>  EXPORT_SYMBOL(flush_dcache_folio);
>  #endif
>  
> +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> +{
> +	struct vm_area_desc desc = {
> +		.mm = vma->vm_mm,
> +		.file = file,
> +		.start = vma->vm_start,
> +		.end = vma->vm_end,
> +
> +		.pgoff = vma->vm_pgoff,
> +		.vm_file = vma->vm_file,
> +		.vma_flags = vma->flags,
> +		.page_prot = vma->vm_page_prot,
> +
> +		.action.type = MMAP_NOTHING, /* Default */
> +	};
> +	int err;
> +
> +	err = vfs_mmap_prepare(file, &desc);
> +	if (err)
> +		return err;
> +
> +	err = mmap_action_prepare(&desc, &desc.action);
> +	if (err)
> +		return err;
> +
> +	set_vma_from_desc(vma, &desc);
> +	return mmap_action_complete(vma, &desc.action);
> +}
> +
> +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> +{
> +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> +	void *vm_private_data = vma->vm_private_data;
> +	int err;
> +
> +	if (!vm_ops->mapped)
> +		return 0;
> +

Hello!

Can vm_ops be NULL here?  __compat_vma_mapped() is called from
compat_vma_mmap(), which is reached when a filesystem provides
mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
vma->vm_ops will be NULL and this dereferences a NULL pointer.

For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
a NULL pointer dereference here.

Would need to do
	if (!vm_ops || !vm_ops->mapped)
		return 0;

here


> +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> +			     &vm_private_data);
> +	if (err)
> +		unmap_vma_locked(vma);

when mapped() returns an error, unmap_vma_locked(vma) is called
but execution continues into the vm_private_data update below.  After
unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
entirely), so accessing vma->vm_private_data after that is a
use-after-free.

Probably need to do:
	if (err) {
		unmap_vma_locked(vma);
		return err;
	}

> +	/* Update private data if changed. */
> +	if (vm_private_data != vma->vm_private_data)
> +		vma->vm_private_data = vm_private_data;
> +
> +	return err;
> +}
> +
>  /**
>   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
>   * existing VMA and execute any requested actions.
> @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
>   */
>  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
>  {
> -	struct vm_area_desc desc = {
> -		.mm = vma->vm_mm,
> -		.file = file,
> -		.start = vma->vm_start,
> -		.end = vma->vm_end,
> -
> -		.pgoff = vma->vm_pgoff,
> -		.vm_file = vma->vm_file,
> -		.vma_flags = vma->flags,
> -		.page_prot = vma->vm_page_prot,
> -
> -		.action.type = MMAP_NOTHING, /* Default */
> -	};
>  	int err;
>  
> -	err = vfs_mmap_prepare(file, &desc);
> -	if (err)
> -		return err;
> -
> -	err = mmap_action_prepare(&desc, &desc.action);
> +	err = __compat_vma_mmap(file, vma);
>  	if (err)
>  		return err;
>  
> -	set_vma_from_desc(vma, &desc);
> -	return mmap_action_complete(vma, &desc.action);
> +	return __compat_vma_mapped(file, vma);
>  }
>  EXPORT_SYMBOL(compat_vma_mmap);
>  
> +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> +{
> +	/* vm_ops->mapped is not valid if mmap() is specified. */
> +	if (WARN_ON_ONCE(vma->vm_ops->mapped))
> +		return -EINVAL;

I think vma->vm_ops can be NULL here. Should be:

	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
		return -EINVAL;

> +
> +	return 0;
> +}
> +EXPORT_SYMBOL(__vma_check_mmap_hook);
> +
>  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
>  			 const struct page *page)
>  {
> @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
>  	 * invoked if we do NOT merge, so we only clean up the VMA we created.
>  	 */
>  	if (err) {
> -		const size_t len = vma_pages(vma) << PAGE_SHIFT;
> -
> -		do_munmap(current->mm, vma->vm_start, len, NULL);
> -
> +		unmap_vma_locked(vma);
>  		if (action->error_hook) {
>  			/* We may want to filter the error. */
>  			err = action->error_hook(err);
> diff --git a/mm/vma.c b/mm/vma.c
> index 054cf1d262fb..ef9f5a5365d1 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
>  	return false;
>  }
>  
> -static int call_action_complete(struct mmap_state *map,
> -				struct mmap_action *action,
> -				struct vm_area_struct *vma)
> +static int call_mapped_hook(struct vm_area_struct *vma)
>  {
> -	int ret;
> +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> +	void *vm_private_data = vma->vm_private_data;
> +	int err;
>  
> -	ret = mmap_action_complete(vma, action);
> +	if (!vm_ops || !vm_ops->mapped)
> +		return 0;
> +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> +			     vma->vm_file, &vm_private_data);
> +	if (err) {
> +		unmap_vma_locked(vma);
> +		return err;
> +	}
> +	/* Update private data if changed. */
> +	if (vm_private_data != vma->vm_private_data)
> +		vma->vm_private_data = vm_private_data;
> +	return 0;
> +}
>  
> -	/* If we held the file rmap we need to release it. */
> -	if (map->hold_file_rmap_lock) {
> -		struct file *file = vma->vm_file;
> +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> +				      struct vm_area_struct *vma)
> +{
> +	struct file *file;
>  
> -		i_mmap_unlock_write(file->f_mapping);
> -	}
> -	return ret;
> +	if (!map->hold_file_rmap_lock)
> +		return;
> +	file = vma->vm_file;
> +	i_mmap_unlock_write(file->f_mapping);
>  }
>  
>  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
>  	__mmap_complete(&map, vma);
>  
>  	if (have_mmap_prepare && allocated_new) {
> -		error = call_action_complete(&map, &desc.action, vma);
> +		error = mmap_action_complete(vma, &desc.action);
> +		if (!error)
> +			error = call_mapped_hook(vma);
>  
> +		maybe_drop_file_rmap_lock(&map, vma);
>  		if (error)
>  			return error;
>  	}
> diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> index 908beb263307..47d8db809f31 100644
> --- a/tools/testing/vma/include/dup.h
> +++ b/tools/testing/vma/include/dup.h
> @@ -606,12 +606,34 @@ struct vm_area_struct {
>  } __randomize_layout;
>  
>  struct vm_operations_struct {
> -	void (*open)(struct vm_area_struct * area);
> +	/**
> +	 * @open: Called when a VMA is remapped or split. Not called upon first
> +	 * mapping a VMA.
> +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> +	 */
> +	void (*open)(struct vm_area_struct *vma);
>  	/**
>  	 * @close: Called when the VMA is being removed from the MM.
>  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
>  	 */
> -	void (*close)(struct vm_area_struct * area);
> +	void (*close)(struct vm_area_struct *vma);
> +	/**
> +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> +	 * the new VMA is merged with an adjacent VMA.
> +	 *
> +	 * The @vm_private_data field is an output field allowing the user to
> +	 * modify vma->vm_private_data as necessary.
> +	 *
> +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> +	 * set from f_op->mmap.
> +	 *
> +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> +	 * be unmapped.
> +	 *
> +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> +	 */
> +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> +		      const struct file *file, void **vm_private_data);
>  	/* Called any time before splitting to check if it's allowed */
>  	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
>  	int (*mremap)(struct vm_area_struct *area);
> @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
>  	swap(vma->vm_file, file);
>  	fput(file);
>  }
> +
> +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> +{
> +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> +
> +	mmap_assert_locked(vma->vm_mm);
> +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> +}
> -- 
> 2.53.0
> 
>
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Lorenzo Stoakes (Oracle) 3 weeks, 4 days ago
On Fri, Mar 13, 2026 at 04:02:36AM -0700, Usama Arif wrote:
> On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
>
> > Previously, when a driver needed to do something like establish a reference
> > count, it could do so in the mmap hook in the knowledge that the mapping
> > would succeed.
> >
> > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > it is invoked prior to actually establishing the mapping.
> >
> > To take this into account, introduce a new vm_ops->mapped callback which is
> > invoked when the VMA is first mapped (though notably - not when it is
> > merged - which is correct and mirrors existing mmap/open/close behaviour).
> >
> > We do better that vm_ops->open() here, as this callback can return an
> > error, at which point the VMA will be unmapped.
> >
> > Note that vm_ops->mapped() is invoked after any mmap action is
> > complete (such as I/O remapping).
> >
> > We intentionally do not expose the VMA at this point, exposing only the
> > fields that could be used, and an output parameter in case the operation
> > needs to update the vma->vm_private_data field.
> >
> > In order to deal with stacked filesystems which invoke inner filesystem's
> > mmap() invocations, add __compat_vma_mapped() and invoke it on
> > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> > handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> > callback.
> >
> > We can now also remove call_action_complete() and invoke
> > mmap_action_complete() directly, as we separate out the rmap lock logic to
> > be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> >
> > We also abstract unmapping of a VMA on mmap action completion into its own
> > helper function, unmap_vma_locked().
> >
> > Additionally, update VMA userland test headers to reflect the change.
> >
> > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > ---
> >  include/linux/fs.h              |  9 +++-
> >  include/linux/mm.h              | 17 +++++++
> >  mm/internal.h                   | 10 ++++
> >  mm/util.c                       | 86 ++++++++++++++++++++++++---------
> >  mm/vma.c                        | 41 +++++++++++-----
> >  tools/testing/vma/include/dup.h | 34 ++++++++++++-
> >  6 files changed, 158 insertions(+), 39 deletions(-)
> >
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index a2628a12bd2b..c390f5c667e3 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> >  }
> >
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> >
> >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> >  {
> > +	int err;
> > +
> >  	if (file->f_op->mmap_prepare)
> >  		return compat_vma_mmap(file, vma);
> >
> > -	return file->f_op->mmap(file, vma);
> > +	err = file->f_op->mmap(file, vma);
> > +	if (err)
> > +		return err;
> > +
> > +	return __vma_check_mmap_hook(vma);
> >  }
> >
> >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > index 12a0b4c63736..7333d5db1221 100644
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -759,6 +759,23 @@ struct vm_operations_struct {
> >  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> >  	 */
> >  	void (*close)(struct vm_area_struct *vma);
> > +	/**
> > +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > +	 * the new VMA is merged with an adjacent VMA.
> > +	 *
> > +	 * The @vm_private_data field is an output field allowing the user to
> > +	 * modify vma->vm_private_data as necessary.
> > +	 *
> > +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > +	 * set from f_op->mmap.
> > +	 *
> > +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> > +	 * be unmapped.
> > +	 *
> > +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> > +	 */
> > +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > +		      const struct file *file, void **vm_private_data);
> >  	/* Called any time before splitting to check if it's allowed */
> >  	int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> >  	int (*mremap)(struct vm_area_struct *vma);
> > diff --git a/mm/internal.h b/mm/internal.h
> > index 7bfa85b5e78b..f0f2cf1caa36 100644
> > --- a/mm/internal.h
> > +++ b/mm/internal.h
> > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
> >   * mmap hook and safely handle error conditions. On error, VMA hooks will be
> >   * mutated.
> >   *
> > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> > + *
> >   * @file: File which backs the mapping.
> >   * @vma:  VMA which we are mapping.
> >   *
> > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> >  /* unmap_vmas is in mm/memory.c */
> >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> >
> > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > +{
> > +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > +
> > +	mmap_assert_locked(vma->vm_mm);
> > +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > +}
> > +
> >  #ifdef CONFIG_MMU
> >
> >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > diff --git a/mm/util.c b/mm/util.c
> > index dba1191725b6..2b0ed54008d6 100644
> > --- a/mm/util.c
> > +++ b/mm/util.c
> > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
> >  EXPORT_SYMBOL(flush_dcache_folio);
> >  #endif
> >
> > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	struct vm_area_desc desc = {
> > +		.mm = vma->vm_mm,
> > +		.file = file,
> > +		.start = vma->vm_start,
> > +		.end = vma->vm_end,
> > +
> > +		.pgoff = vma->vm_pgoff,
> > +		.vm_file = vma->vm_file,
> > +		.vma_flags = vma->flags,
> > +		.page_prot = vma->vm_page_prot,
> > +
> > +		.action.type = MMAP_NOTHING, /* Default */
> > +	};
> > +	int err;
> > +
> > +	err = vfs_mmap_prepare(file, &desc);
> > +	if (err)
> > +		return err;
> > +
> > +	err = mmap_action_prepare(&desc, &desc.action);
> > +	if (err)
> > +		return err;
> > +
> > +	set_vma_from_desc(vma, &desc);
> > +	return mmap_action_complete(vma, &desc.action);
> > +}
> > +
> > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > +{
> > +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > +	void *vm_private_data = vma->vm_private_data;
> > +	int err;
> > +
> > +	if (!vm_ops->mapped)
> > +		return 0;
> > +
>
> Hello!
>
> Can vm_ops be NULL here?  __compat_vma_mapped() is called from
> compat_vma_mmap(), which is reached when a filesystem provides
> mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
> vma->vm_ops will be NULL and this dereferences a NULL pointer.

I _think_ for this to ever be invoked, you would need to be dealing with a
file-backed VMA so vm_ops->fault would HAVE to be defined.

But you're right — as a matter of principle we should check it anyway! Will fix.

>
> For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
> a NULL pointer dereference here.
>
> Would need to do
> 	if (!vm_ops || !vm_ops->mapped)
> 		return 0;
>
> here

Yes.

>
>
> > +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > +			     &vm_private_data);
> > +	if (err)
> > +		unmap_vma_locked(vma);
>
> when mapped() returns an error, unmap_vma_locked(vma) is called
> but execution continues into the vm_private_data update below.  After
> unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
> entirely), so accessing vma->vm_private_data after that is a
> use-after-free.

Very good point :) will fix thanks!

Probably:

	if (err)
		unmap_vma_locked(vma);
	else if (vm_private_data != vma->vm_private_data)
		vma->vm_private_data = vm_private_data;

	return err;

Would be fine.

>
> Probably need to do:
> 	if (err) {
> 		unmap_vma_locked(vma);
> 		return err;
> 	}
>
> > +	/* Update private data if changed. */
> > +	if (vm_private_data != vma->vm_private_data)
> > +		vma->vm_private_data = vm_private_data;
> > +
> > +	return err;
> > +}
> > +
> >  /**
> >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> >   * existing VMA and execute any requested actions.
> > @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> >   */
> >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> >  {
> > -	struct vm_area_desc desc = {
> > -		.mm = vma->vm_mm,
> > -		.file = file,
> > -		.start = vma->vm_start,
> > -		.end = vma->vm_end,
> > -
> > -		.pgoff = vma->vm_pgoff,
> > -		.vm_file = vma->vm_file,
> > -		.vma_flags = vma->flags,
> > -		.page_prot = vma->vm_page_prot,
> > -
> > -		.action.type = MMAP_NOTHING, /* Default */
> > -	};
> >  	int err;
> >
> > -	err = vfs_mmap_prepare(file, &desc);
> > -	if (err)
> > -		return err;
> > -
> > -	err = mmap_action_prepare(&desc, &desc.action);
> > +	err = __compat_vma_mmap(file, vma);
> >  	if (err)
> >  		return err;
> >
> > -	set_vma_from_desc(vma, &desc);
> > -	return mmap_action_complete(vma, &desc.action);
> > +	return __compat_vma_mapped(file, vma);
> >  }
> >  EXPORT_SYMBOL(compat_vma_mmap);
> >
> > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > +{
> > +	/* vm_ops->mapped is not valid if mmap() is specified. */
> > +	if (WARN_ON_ONCE(vma->vm_ops->mapped))
> > +		return -EINVAL;
>
> I think vma->vm_ops can be NULL here. Should be:
>
> 	if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> 		return -EINVAL;

I think again you'd probably only invoke this on file-backed VMAs, so it'd be ok —
but again, as a matter of principle we should check it, so will fix, thanks!

>
> > +
> > +	return 0;
> > +}
> > +EXPORT_SYMBOL(__vma_check_mmap_hook);
> > +
> >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> >  			 const struct page *page)
> >  {
> > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
> >  	 * invoked if we do NOT merge, so we only clean up the VMA we created.
> >  	 */
> >  	if (err) {
> > -		const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > -
> > -		do_munmap(current->mm, vma->vm_start, len, NULL);
> > -
> > +		unmap_vma_locked(vma);
> >  		if (action->error_hook) {
> >  			/* We may want to filter the error. */
> >  			err = action->error_hook(err);
> > diff --git a/mm/vma.c b/mm/vma.c
> > index 054cf1d262fb..ef9f5a5365d1 100644
> > --- a/mm/vma.c
> > +++ b/mm/vma.c
> > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> >  	return false;
> >  }
> >
> > -static int call_action_complete(struct mmap_state *map,
> > -				struct mmap_action *action,
> > -				struct vm_area_struct *vma)
> > +static int call_mapped_hook(struct vm_area_struct *vma)
> >  {
> > -	int ret;
> > +	const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > +	void *vm_private_data = vma->vm_private_data;
> > +	int err;
> >
> > -	ret = mmap_action_complete(vma, action);
> > +	if (!vm_ops || !vm_ops->mapped)
> > +		return 0;
> > +	err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > +			     vma->vm_file, &vm_private_data);
> > +	if (err) {
> > +		unmap_vma_locked(vma);
> > +		return err;
> > +	}
> > +	/* Update private data if changed. */
> > +	if (vm_private_data != vma->vm_private_data)
> > +		vma->vm_private_data = vm_private_data;
> > +	return 0;
> > +}
> >
> > -	/* If we held the file rmap we need to release it. */
> > -	if (map->hold_file_rmap_lock) {
> > -		struct file *file = vma->vm_file;
> > +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> > +				      struct vm_area_struct *vma)
> > +{
> > +	struct file *file;
> >
> > -		i_mmap_unlock_write(file->f_mapping);
> > -	}
> > -	return ret;
> > +	if (!map->hold_file_rmap_lock)
> > +		return;
> > +	file = vma->vm_file;
> > +	i_mmap_unlock_write(file->f_mapping);
> >  }
> >
> >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> >  	__mmap_complete(&map, vma);
> >
> >  	if (have_mmap_prepare && allocated_new) {
> > -		error = call_action_complete(&map, &desc.action, vma);
> > +		error = mmap_action_complete(vma, &desc.action);
> > +		if (!error)
> > +			error = call_mapped_hook(vma);
> >
> > +		maybe_drop_file_rmap_lock(&map, vma);
> >  		if (error)
> >  			return error;
> >  	}
> > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > index 908beb263307..47d8db809f31 100644
> > --- a/tools/testing/vma/include/dup.h
> > +++ b/tools/testing/vma/include/dup.h
> > @@ -606,12 +606,34 @@ struct vm_area_struct {
> >  } __randomize_layout;
> >
> >  struct vm_operations_struct {
> > -	void (*open)(struct vm_area_struct * area);
> > +	/**
> > +	 * @open: Called when a VMA is remapped or split. Not called upon first
> > +	 * mapping a VMA.
> > +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> > +	 */
> > +	void (*open)(struct vm_area_struct *vma);
> >  	/**
> >  	 * @close: Called when the VMA is being removed from the MM.
> >  	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> >  	 */
> > -	void (*close)(struct vm_area_struct * area);
> > +	void (*close)(struct vm_area_struct *vma);
> > +	/**
> > +	 * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > +	 * the new VMA is merged with an adjacent VMA.
> > +	 *
> > +	 * The @vm_private_data field is an output field allowing the user to
> > +	 * modify vma->vm_private_data as necessary.
> > +	 *
> > +	 * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > +	 * set from f_op->mmap.
> > +	 *
> > +	 * Returns %0 on success, or an error otherwise. On error, the VMA will
> > +	 * be unmapped.
> > +	 *
> > +	 * Context: User context.  May sleep.  Caller holds mmap_lock.
> > +	 */
> > +	int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > +		      const struct file *file, void **vm_private_data);
> >  	/* Called any time before splitting to check if it's allowed */
> >  	int (*may_split)(struct vm_area_struct *area, unsigned long addr);
> >  	int (*mremap)(struct vm_area_struct *area);
> > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> >  	swap(vma->vm_file, file);
> >  	fput(file);
> >  }
> > +
> > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > +{
> > +	const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > +
> > +	mmap_assert_locked(vma->vm_mm);
> > +	do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > +}
> > --
> > 2.53.0
> >
> >

Cheers, Lorenzo
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Suren Baghdasaryan 3 weeks, 1 day ago
On Fri, Mar 13, 2026 at 4:58 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
>
> On Fri, Mar 13, 2026 at 04:02:36AM -0700, Usama Arif wrote:
> > On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
> >
> > > Previously, when a driver needed to do something like establish a reference
> > > count, it could do so in the mmap hook in the knowledge that the mapping
> > > would succeed.
> > >
> > > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > > it is invoked prior to actually establishing the mapping.
> > >
> > > To take this into account, introduce a new vm_ops->mapped callback which is
> > > invoked when the VMA is first mapped (though notably - not when it is
> > > merged - which is correct and mirrors existing mmap/open/close behaviour).
> > >
> > > We do better that vm_ops->open() here, as this callback can return an
> > > error, at which point the VMA will be unmapped.
> > >
> > > Note that vm_ops->mapped() is invoked after any mmap action is
> > > complete (such as I/O remapping).
> > >
> > > We intentionally do not expose the VMA at this point, exposing only the
> > > fields that could be used, and an output parameter in case the operation
> > > needs to update the vma->vm_private_data field.
> > >
> > > In order to deal with stacked filesystems which invoke inner filesystem's
> > > mmap() invocations, add __compat_vma_mapped() and invoke it on
> > > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> > > handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> > > callback.
> > >
> > > We can now also remove call_action_complete() and invoke
> > > mmap_action_complete() directly, as we separate out the rmap lock logic to
> > > be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> > >
> > > We also abstract unmapping of a VMA on mmap action completion into its own
> > > helper function, unmap_vma_locked().
> > >
> > > Additionally, update VMA userland test headers to reflect the change.
> > >
> > > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > > ---
> > >  include/linux/fs.h              |  9 +++-
> > >  include/linux/mm.h              | 17 +++++++
> > >  mm/internal.h                   | 10 ++++
> > >  mm/util.c                       | 86 ++++++++++++++++++++++++---------
> > >  mm/vma.c                        | 41 +++++++++++-----
> > >  tools/testing/vma/include/dup.h | 34 ++++++++++++-
> > >  6 files changed, 158 insertions(+), 39 deletions(-)
> > >
> > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > index a2628a12bd2b..c390f5c667e3 100644
> > > --- a/include/linux/fs.h
> > > +++ b/include/linux/fs.h
> > > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> > >  }
> > >
> > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> > >
> > >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> > >  {
> > > +   int err;
> > > +
> > >     if (file->f_op->mmap_prepare)
> > >             return compat_vma_mmap(file, vma);
> > >
> > > -   return file->f_op->mmap(file, vma);
> > > +   err = file->f_op->mmap(file, vma);
> > > +   if (err)
> > > +           return err;
> > > +
> > > +   return __vma_check_mmap_hook(vma);
> > >  }
> > >
> > >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > index 12a0b4c63736..7333d5db1221 100644
> > > --- a/include/linux/mm.h
> > > +++ b/include/linux/mm.h
> > > @@ -759,6 +759,23 @@ struct vm_operations_struct {
> > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > >      */
> > >     void (*close)(struct vm_area_struct *vma);
> > > +   /**
> > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > +    * the new VMA is merged with an adjacent VMA.
> > > +    *
> > > +    * The @vm_private_data field is an output field allowing the user to
> > > +    * modify vma->vm_private_data as necessary.
> > > +    *
> > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > +    * set from f_op->mmap.
> > > +    *
> > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > +    * be unmapped.
> > > +    *
> > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > +    */
> > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > +                 const struct file *file, void **vm_private_data);
> > >     /* Called any time before splitting to check if it's allowed */
> > >     int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> > >     int (*mremap)(struct vm_area_struct *vma);
> > > diff --git a/mm/internal.h b/mm/internal.h
> > > index 7bfa85b5e78b..f0f2cf1caa36 100644
> > > --- a/mm/internal.h
> > > +++ b/mm/internal.h
> > > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
> > >   * mmap hook and safely handle error conditions. On error, VMA hooks will be
> > >   * mutated.
> > >   *
> > > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> > > + *

What exactly would one do to "prefer f_op->mmap_prepare()"?
Since you are adding this comment for mmap_file(), I think you need to
describe more specifically what one should call instead.

> > >   * @file: File which backs the mapping.
> > >   * @vma:  VMA which we are mapping.
> > >   *
> > > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> > >  /* unmap_vmas is in mm/memory.c */
> > >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> > >
> > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > +{
> > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > +
> > > +   mmap_assert_locked(vma->vm_mm);

You must hold the mmap write lock when unmapping. Would be better to
assert mmap_assert_write_locked() or even vma_assert_write_locked(),
which implies mmap_assert_write_locked().

> > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > +}
> > > +
> > >  #ifdef CONFIG_MMU
> > >
> > >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > > diff --git a/mm/util.c b/mm/util.c
> > > index dba1191725b6..2b0ed54008d6 100644
> > > --- a/mm/util.c
> > > +++ b/mm/util.c
> > > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
> > >  EXPORT_SYMBOL(flush_dcache_folio);
> > >  #endif
> > >
> > > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > +{
> > > +   struct vm_area_desc desc = {
> > > +           .mm = vma->vm_mm,
> > > +           .file = file,
> > > +           .start = vma->vm_start,
> > > +           .end = vma->vm_end,
> > > +
> > > +           .pgoff = vma->vm_pgoff,
> > > +           .vm_file = vma->vm_file,
> > > +           .vma_flags = vma->flags,
> > > +           .page_prot = vma->vm_page_prot,
> > > +
> > > +           .action.type = MMAP_NOTHING, /* Default */
> > > +   };
> > > +   int err;
> > > +
> > > +   err = vfs_mmap_prepare(file, &desc);
> > > +   if (err)
> > > +           return err;
> > > +
> > > +   err = mmap_action_prepare(&desc, &desc.action);
> > > +   if (err)
> > > +           return err;
> > > +
> > > +   set_vma_from_desc(vma, &desc);
> > > +   return mmap_action_complete(vma, &desc.action);
> > > +}
> > > +
> > > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > > +{
> > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > +   void *vm_private_data = vma->vm_private_data;
> > > +   int err;
> > > +
> > > +   if (!vm_ops->mapped)
> > > +           return 0;
> > > +
> >
> > Hello!
> >
> > Can vm_ops be NULL here?  __compat_vma_mapped() is called from
> > compat_vma_mmap(), which is reached when a filesystem provides
> > mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
> > vma->vm_ops will be NULL and this dereferences a NULL pointer.
>
> I _think_ for this to ever be invoked, you would need to be dealing with a
> file-backed VMA so vm_ops->fault would HAVE to be defined.
>
> But you're right anyway as a matter of principle we should check it! Will fix.
>
> >
> > For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
> > a NULL pointer dereference here.
> >
> > Would need to do
> >       if (!vm_ops || !vm_ops->mapped)
> >               return 0;
> >
> > here
>
> Yes.
>
> >
> >
> > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > > +                        &vm_private_data);
> > > +   if (err)
> > > +           unmap_vma_locked(vma);
> >
> > when mapped() returns an error, unmap_vma_locked(vma) is called
> > but execution continues into the vm_private_data update below.  After
> > unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
> > entirely), so accessing vma->vm_private_data after that is a
> > use-after-free.
>
> Very good point :) will fix thanks!
>
> Probably:
>
>         if (err)
>                 unmap_vma_locked(vma);
>         else if (vm_private_data != vma->vm_private_data)
>                 vma->vm_private_data = vm_private_data;
>
>         return err;
>
> Would be fine.
>
> >
> > Probably need to do:
> >       if (err) {
> >               unmap_vma_locked(vma);
> >               return err;
> >       }
> >
> > > +   /* Update private data if changed. */
> > > +   if (vm_private_data != vma->vm_private_data)
> > > +           vma->vm_private_data = vm_private_data;
> > > +
> > > +   return err;
> > > +}
> > > +
> > >  /**
> > >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> > >   * existing VMA and execute any requested actions.
> > > @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> > >   */
> > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > >  {
> > > -   struct vm_area_desc desc = {
> > > -           .mm = vma->vm_mm,
> > > -           .file = file,
> > > -           .start = vma->vm_start,
> > > -           .end = vma->vm_end,
> > > -
> > > -           .pgoff = vma->vm_pgoff,
> > > -           .vm_file = vma->vm_file,
> > > -           .vma_flags = vma->flags,
> > > -           .page_prot = vma->vm_page_prot,
> > > -
> > > -           .action.type = MMAP_NOTHING, /* Default */
> > > -   };
> > >     int err;
> > >
> > > -   err = vfs_mmap_prepare(file, &desc);
> > > -   if (err)
> > > -           return err;
> > > -
> > > -   err = mmap_action_prepare(&desc, &desc.action);
> > > +   err = __compat_vma_mmap(file, vma);
> > >     if (err)
> > >             return err;
> > >
> > > -   set_vma_from_desc(vma, &desc);
> > > -   return mmap_action_complete(vma, &desc.action);
> > > +   return __compat_vma_mapped(file, vma);
> > >  }
> > >  EXPORT_SYMBOL(compat_vma_mmap);
> > >
> > > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > > +{
> > > +   /* vm_ops->mapped is not valid if mmap() is specified. */
> > > +   if (WARN_ON_ONCE(vma->vm_ops->mapped))
> > > +           return -EINVAL;
> >
> > I think vma->vm_ops can be NULL here. Should be:
> >
> >       if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> >               return -EINVAL;
>
> I think again you'd probably only invoke this on file-backed so be ok, but again
> as a matter of principle we should check it so will fix, thanks!
>
> >
> > > +
> > > +   return 0;
> > > +}
> > > +EXPORT_SYMBOL(__vma_check_mmap_hook);

nit: Any reason __vma_check_mmap_hook() is not inlined next to its
user vfs_mmap()?

> > > +
> > >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> > >                      const struct page *page)
> > >  {
> > > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
> > >      * invoked if we do NOT merge, so we only clean up the VMA we created.
> > >      */
> > >     if (err) {
> > > -           const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > -
> > > -           do_munmap(current->mm, vma->vm_start, len, NULL);
> > > -
> > > +           unmap_vma_locked(vma);
> > >             if (action->error_hook) {
> > >                     /* We may want to filter the error. */
> > >                     err = action->error_hook(err);
> > > diff --git a/mm/vma.c b/mm/vma.c
> > > index 054cf1d262fb..ef9f5a5365d1 100644
> > > --- a/mm/vma.c
> > > +++ b/mm/vma.c
> > > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> > >     return false;
> > >  }
> > >
> > > -static int call_action_complete(struct mmap_state *map,
> > > -                           struct mmap_action *action,
> > > -                           struct vm_area_struct *vma)
> > > +static int call_mapped_hook(struct vm_area_struct *vma)
> > >  {
> > > -   int ret;
> > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > +   void *vm_private_data = vma->vm_private_data;
> > > +   int err;
> > >
> > > -   ret = mmap_action_complete(vma, action);
> > > +   if (!vm_ops || !vm_ops->mapped)
> > > +           return 0;
> > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > > +                        vma->vm_file, &vm_private_data);
> > > +   if (err) {
> > > +           unmap_vma_locked(vma);
> > > +           return err;
> > > +   }
> > > +   /* Update private data if changed. */
> > > +   if (vm_private_data != vma->vm_private_data)
> > > +           vma->vm_private_data = vm_private_data;
> > > +   return 0;
> > > +}
> > >
> > > -   /* If we held the file rmap we need to release it. */
> > > -   if (map->hold_file_rmap_lock) {
> > > -           struct file *file = vma->vm_file;
> > > +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> > > +                                 struct vm_area_struct *vma)
> > > +{
> > > +   struct file *file;
> > >
> > > -           i_mmap_unlock_write(file->f_mapping);
> > > -   }
> > > -   return ret;
> > > +   if (!map->hold_file_rmap_lock)
> > > +           return;
> > > +   file = vma->vm_file;
> > > +   i_mmap_unlock_write(file->f_mapping);
> > >  }
> > >
> > >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > >     __mmap_complete(&map, vma);
> > >
> > >     if (have_mmap_prepare && allocated_new) {
> > > -           error = call_action_complete(&map, &desc.action, vma);
> > > +           error = mmap_action_complete(vma, &desc.action);
> > > +           if (!error)
> > > +                   error = call_mapped_hook(vma);
> > >
> > > +           maybe_drop_file_rmap_lock(&map, vma);
> > >             if (error)
> > >                     return error;
> > >     }
> > > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > > index 908beb263307..47d8db809f31 100644
> > > --- a/tools/testing/vma/include/dup.h
> > > +++ b/tools/testing/vma/include/dup.h
> > > @@ -606,12 +606,34 @@ struct vm_area_struct {
> > >  } __randomize_layout;
> > >
> > >  struct vm_operations_struct {
> > > -   void (*open)(struct vm_area_struct * area);
> > > +   /**
> > > +    * @open: Called when a VMA is remapped or split. Not called upon first
> > > +    * mapping a VMA.
> > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > +    */

This comment should have been introduced in the previous patch.

> > > +   void (*open)(struct vm_area_struct *vma);
> > >     /**
> > >      * @close: Called when the VMA is being removed from the MM.
> > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > >      */
> > > -   void (*close)(struct vm_area_struct * area);
> > > +   void (*close)(struct vm_area_struct *vma);
> > > +   /**
> > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > +    * the new VMA is merged with an adjacent VMA.
> > > +    *
> > > +    * The @vm_private_data field is an output field allowing the user to
> > > +    * modify vma->vm_private_data as necessary.
> > > +    *
> > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > +    * set from f_op->mmap.
> > > +    *
> > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > +    * be unmapped.
> > > +    *
> > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > +    */
> > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > +                 const struct file *file, void **vm_private_data);
> > >     /* Called any time before splitting to check if it's allowed */
> > >     int (*may_split)(struct vm_area_struct *area, unsigned long addr);
> > >     int (*mremap)(struct vm_area_struct *area);
> > > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> > >     swap(vma->vm_file, file);
> > >     fput(file);
> > >  }
> > > +
> > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > +{
> > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > +
> > > +   mmap_assert_locked(vma->vm_mm);
> > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > +}
> > > --
> > > 2.53.0
> > >
> > >
>
> Cheers, Lorenzo
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Lorenzo Stoakes (Oracle) 3 weeks, 1 day ago
On Sun, Mar 15, 2026 at 07:18:38PM -0700, Suren Baghdasaryan wrote:
> On Fri, Mar 13, 2026 at 4:58 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
> >
> > On Fri, Mar 13, 2026 at 04:02:36AM -0700, Usama Arif wrote:
> > > On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
> > >
> > > > Previously, when a driver needed to do something like establish a reference
> > > > count, it could do so in the mmap hook in the knowledge that the mapping
> > > > would succeed.
> > > >
> > > > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > > > it is invoked prior to actually establishing the mapping.
> > > >
> > > > To take this into account, introduce a new vm_ops->mapped callback which is
> > > > invoked when the VMA is first mapped (though notably - not when it is
> > > > merged - which is correct and mirrors existing mmap/open/close behaviour).
> > > >
> > > > We do better that vm_ops->open() here, as this callback can return an
> > > > error, at which point the VMA will be unmapped.
> > > >
> > > > Note that vm_ops->mapped() is invoked after any mmap action is
> > > > complete (such as I/O remapping).
> > > >
> > > > We intentionally do not expose the VMA at this point, exposing only the
> > > > fields that could be used, and an output parameter in case the operation
> > > > needs to update the vma->vm_private_data field.
> > > >
> > > > In order to deal with stacked filesystems which invoke inner filesystem's
> > > > mmap() invocations, add __compat_vma_mapped() and invoke it on
> > > > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> > > > handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> > > > callback.
> > > >
> > > > We can now also remove call_action_complete() and invoke
> > > > mmap_action_complete() directly, as we separate out the rmap lock logic to
> > > > be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> > > >
> > > > We also abstract unmapping of a VMA on mmap action completion into its own
> > > > helper function, unmap_vma_locked().
> > > >
> > > > Additionally, update VMA userland test headers to reflect the change.
> > > >
> > > > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > > > ---
> > > >  include/linux/fs.h              |  9 +++-
> > > >  include/linux/mm.h              | 17 +++++++
> > > >  mm/internal.h                   | 10 ++++
> > > >  mm/util.c                       | 86 ++++++++++++++++++++++++---------
> > > >  mm/vma.c                        | 41 +++++++++++-----
> > > >  tools/testing/vma/include/dup.h | 34 ++++++++++++-
> > > >  6 files changed, 158 insertions(+), 39 deletions(-)
> > > >
> > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > index a2628a12bd2b..c390f5c667e3 100644
> > > > --- a/include/linux/fs.h
> > > > +++ b/include/linux/fs.h
> > > > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> > > >  }
> > > >
> > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> > > >
> > > >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> > > >  {
> > > > +   int err;
> > > > +
> > > >     if (file->f_op->mmap_prepare)
> > > >             return compat_vma_mmap(file, vma);
> > > >
> > > > -   return file->f_op->mmap(file, vma);
> > > > +   err = file->f_op->mmap(file, vma);
> > > > +   if (err)
> > > > +           return err;
> > > > +
> > > > +   return __vma_check_mmap_hook(vma);
> > > >  }
> > > >
> > > >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > index 12a0b4c63736..7333d5db1221 100644
> > > > --- a/include/linux/mm.h
> > > > +++ b/include/linux/mm.h
> > > > @@ -759,6 +759,23 @@ struct vm_operations_struct {
> > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > >      */
> > > >     void (*close)(struct vm_area_struct *vma);
> > > > +   /**
> > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > +    * the new VMA is merged with an adjacent VMA.
> > > > +    *
> > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > +    * modify vma->vm_private_data as necessary.
> > > > +    *
> > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > +    * set from f_op->mmap.
> > > > +    *
> > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > +    * be unmapped.
> > > > +    *
> > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > +    */
> > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > +                 const struct file *file, void **vm_private_data);
> > > >     /* Called any time before splitting to check if it's allowed */
> > > >     int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> > > >     int (*mremap)(struct vm_area_struct *vma);
> > > > diff --git a/mm/internal.h b/mm/internal.h
> > > > index 7bfa85b5e78b..f0f2cf1caa36 100644
> > > > --- a/mm/internal.h
> > > > +++ b/mm/internal.h
> > > > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
> > > >   * mmap hook and safely handle error conditions. On error, VMA hooks will be
> > > >   * mutated.
> > > >   *
> > > > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> > > > + *
>
> What exactly would one do to "prefer f_op->mmap_prepare()"?

I'm saying a person should implement f_op->mmap_prepare() rather than
f_op->mmap(), since the latter is deprecated :)

I think that's pretty clear no?

> Since you are adding this comment for mmap_file(), I think you need to
> describe more specifically what one should call instead.

I think it'd be a complete distraction, since if you're at the point of calling
mmap_file() you're already not implementing mmap_prepare except as a compatibility
layer.

I mean maybe I'll just drop this as it seems to be causing confusion.

>
> > > >   * @file: File which backs the mapping.
> > > >   * @vma:  VMA which we are mapping.
> > > >   *
> > > > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> > > >  /* unmap_vmas is in mm/memory.c */
> > > >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> > > >
> > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > +{
> > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > +
> > > > +   mmap_assert_locked(vma->vm_mm);
>
> You must hold the mmap write lock when unmapping. Would be better to
> assert mmap_assert_write_locked() or even vma_assert_write_locked(),
> which implies mmap_assert_write_locked().

I'm not sure why we don't assert this in those paths.

I think I assumed we could only assert readonly because one of those paths
downgrades the mmap write lock to a read lock.

I don't think we can do a VMA write lock assert here, since at the point of
do_munmap() all callers can't possibly have the VMA write lock, since they are
_looking up_ the VMA at the specified address.

But I can convert this to an mmap_assert_write_locked()!

>
> > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > +}
> > > > +
> > > >  #ifdef CONFIG_MMU
> > > >
> > > >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > > > diff --git a/mm/util.c b/mm/util.c
> > > > index dba1191725b6..2b0ed54008d6 100644
> > > > --- a/mm/util.c
> > > > +++ b/mm/util.c
> > > > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
> > > >  EXPORT_SYMBOL(flush_dcache_folio);
> > > >  #endif
> > > >
> > > > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > > +{
> > > > +   struct vm_area_desc desc = {
> > > > +           .mm = vma->vm_mm,
> > > > +           .file = file,
> > > > +           .start = vma->vm_start,
> > > > +           .end = vma->vm_end,
> > > > +
> > > > +           .pgoff = vma->vm_pgoff,
> > > > +           .vm_file = vma->vm_file,
> > > > +           .vma_flags = vma->flags,
> > > > +           .page_prot = vma->vm_page_prot,
> > > > +
> > > > +           .action.type = MMAP_NOTHING, /* Default */
> > > > +   };
> > > > +   int err;
> > > > +
> > > > +   err = vfs_mmap_prepare(file, &desc);
> > > > +   if (err)
> > > > +           return err;
> > > > +
> > > > +   err = mmap_action_prepare(&desc, &desc.action);
> > > > +   if (err)
> > > > +           return err;
> > > > +
> > > > +   set_vma_from_desc(vma, &desc);
> > > > +   return mmap_action_complete(vma, &desc.action);
> > > > +}
> > > > +
> > > > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > > > +{
> > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > +   void *vm_private_data = vma->vm_private_data;
> > > > +   int err;
> > > > +
> > > > +   if (!vm_ops->mapped)
> > > > +           return 0;
> > > > +
> > >
> > > Hello!
> > >
> > > Can vm_ops be NULL here?  __compat_vma_mapped() is called from
> > > compat_vma_mmap(), which is reached when a filesystem provides
> > > mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
> > > vma->vm_ops will be NULL and this dereferences a NULL pointer.
> >
> > I _think_ for this to ever be invoked, you would need to be dealing with a
> > file-backed VMA so vm_ops->fault would HAVE to be defined.
> >
> > But you're right anyway as a matter of principle we should check it! Will fix.
> >
> > >
> > > For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
> > > a NULL pointer dereference here.
> > >
> > > Would need to do
> > >       if (!vm_ops || !vm_ops->mapped)
> > >               return 0;
> > >
> > > here
> >
> > Yes.
> >
> > >
> > >
> > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > > > +                        &vm_private_data);
> > > > +   if (err)
> > > > +           unmap_vma_locked(vma);
> > >
> > > when mapped() returns an error, unmap_vma_locked(vma) is called
> > > but execution continues into the vm_private_data update below.  After
> > > unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
> > > entirely), so accessing vma->vm_private_data after that is a
> > > use-after-free.
> >
> > Very good point :) will fix thanks!
> >
> > Probably:
> >
> >         if (err)
> >                 unmap_vma_locked(vma);
> >         else if (vm_private_data != vma->vm_private_data)
> >                 vma->vm_private_data = vm_private_data;
> >
> >         return err;
> >
> > Would be fine.
> >
> > >
> > > Probably need to do:
> > >       if (err) {
> > >               unmap_vma_locked(vma);
> > >               return err;
> > >       }
> > >
> > > > +   /* Update private data if changed. */
> > > > +   if (vm_private_data != vma->vm_private_data)
> > > > +           vma->vm_private_data = vm_private_data;
> > > > +
> > > > +   return err;
> > > > +}
> > > > +
> > > >  /**
> > > >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> > > >   * existing VMA and execute any requested actions.
> > > > @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> > > >   */
> > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > >  {
> > > > -   struct vm_area_desc desc = {
> > > > -           .mm = vma->vm_mm,
> > > > -           .file = file,
> > > > -           .start = vma->vm_start,
> > > > -           .end = vma->vm_end,
> > > > -
> > > > -           .pgoff = vma->vm_pgoff,
> > > > -           .vm_file = vma->vm_file,
> > > > -           .vma_flags = vma->flags,
> > > > -           .page_prot = vma->vm_page_prot,
> > > > -
> > > > -           .action.type = MMAP_NOTHING, /* Default */
> > > > -   };
> > > >     int err;
> > > >
> > > > -   err = vfs_mmap_prepare(file, &desc);
> > > > -   if (err)
> > > > -           return err;
> > > > -
> > > > -   err = mmap_action_prepare(&desc, &desc.action);
> > > > +   err = __compat_vma_mmap(file, vma);
> > > >     if (err)
> > > >             return err;
> > > >
> > > > -   set_vma_from_desc(vma, &desc);
> > > > -   return mmap_action_complete(vma, &desc.action);
> > > > +   return __compat_vma_mapped(file, vma);
> > > >  }
> > > >  EXPORT_SYMBOL(compat_vma_mmap);
> > > >
> > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > > > +{
> > > > +   /* vm_ops->mapped is not valid if mmap() is specified. */
> > > > +   if (WARN_ON_ONCE(vma->vm_ops->mapped))
> > > > +           return -EINVAL;
> > >
> > > I think vma->vm_ops can be NULL here. Should be:
> > >
> > >       if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> > >               return -EINVAL;
> >
> > I think again you'd probably only invoke this on file-backed so be ok, but again
> > as a matter of principle we should check it so will fix, thanks!
> >
> > >
> > > > +
> > > > +   return 0;
> > > > +}
> > > > +EXPORT_SYMBOL(__vma_check_mmap_hook);
>
> nit: Any reason __vma_check_mmap_hook() is not inlined next to its
> user vfs_mmap()?

Headers fun, fs.h is a 'before mm.h' header, so vm_operations_struct is not
declared yet here, so we can't actually do the check there.

>
> > > > +
> > > >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> > > >                      const struct page *page)
> > > >  {
> > > > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
> > > >      * invoked if we do NOT merge, so we only clean up the VMA we created.
> > > >      */
> > > >     if (err) {
> > > > -           const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > -
> > > > -           do_munmap(current->mm, vma->vm_start, len, NULL);
> > > > -
> > > > +           unmap_vma_locked(vma);
> > > >             if (action->error_hook) {
> > > >                     /* We may want to filter the error. */
> > > >                     err = action->error_hook(err);
> > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > index 054cf1d262fb..ef9f5a5365d1 100644
> > > > --- a/mm/vma.c
> > > > +++ b/mm/vma.c
> > > > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> > > >     return false;
> > > >  }
> > > >
> > > > -static int call_action_complete(struct mmap_state *map,
> > > > -                           struct mmap_action *action,
> > > > -                           struct vm_area_struct *vma)
> > > > +static int call_mapped_hook(struct vm_area_struct *vma)
> > > >  {
> > > > -   int ret;
> > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > +   void *vm_private_data = vma->vm_private_data;
> > > > +   int err;
> > > >
> > > > -   ret = mmap_action_complete(vma, action);
> > > > +   if (!vm_ops || !vm_ops->mapped)
> > > > +           return 0;
> > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > > > +                        vma->vm_file, &vm_private_data);
> > > > +   if (err) {
> > > > +           unmap_vma_locked(vma);
> > > > +           return err;
> > > > +   }
> > > > +   /* Update private data if changed. */
> > > > +   if (vm_private_data != vma->vm_private_data)
> > > > +           vma->vm_private_data = vm_private_data;
> > > > +   return 0;
> > > > +}
> > > >
> > > > -   /* If we held the file rmap we need to release it. */
> > > > -   if (map->hold_file_rmap_lock) {
> > > > -           struct file *file = vma->vm_file;
> > > > +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> > > > +                                 struct vm_area_struct *vma)
> > > > +{
> > > > +   struct file *file;
> > > >
> > > > -           i_mmap_unlock_write(file->f_mapping);
> > > > -   }
> > > > -   return ret;
> > > > +   if (!map->hold_file_rmap_lock)
> > > > +           return;
> > > > +   file = vma->vm_file;
> > > > +   i_mmap_unlock_write(file->f_mapping);
> > > >  }
> > > >
> > > >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > >     __mmap_complete(&map, vma);
> > > >
> > > >     if (have_mmap_prepare && allocated_new) {
> > > > -           error = call_action_complete(&map, &desc.action, vma);
> > > > +           error = mmap_action_complete(vma, &desc.action);
> > > > +           if (!error)
> > > > +                   error = call_mapped_hook(vma);
> > > >
> > > > +           maybe_drop_file_rmap_lock(&map, vma);
> > > >             if (error)
> > > >                     return error;
> > > >     }
> > > > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > > > index 908beb263307..47d8db809f31 100644
> > > > --- a/tools/testing/vma/include/dup.h
> > > > +++ b/tools/testing/vma/include/dup.h
> > > > @@ -606,12 +606,34 @@ struct vm_area_struct {
> > > >  } __randomize_layout;
> > > >
> > > >  struct vm_operations_struct {
> > > > -   void (*open)(struct vm_area_struct * area);
> > > > +   /**
> > > > +    * @open: Called when a VMA is remapped or split. Not called upon first
> > > > +    * mapping a VMA.
> > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > +    */
>
> This comment should have been introduced in the previous patch.

It's the testing code, it's not really important. But if I respin I'll fix... :)

>
> > > > +   void (*open)(struct vm_area_struct *vma);
> > > >     /**
> > > >      * @close: Called when the VMA is being removed from the MM.
> > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > >      */
> > > > -   void (*close)(struct vm_area_struct * area);
> > > > +   void (*close)(struct vm_area_struct *vma);
> > > > +   /**
> > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > +    * the new VMA is merged with an adjacent VMA.
> > > > +    *
> > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > +    * modify vma->vm_private_data as necessary.
> > > > +    *
> > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > +    * set from f_op->mmap.
> > > > +    *
> > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > +    * be unmapped.
> > > > +    *
> > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > +    */
> > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > +                 const struct file *file, void **vm_private_data);
> > > >     /* Called any time before splitting to check if it's allowed */
> > > >     int (*may_split)(struct vm_area_struct *area, unsigned long addr);
> > > >     int (*mremap)(struct vm_area_struct *area);
> > > > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> > > >     swap(vma->vm_file, file);
> > > >     fput(file);
> > > >  }
> > > > +
> > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > +{
> > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > +
> > > > +   mmap_assert_locked(vma->vm_mm);
> > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > +}
> > > > --
> > > > 2.53.0
> > > >
> > > >
> >
> > Cheers, Lorenzo
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Suren Baghdasaryan 3 weeks ago
On Mon, Mar 16, 2026 at 6:39 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
>
> On Sun, Mar 15, 2026 at 07:18:38PM -0700, Suren Baghdasaryan wrote:
> > On Fri, Mar 13, 2026 at 4:58 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
> > >
> > > On Fri, Mar 13, 2026 at 04:02:36AM -0700, Usama Arif wrote:
> > > > On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
> > > >
> > > > > Previously, when a driver needed to do something like establish a reference
> > > > > count, it could do so in the mmap hook in the knowledge that the mapping
> > > > > would succeed.
> > > > >
> > > > > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > > > > it is invoked prior to actually establishing the mapping.
> > > > >
> > > > > To take this into account, introduce a new vm_ops->mapped callback which is
> > > > > invoked when the VMA is first mapped (though notably - not when it is
> > > > > merged - which is correct and mirrors existing mmap/open/close behaviour).
> > > > >
> > > > > We do better than vm_ops->open() here, as this callback can return an
> > > > > error, at which point the VMA will be unmapped.
> > > > >
> > > > > Note that vm_ops->mapped() is invoked after any mmap action is
> > > > > complete (such as I/O remapping).
> > > > >
> > > > > We intentionally do not expose the VMA at this point, exposing only the
> > > > > fields that could be used, and an output parameter in case the operation
> > > > > needs to update the vma->vm_private_data field.
> > > > >
> > > > > In order to deal with stacked filesystems which invoke inner filesystem's
> > > > > mmap() invocations, add __compat_vma_mapped() and invoke it on
> > > > > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> > > > > handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> > > > > callback.
> > > > >
> > > > > We can now also remove call_action_complete() and invoke
> > > > > mmap_action_complete() directly, as we separate out the rmap lock logic to
> > > > > be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> > > > >
> > > > > We also abstract unmapping of a VMA on mmap action completion into its own
> > > > > helper function, unmap_vma_locked().
> > > > >
> > > > > Additionally, update VMA userland test headers to reflect the change.
> > > > >
> > > > > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > > > > ---
> > > > >  include/linux/fs.h              |  9 +++-
> > > > >  include/linux/mm.h              | 17 +++++++
> > > > >  mm/internal.h                   | 10 ++++
> > > > >  mm/util.c                       | 86 ++++++++++++++++++++++++---------
> > > > >  mm/vma.c                        | 41 +++++++++++-----
> > > > >  tools/testing/vma/include/dup.h | 34 ++++++++++++-
> > > > >  6 files changed, 158 insertions(+), 39 deletions(-)
> > > > >
> > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > index a2628a12bd2b..c390f5c667e3 100644
> > > > > --- a/include/linux/fs.h
> > > > > +++ b/include/linux/fs.h
> > > > > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> > > > >  }
> > > > >
> > > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> > > > >
> > > > >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> > > > >  {
> > > > > +   int err;
> > > > > +
> > > > >     if (file->f_op->mmap_prepare)
> > > > >             return compat_vma_mmap(file, vma);
> > > > >
> > > > > -   return file->f_op->mmap(file, vma);
> > > > > +   err = file->f_op->mmap(file, vma);
> > > > > +   if (err)
> > > > > +           return err;
> > > > > +
> > > > > +   return __vma_check_mmap_hook(vma);
> > > > >  }
> > > > >
> > > > >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > > index 12a0b4c63736..7333d5db1221 100644
> > > > > --- a/include/linux/mm.h
> > > > > +++ b/include/linux/mm.h
> > > > > @@ -759,6 +759,23 @@ struct vm_operations_struct {
> > > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > >      */
> > > > >     void (*close)(struct vm_area_struct *vma);
> > > > > +   /**
> > > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > > +    * the new VMA is merged with an adjacent VMA.
> > > > > +    *
> > > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > > +    * modify vma->vm_private_data as necessary.
> > > > > +    *
> > > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > > +    * set from f_op->mmap.
> > > > > +    *
> > > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > > +    * be unmapped.
> > > > > +    *
> > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > +    */
> > > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > > +                 const struct file *file, void **vm_private_data);
> > > > >     /* Called any time before splitting to check if it's allowed */
> > > > >     int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> > > > >     int (*mremap)(struct vm_area_struct *vma);
> > > > > diff --git a/mm/internal.h b/mm/internal.h
> > > > > index 7bfa85b5e78b..f0f2cf1caa36 100644
> > > > > --- a/mm/internal.h
> > > > > +++ b/mm/internal.h
> > > > > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
> > > > >   * mmap hook and safely handle error conditions. On error, VMA hooks will be
> > > > >   * mutated.
> > > > >   *
> > > > > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> > > > > + *
> >
> > What exactly would one do to "prefer f_op->mmap_prepare()"?
>
> I'm saying a person should implement f_op->mmap_prepare() rather than
> f_op->mmap(), since the latter is deprecated :)
>
> I think that's pretty clear no?
>
> > Since you are adding this comment for mmap_file(), I think you need to
> > describe more specifically what one should call instead.
>
> I think it'd be a complete distraction, since if you're at the point of calling
> mmap_file() you're already not implementing mmap_prepare except as a compatibility
> layer.

Yep, it seems like a warning that comes too late.

>
> I mean maybe I'll just drop this as it seems to be causing confusion.

Maybe instead we add a comment that f_ops->mmap is deprecated in favor
of f_ops->mmap_prepare() in here:
https://elixir.bootlin.com/linux/v7.0-rc4/source/include/linux/fs.h#L1940
?

>
> >
> > > > >   * @file: File which backs the mapping.
> > > > >   * @vma:  VMA which we are mapping.
> > > > >   *
> > > > > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> > > > >  /* unmap_vmas is in mm/memory.c */
> > > > >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> > > > >
> > > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > > +{
> > > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > +
> > > > > +   mmap_assert_locked(vma->vm_mm);
> >
> > You must hold the mmap write lock when unmapping. Would be better to
> > assert mmap_assert_write_locked() or even vma_assert_write_locked(),
> > which implies mmap_assert_write_locked().
>
> I'm not sure why we don't assert this in those paths.
>
> I think I assumed we could only assert readonly because one of those paths
> downgrades the mmap write lock to a read lock.
>
> I don't think we can do a VMA write lock assert here, since at the point of
> do_munmap() all callers can't possibly have the VMA write lock, since they are
> _looking up_ the VMA at the specified address.

It sounds strange to me that we are unmapping a VMA that was not
locked beforehand. Let me look into the call chains a bit more to
convince myself one way or the other. The fact that do_munmap() looks
up the VMA by address and then write-locks it inside
vms_gather_munmap_vmas() does not mean the VMA was not already locked.
vma_start_write() is re-entrant.

>
> But I can convert this to an mmap_assert_write_locked()!

Ok, let's go with that. I don't want to slow down your patchset while
I investigate locking rules here. We can strengthen the assertion
later.

>
> >
> > > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > > +}
> > > > > +
> > > > >  #ifdef CONFIG_MMU
> > > > >
> > > > >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > > > > diff --git a/mm/util.c b/mm/util.c
> > > > > index dba1191725b6..2b0ed54008d6 100644
> > > > > --- a/mm/util.c
> > > > > +++ b/mm/util.c
> > > > > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
> > > > >  EXPORT_SYMBOL(flush_dcache_folio);
> > > > >  #endif
> > > > >
> > > > > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > > > +{
> > > > > +   struct vm_area_desc desc = {
> > > > > +           .mm = vma->vm_mm,
> > > > > +           .file = file,
> > > > > +           .start = vma->vm_start,
> > > > > +           .end = vma->vm_end,
> > > > > +
> > > > > +           .pgoff = vma->vm_pgoff,
> > > > > +           .vm_file = vma->vm_file,
> > > > > +           .vma_flags = vma->flags,
> > > > > +           .page_prot = vma->vm_page_prot,
> > > > > +
> > > > > +           .action.type = MMAP_NOTHING, /* Default */
> > > > > +   };
> > > > > +   int err;
> > > > > +
> > > > > +   err = vfs_mmap_prepare(file, &desc);
> > > > > +   if (err)
> > > > > +           return err;
> > > > > +
> > > > > +   err = mmap_action_prepare(&desc, &desc.action);
> > > > > +   if (err)
> > > > > +           return err;
> > > > > +
> > > > > +   set_vma_from_desc(vma, &desc);
> > > > > +   return mmap_action_complete(vma, &desc.action);
> > > > > +}
> > > > > +
> > > > > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > > > > +{
> > > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > > +   void *vm_private_data = vma->vm_private_data;
> > > > > +   int err;
> > > > > +
> > > > > +   if (!vm_ops->mapped)
> > > > > +           return 0;
> > > > > +
> > > >
> > > > Hello!
> > > >
> > > > Can vm_ops be NULL here?  __compat_vma_mapped() is called from
> > > > compat_vma_mmap(), which is reached when a filesystem provides
> > > > mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
> > > > vma->vm_ops will be NULL and this dereferences a NULL pointer.
> > >
> > > I _think_ for this to ever be invoked, you would need to be dealing with a
> > > file-backed VMA so vm_ops->fault would HAVE to be defined.
> > >
> > > But you're right anyway as a matter of principle we should check it! Will fix.
> > >
> > > >
> > > > For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
> > > > a NULL pointer dereference here.
> > > >
> > > > Would need to do
> > > >       if (!vm_ops || !vm_ops->mapped)
> > > >               return 0;
> > > >
> > > > here
> > >
> > > Yes.
> > >
> > > >
> > > >
> > > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > > > > +                        &vm_private_data);
> > > > > +   if (err)
> > > > > +           unmap_vma_locked(vma);
> > > >
> > > > when mapped() returns an error, unmap_vma_locked(vma) is called
> > > > but execution continues into the vm_private_data update below.  After
> > > > unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
> > > > entirely), so accessing vma->vm_private_data after that is a
> > > > use-after-free.
> > >
> > > Very good point :) will fix thanks!
> > >
> > > Probably:
> > >
> > >         if (err)
> > >                 unmap_vma_locked(vma);
> > >         else if (vm_private_data != vma->vm_private_data)
> > >                 vma->vm_private_data = vm_private_data;
> > >
> > >         return err;
> > >
> > > Would be fine.
> > >
> > > >
> > > > Probably need to do:
> > > >       if (err) {
> > > >               unmap_vma_locked(vma);
> > > >               return err;
> > > >       }
> > > >
> > > > > +   /* Update private data if changed. */
> > > > > +   if (vm_private_data != vma->vm_private_data)
> > > > > +           vma->vm_private_data = vm_private_data;
> > > > > +
> > > > > +   return err;
> > > > > +}
> > > > > +
> > > > >  /**
> > > > >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> > > > >   * existing VMA and execute any requested actions.
> > > > > @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> > > > >   */
> > > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > > >  {
> > > > > -   struct vm_area_desc desc = {
> > > > > -           .mm = vma->vm_mm,
> > > > > -           .file = file,
> > > > > -           .start = vma->vm_start,
> > > > > -           .end = vma->vm_end,
> > > > > -
> > > > > -           .pgoff = vma->vm_pgoff,
> > > > > -           .vm_file = vma->vm_file,
> > > > > -           .vma_flags = vma->flags,
> > > > > -           .page_prot = vma->vm_page_prot,
> > > > > -
> > > > > -           .action.type = MMAP_NOTHING, /* Default */
> > > > > -   };
> > > > >     int err;
> > > > >
> > > > > -   err = vfs_mmap_prepare(file, &desc);
> > > > > -   if (err)
> > > > > -           return err;
> > > > > -
> > > > > -   err = mmap_action_prepare(&desc, &desc.action);
> > > > > +   err = __compat_vma_mmap(file, vma);
> > > > >     if (err)
> > > > >             return err;
> > > > >
> > > > > -   set_vma_from_desc(vma, &desc);
> > > > > -   return mmap_action_complete(vma, &desc.action);
> > > > > +   return __compat_vma_mapped(file, vma);
> > > > >  }
> > > > >  EXPORT_SYMBOL(compat_vma_mmap);
> > > > >
> > > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > > > > +{
> > > > > +   /* vm_ops->mapped is not valid if mmap() is specified. */
> > > > > +   if (WARN_ON_ONCE(vma->vm_ops->mapped))
> > > > > +           return -EINVAL;
> > > >
> > > > I think vma->vm_ops can be NULL here. Should be:
> > > >
> > > >       if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> > > >               return -EINVAL;
> > >
> > > I think again you'd probably only invoke this on file-backed so be ok, but again
> > > as a matter of principle we should check it so will fix, thanks!
> > >
> > > >
> > > > > +
> > > > > +   return 0;
> > > > > +}
> > > > > +EXPORT_SYMBOL(__vma_check_mmap_hook);
> >
> > nit: Any reason __vma_check_mmap_hook() is not inlined next to its
> > user vfs_mmap()?
>
> Headers fun, fs.h is a 'before mm.h' header, so vm_operations_struct is not
> declared yet here, so we can't actually do the check there.

Ack.

>
> >
> > > > > +
> > > > >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> > > > >                      const struct page *page)
> > > > >  {
> > > > > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
> > > > >      * invoked if we do NOT merge, so we only clean up the VMA we created.
> > > > >      */
> > > > >     if (err) {
> > > > > -           const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > -
> > > > > -           do_munmap(current->mm, vma->vm_start, len, NULL);
> > > > > -
> > > > > +           unmap_vma_locked(vma);
> > > > >             if (action->error_hook) {
> > > > >                     /* We may want to filter the error. */
> > > > >                     err = action->error_hook(err);
> > > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > > index 054cf1d262fb..ef9f5a5365d1 100644
> > > > > --- a/mm/vma.c
> > > > > +++ b/mm/vma.c
> > > > > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> > > > >     return false;
> > > > >  }
> > > > >
> > > > > -static int call_action_complete(struct mmap_state *map,
> > > > > -                           struct mmap_action *action,
> > > > > -                           struct vm_area_struct *vma)
> > > > > +static int call_mapped_hook(struct vm_area_struct *vma)
> > > > >  {
> > > > > -   int ret;
> > > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > > +   void *vm_private_data = vma->vm_private_data;
> > > > > +   int err;
> > > > >
> > > > > -   ret = mmap_action_complete(vma, action);
> > > > > +   if (!vm_ops || !vm_ops->mapped)
> > > > > +           return 0;
> > > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > > > > +                        vma->vm_file, &vm_private_data);
> > > > > +   if (err) {
> > > > > +           unmap_vma_locked(vma);
> > > > > +           return err;
> > > > > +   }
> > > > > +   /* Update private data if changed. */
> > > > > +   if (vm_private_data != vma->vm_private_data)
> > > > > +           vma->vm_private_data = vm_private_data;
> > > > > +   return 0;
> > > > > +}
> > > > >
> > > > > -   /* If we held the file rmap we need to release it. */
> > > > > -   if (map->hold_file_rmap_lock) {
> > > > > -           struct file *file = vma->vm_file;
> > > > > +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> > > > > +                                 struct vm_area_struct *vma)
> > > > > +{
> > > > > +   struct file *file;
> > > > >
> > > > > -           i_mmap_unlock_write(file->f_mapping);
> > > > > -   }
> > > > > -   return ret;
> > > > > +   if (!map->hold_file_rmap_lock)
> > > > > +           return;
> > > > > +   file = vma->vm_file;
> > > > > +   i_mmap_unlock_write(file->f_mapping);
> > > > >  }
> > > > >
> > > > >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > > > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > > >     __mmap_complete(&map, vma);
> > > > >
> > > > >     if (have_mmap_prepare && allocated_new) {
> > > > > -           error = call_action_complete(&map, &desc.action, vma);
> > > > > +           error = mmap_action_complete(vma, &desc.action);
> > > > > +           if (!error)
> > > > > +                   error = call_mapped_hook(vma);
> > > > >
> > > > > +           maybe_drop_file_rmap_lock(&map, vma);
> > > > >             if (error)
> > > > >                     return error;
> > > > >     }
> > > > > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > > > > index 908beb263307..47d8db809f31 100644
> > > > > --- a/tools/testing/vma/include/dup.h
> > > > > +++ b/tools/testing/vma/include/dup.h
> > > > > @@ -606,12 +606,34 @@ struct vm_area_struct {
> > > > >  } __randomize_layout;
> > > > >
> > > > >  struct vm_operations_struct {
> > > > > -   void (*open)(struct vm_area_struct * area);
> > > > > +   /**
> > > > > +    * @open: Called when a VMA is remapped or split. Not called upon first
> > > > > +    * mapping a VMA.
> > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > +    */
> >
> > This comment should have been introduced in the previous patch.
>
> It's the testing code, it's not really important. But if I respin I'll fix... :)

Thanks!

>
> >
> > > > > +   void (*open)(struct vm_area_struct *vma);
> > > > >     /**
> > > > >      * @close: Called when the VMA is being removed from the MM.
> > > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > >      */
> > > > > -   void (*close)(struct vm_area_struct * area);
> > > > > +   void (*close)(struct vm_area_struct *vma);
> > > > > +   /**
> > > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > > +    * the new VMA is merged with an adjacent VMA.
> > > > > +    *
> > > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > > +    * modify vma->vm_private_data as necessary.
> > > > > +    *
> > > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > > +    * set from f_op->mmap.
> > > > > +    *
> > > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > > +    * be unmapped.
> > > > > +    *
> > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > +    */
> > > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > > +                 const struct file *file, void **vm_private_data);
> > > > >     /* Called any time before splitting to check if it's allowed */
> > > > >     int (*may_split)(struct vm_area_struct *area, unsigned long addr);
> > > > >     int (*mremap)(struct vm_area_struct *area);
> > > > > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> > > > >     swap(vma->vm_file, file);
> > > > >     fput(file);
> > > > >  }
> > > > > +
> > > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > > +{
> > > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > +
> > > > > +   mmap_assert_locked(vma->vm_mm);
> > > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > > +}
> > > > > --
> > > > > 2.53.0
> > > > >
> > > > >
> > >
> > > Cheers, Lorenzo
Re: [PATCH 04/15] mm: add vm_ops->mapped hook
Posted by Lorenzo Stoakes (Oracle) 3 weeks ago
On Mon, Mar 16, 2026 at 04:39:00PM -0700, Suren Baghdasaryan wrote:
> On Mon, Mar 16, 2026 at 6:39 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
> >
> > On Sun, Mar 15, 2026 at 07:18:38PM -0700, Suren Baghdasaryan wrote:
> > > On Fri, Mar 13, 2026 at 4:58 AM Lorenzo Stoakes (Oracle) <ljs@kernel.org> wrote:
> > > >
> > > > On Fri, Mar 13, 2026 at 04:02:36AM -0700, Usama Arif wrote:
> > > > > On Thu, 12 Mar 2026 20:27:19 +0000 "Lorenzo Stoakes (Oracle)" <ljs@kernel.org> wrote:
> > > > >
> > > > > > Previously, when a driver needed to do something like establish a reference
> > > > > > count, it could do so in the mmap hook in the knowledge that the mapping
> > > > > > would succeed.
> > > > > >
> > > > > > With the introduction of f_op->mmap_prepare this is no longer the case, as
> > > > > > it is invoked prior to actually establishing the mapping.
> > > > > >
> > > > > > To take this into account, introduce a new vm_ops->mapped callback which is
> > > > > > invoked when the VMA is first mapped (though notably - not when it is
> > > > > > merged - which is correct and mirrors existing mmap/open/close behaviour).
> > > > > >
> > > > > > We do better than vm_ops->open() here, as this callback can return an
> > > > > > error, at which point the VMA will be unmapped.
> > > > > >
> > > > > > Note that vm_ops->mapped() is invoked after any mmap action is
> > > > > > complete (such as I/O remapping).
> > > > > >
> > > > > > We intentionally do not expose the VMA at this point, exposing only the
> > > > > > fields that could be used, and an output parameter in case the operation
> > > > > > needs to update the vma->vm_private_data field.
> > > > > >
> > > > > > In order to deal with stacked filesystems which invoke inner filesystem's
> > > > > > mmap() invocations, add __compat_vma_mapped() and invoke it on
> > > > > > vfs_mmap() (via compat_vma_mmap()) to ensure that the mapped callback is
> > > > > > handled when an mmap() caller invokes a nested filesystem's mmap_prepare()
> > > > > > callback.
> > > > > >
> > > > > > We can now also remove call_action_complete() and invoke
> > > > > > mmap_action_complete() directly, as we separate out the rmap lock logic to
> > > > > > be called in __mmap_region() instead via maybe_drop_file_rmap_lock().
> > > > > >
> > > > > > We also abstract unmapping of a VMA on mmap action completion into its own
> > > > > > helper function, unmap_vma_locked().
> > > > > >
> > > > > > Additionally, update VMA userland test headers to reflect the change.
> > > > > >
> > > > > > Signed-off-by: Lorenzo Stoakes (Oracle) <ljs@kernel.org>
> > > > > > ---
> > > > > >  include/linux/fs.h              |  9 +++-
> > > > > >  include/linux/mm.h              | 17 +++++++
> > > > > >  mm/internal.h                   | 10 ++++
> > > > > >  mm/util.c                       | 86 ++++++++++++++++++++++++---------
> > > > > >  mm/vma.c                        | 41 +++++++++++-----
> > > > > >  tools/testing/vma/include/dup.h | 34 ++++++++++++-
> > > > > >  6 files changed, 158 insertions(+), 39 deletions(-)
> > > > > >
> > > > > > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > > > > > index a2628a12bd2b..c390f5c667e3 100644
> > > > > > --- a/include/linux/fs.h
> > > > > > +++ b/include/linux/fs.h
> > > > > > @@ -2059,13 +2059,20 @@ static inline bool can_mmap_file(struct file *file)
> > > > > >  }
> > > > > >
> > > > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma);
> > > > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma);
> > > > > >
> > > > > >  static inline int vfs_mmap(struct file *file, struct vm_area_struct *vma)
> > > > > >  {
> > > > > > +   int err;
> > > > > > +
> > > > > >     if (file->f_op->mmap_prepare)
> > > > > >             return compat_vma_mmap(file, vma);
> > > > > >
> > > > > > -   return file->f_op->mmap(file, vma);
> > > > > > +   err = file->f_op->mmap(file, vma);
> > > > > > +   if (err)
> > > > > > +           return err;
> > > > > > +
> > > > > > +   return __vma_check_mmap_hook(vma);
> > > > > >  }
> > > > > >
> > > > > >  static inline int vfs_mmap_prepare(struct file *file, struct vm_area_desc *desc)
> > > > > > diff --git a/include/linux/mm.h b/include/linux/mm.h
> > > > > > index 12a0b4c63736..7333d5db1221 100644
> > > > > > --- a/include/linux/mm.h
> > > > > > +++ b/include/linux/mm.h
> > > > > > @@ -759,6 +759,23 @@ struct vm_operations_struct {
> > > > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > >      */
> > > > > >     void (*close)(struct vm_area_struct *vma);
> > > > > > +   /**
> > > > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > > > +    * the new VMA is merged with an adjacent VMA.
> > > > > > +    *
> > > > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > > > +    * modify vma->vm_private_data as necessary.
> > > > > > +    *
> > > > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > > > +    * set from f_op->mmap.
> > > > > > +    *
> > > > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > > > +    * be unmapped.
> > > > > > +    *
> > > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > > +    */
> > > > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > > > +                 const struct file *file, void **vm_private_data);
> > > > > >     /* Called any time before splitting to check if it's allowed */
> > > > > >     int (*may_split)(struct vm_area_struct *vma, unsigned long addr);
> > > > > >     int (*mremap)(struct vm_area_struct *vma);
> > > > > > diff --git a/mm/internal.h b/mm/internal.h
> > > > > > index 7bfa85b5e78b..f0f2cf1caa36 100644
> > > > > > --- a/mm/internal.h
> > > > > > +++ b/mm/internal.h
> > > > > > @@ -158,6 +158,8 @@ static inline void *folio_raw_mapping(const struct folio *folio)
> > > > > >   * mmap hook and safely handle error conditions. On error, VMA hooks will be
> > > > > >   * mutated.
> > > > > >   *
> > > > > > + * IMPORTANT: f_op->mmap() is deprecated, prefer f_op->mmap_prepare().
> > > > > > + *
> > >
> > > What exactly would one do to "prefer f_op->mmap_prepare()"?
> >
> > I'm saying a person should implement f_op->mmap_prepare() rather than
> > f_op->mmap(), since the latter is deprecated :)
> >
> > I think that's pretty clear no?
> >
> > > Since you are adding this comment for mmap_file(), I think you need to
> > > describe more specifically what one should call instead.
> >
> > I think it'd be a complete distraction, since if you're at the point of calling
> > mmap_file() you're already not implementing mmap_prepare except as a compatibility
> > layer.
>
> Yep, it seems like a warning that comes too late.

Yeah, it's the wrong place for it, agreed.

>
> >
> > I mean maybe I'll just drop this as it seems to be causing confusion.
>
> Maybe instead we add a comment that f_ops->mmap is deprecated in favor
> of f_ops->mmap_prepare() in here:
> https://elixir.bootlin.com/linux/v7.0-rc4/source/include/linux/fs.h#L1940
> ?

Yeah could do, I think maybe once the mmap_prepare changes are further along
actually, as I am still essentially figuring out what functionality to
provide/the shape of it as I develop it.

It's a bit chicken-and-egg, but doing it this way has evolved to a pretty nice
approach so far matching what drivers _actually do_ + finding new ways of doing
them without risk of them breaking stuff which is kinda the whole point - this
isn't a rework for rework's sake, but rather effectively completely changing how
drivers perform mmap.

>
> >
> > >
> > > > > >   * @file: File which backs the mapping.
> > > > > >   * @vma:  VMA which we are mapping.
> > > > > >   *
> > > > > > @@ -201,6 +203,14 @@ static inline void vma_close(struct vm_area_struct *vma)
> > > > > >  /* unmap_vmas is in mm/memory.c */
> > > > > >  void unmap_vmas(struct mmu_gather *tlb, struct unmap_desc *unmap);
> > > > > >
> > > > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > > +
> > > > > > +   mmap_assert_locked(vma->vm_mm);
> > >
> > > You must hold the mmap write lock when unmapping. Would be better to
> > > assert mmap_assert_write_locked() or even vma_assert_write_locked(),
> > > which implies mmap_assert_write_locked().
> >
> > I'm not sure why we don't assert this in those paths.
> >
> > I think I assumed we could only assert readonly because one of those paths
> > downgrades the mmap write lock to a read lock.
> >
> > I don't think we can do a VMA write lock assert here, since at the point of
> > do_munmap() all callers can't possibly have the VMA write lock, since they are
> > _looking up_ the VMA at the specified address.
>
> It sounds strange to me that we are unmapping a VMA that was not
> locked beforehand. Let me look into the call chains a bit more to
> convince myself one way or the other. The fact that do_munmap() looks
> up the VMA by address and then write-locks it inside
> vms_gather_munmap_vmas() does not mean the VMA was not already locked.
> vma_start_write() is re-entrant.

Well I mean:

SYSCALL_DEFINE2(munmap, ...)
-> __vm_munmap [ takes mmap write lock ]
-> do_vmi_munmap()

do_munmap() [ assumes (but does not assert, we should add) mmap write lock]
-> do_vmi_munmap()

You can unmap more than one VMA from this interface, or even choose a range that
doesn't have anything mapped.

do_vmi_munmap() gets the first VMA and if none present exits early, then calls
into do_vmi_align_munmap() otherwise, which does the whole gather/complete
dance.

With respect to the mmap()'ing, actually we probably should always have VMA
write lock, because for any action to be taken, you couldn't merge since
VMA_SPECIAL_FLAGS would be specified (any kind of remap would be VMA_PFNMAP_BIT
+ friends, map kernel pages would be VMA_MIXEDMAP_BIT).

(Might be worth me adding an assert for that actually to avoid confusion.)

Not merging would mean __mmap_new_vma() would be called which naturally gets the
VMA write lock.

So you're right I think we should hold the VMA lock here, but I'm wondering if
it's much of a muchness since really we only _need_ the mmap write lock here.


>
> >
> > But I can convert this to an mmap_assert_write_locked()!
>
> Ok, let's go with that. I don't want to slow down your patchset while
> I investigate locking rules here. We can strengthen the assertion
> later.

Thanks!

>
> >
> > >
> > > > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > > > +}
> > > > > > +
> > > > > >  #ifdef CONFIG_MMU
> > > > > >
> > > > > >  static inline void get_anon_vma(struct anon_vma *anon_vma)
> > > > > > diff --git a/mm/util.c b/mm/util.c
> > > > > > index dba1191725b6..2b0ed54008d6 100644
> > > > > > --- a/mm/util.c
> > > > > > +++ b/mm/util.c
> > > > > > @@ -1163,6 +1163,55 @@ void flush_dcache_folio(struct folio *folio)
> > > > > >  EXPORT_SYMBOL(flush_dcache_folio);
> > > > > >  #endif
> > > > > >
> > > > > > +static int __compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   struct vm_area_desc desc = {
> > > > > > +           .mm = vma->vm_mm,
> > > > > > +           .file = file,
> > > > > > +           .start = vma->vm_start,
> > > > > > +           .end = vma->vm_end,
> > > > > > +
> > > > > > +           .pgoff = vma->vm_pgoff,
> > > > > > +           .vm_file = vma->vm_file,
> > > > > > +           .vma_flags = vma->flags,
> > > > > > +           .page_prot = vma->vm_page_prot,
> > > > > > +
> > > > > > +           .action.type = MMAP_NOTHING, /* Default */
> > > > > > +   };
> > > > > > +   int err;
> > > > > > +
> > > > > > +   err = vfs_mmap_prepare(file, &desc);
> > > > > > +   if (err)
> > > > > > +           return err;
> > > > > > +
> > > > > > +   err = mmap_action_prepare(&desc, &desc.action);
> > > > > > +   if (err)
> > > > > > +           return err;
> > > > > > +
> > > > > > +   set_vma_from_desc(vma, &desc);
> > > > > > +   return mmap_action_complete(vma, &desc.action);
> > > > > > +}
> > > > > > +
> > > > > > +static int __compat_vma_mapped(struct file *file, struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > > > +   void *vm_private_data = vma->vm_private_data;
> > > > > > +   int err;
> > > > > > +
> > > > > > +   if (!vm_ops->mapped)
> > > > > > +           return 0;
> > > > > > +
> > > > >
> > > > > Hello!
> > > > >
> > > > > Can vm_ops be NULL here?  __compat_vma_mapped() is called from
> > > > > compat_vma_mmap(), which is reached when a filesystem provides
> > > > > mmap_prepare.  If the mmap_prepare hook does not set desc->vm_ops,
> > > > > vma->vm_ops will be NULL and this dereferences a NULL pointer.
> > > >
> > > > I _think_ for this to ever be invoked, you would need to be dealing with a
> > > > file-backed VMA so vm_ops->fault would HAVE to be defined.
> > > >
> > > > But you're right anyway as a matter of principle we should check it! Will fix.
> > > >
> > > > >
> > > > > For e.g. drivers/char/mem.c, mmap_zero_prepare() would trigger
> > > > > a NULL pointer dereference here.
> > > > >
> > > > > Would need to do
> > > > >       if (!vm_ops || !vm_ops->mapped)
> > > > >               return 0;
> > > > >
> > > > > here
> > > >
> > > > Yes.
> > > >
> > > > >
> > > > >
> > > > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff, file,
> > > > > > +                        &vm_private_data);
> > > > > > +   if (err)
> > > > > > +           unmap_vma_locked(vma);
> > > > >
> > > > > when mapped() returns an error, unmap_vma_locked(vma) is called
> > > > > but execution continues into the vm_private_data update below.  After
> > > > > unmap_vma_locked() the VMA may be freed (do_munmap can remove the VMA
> > > > > entirely), so accessing vma->vm_private_data after that is a
> > > > > use-after-free.
> > > >
> > > > Very good point :) will fix thanks!
> > > >
> > > > Probably:
> > > >
> > > >         if (err)
> > > >                 unmap_vma_locked(vma);
> > > >         else if (vm_private_data != vma->vm_private_data)
> > > >                 vma->vm_private_data = vm_private_data;
> > > >
> > > >         return err;
> > > >
> > > > Would be fine.
> > > >
> > > > >
> > > > > Probably need to do:
> > > > >       if (err) {
> > > > >               unmap_vma_locked(vma);
> > > > >               return err;
> > > > >       }
> > > > >
> > > > > > +   /* Update private data if changed. */
> > > > > > +   if (vm_private_data != vma->vm_private_data)
> > > > > > +           vma->vm_private_data = vm_private_data;
> > > > > > +
> > > > > > +   return err;
> > > > > > +}
> > > > > > +
> > > > > >  /**
> > > > > >   * compat_vma_mmap() - Apply the file's .mmap_prepare() hook to an
> > > > > >   * existing VMA and execute any requested actions.
> > > > > > @@ -1191,34 +1240,26 @@ EXPORT_SYMBOL(flush_dcache_folio);
> > > > > >   */
> > > > > >  int compat_vma_mmap(struct file *file, struct vm_area_struct *vma)
> > > > > >  {
> > > > > > -   struct vm_area_desc desc = {
> > > > > > -           .mm = vma->vm_mm,
> > > > > > -           .file = file,
> > > > > > -           .start = vma->vm_start,
> > > > > > -           .end = vma->vm_end,
> > > > > > -
> > > > > > -           .pgoff = vma->vm_pgoff,
> > > > > > -           .vm_file = vma->vm_file,
> > > > > > -           .vma_flags = vma->flags,
> > > > > > -           .page_prot = vma->vm_page_prot,
> > > > > > -
> > > > > > -           .action.type = MMAP_NOTHING, /* Default */
> > > > > > -   };
> > > > > >     int err;
> > > > > >
> > > > > > -   err = vfs_mmap_prepare(file, &desc);
> > > > > > -   if (err)
> > > > > > -           return err;
> > > > > > -
> > > > > > -   err = mmap_action_prepare(&desc, &desc.action);
> > > > > > +   err = __compat_vma_mmap(file, vma);
> > > > > >     if (err)
> > > > > >             return err;
> > > > > >
> > > > > > -   set_vma_from_desc(vma, &desc);
> > > > > > -   return mmap_action_complete(vma, &desc.action);
> > > > > > +   return __compat_vma_mapped(file, vma);
> > > > > >  }
> > > > > >  EXPORT_SYMBOL(compat_vma_mmap);
> > > > > >
> > > > > > +int __vma_check_mmap_hook(struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   /* vm_ops->mapped is not valid if mmap() is specified. */
> > > > > > +   if (WARN_ON_ONCE(vma->vm_ops->mapped))
> > > > > > +           return -EINVAL;
> > > > >
> > > > > I think vma->vm_ops can be NULL here. Should be:
> > > > >
> > > > >       if (vma->vm_ops && WARN_ON_ONCE(vma->vm_ops->mapped))
> > > > >               return -EINVAL;
> > > >
> > > > I think again you'd probably only invoke this on file-backed so be ok, but again
> > > > as a matter of principle we should check it so will fix, thanks!
> > > >
> > > > >
> > > > > > +
> > > > > > +   return 0;
> > > > > > +}
> > > > > > +EXPORT_SYMBOL(__vma_check_mmap_hook);
> > >
> > > nit: Any reason __vma_check_mmap_hook() is not inlined next to its
> > > user vfs_mmap()?
> >
> > Headers fun, fs.h is a 'before mm.h' header, so vm_operations_struct is not
> > declared yet here, so we can't actually do the check there.
>
> Ack.
>
> >
> > >
> > > > > > +
> > > > > >  static void set_ps_flags(struct page_snapshot *ps, const struct folio *folio,
> > > > > >                      const struct page *page)
> > > > > >  {
> > > > > > @@ -1316,10 +1357,7 @@ static int mmap_action_finish(struct vm_area_struct *vma,
> > > > > >      * invoked if we do NOT merge, so we only clean up the VMA we created.
> > > > > >      */
> > > > > >     if (err) {
> > > > > > -           const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > > -
> > > > > > -           do_munmap(current->mm, vma->vm_start, len, NULL);
> > > > > > -
> > > > > > +           unmap_vma_locked(vma);
> > > > > >             if (action->error_hook) {
> > > > > >                     /* We may want to filter the error. */
> > > > > >                     err = action->error_hook(err);
> > > > > > diff --git a/mm/vma.c b/mm/vma.c
> > > > > > index 054cf1d262fb..ef9f5a5365d1 100644
> > > > > > --- a/mm/vma.c
> > > > > > +++ b/mm/vma.c
> > > > > > @@ -2705,21 +2705,35 @@ static bool can_set_ksm_flags_early(struct mmap_state *map)
> > > > > >     return false;
> > > > > >  }
> > > > > >
> > > > > > -static int call_action_complete(struct mmap_state *map,
> > > > > > -                           struct mmap_action *action,
> > > > > > -                           struct vm_area_struct *vma)
> > > > > > +static int call_mapped_hook(struct vm_area_struct *vma)
> > > > > >  {
> > > > > > -   int ret;
> > > > > > +   const struct vm_operations_struct *vm_ops = vma->vm_ops;
> > > > > > +   void *vm_private_data = vma->vm_private_data;
> > > > > > +   int err;
> > > > > >
> > > > > > -   ret = mmap_action_complete(vma, action);
> > > > > > +   if (!vm_ops || !vm_ops->mapped)
> > > > > > +           return 0;
> > > > > > +   err = vm_ops->mapped(vma->vm_start, vma->vm_end, vma->vm_pgoff,
> > > > > > +                        vma->vm_file, &vm_private_data);
> > > > > > +   if (err) {
> > > > > > +           unmap_vma_locked(vma);
> > > > > > +           return err;
> > > > > > +   }
> > > > > > +   /* Update private data if changed. */
> > > > > > +   if (vm_private_data != vma->vm_private_data)
> > > > > > +           vma->vm_private_data = vm_private_data;
> > > > > > +   return 0;
> > > > > > +}
> > > > > >
> > > > > > -   /* If we held the file rmap we need to release it. */
> > > > > > -   if (map->hold_file_rmap_lock) {
> > > > > > -           struct file *file = vma->vm_file;
> > > > > > +static void maybe_drop_file_rmap_lock(struct mmap_state *map,
> > > > > > +                                 struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   struct file *file;
> > > > > >
> > > > > > -           i_mmap_unlock_write(file->f_mapping);
> > > > > > -   }
> > > > > > -   return ret;
> > > > > > +   if (!map->hold_file_rmap_lock)
> > > > > > +           return;
> > > > > > +   file = vma->vm_file;
> > > > > > +   i_mmap_unlock_write(file->f_mapping);
> > > > > >  }
> > > > > >
> > > > > >  static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > > > > @@ -2773,8 +2787,11 @@ static unsigned long __mmap_region(struct file *file, unsigned long addr,
> > > > > >     __mmap_complete(&map, vma);
> > > > > >
> > > > > >     if (have_mmap_prepare && allocated_new) {
> > > > > > -           error = call_action_complete(&map, &desc.action, vma);
> > > > > > +           error = mmap_action_complete(vma, &desc.action);
> > > > > > +           if (!error)
> > > > > > +                   error = call_mapped_hook(vma);
> > > > > >
> > > > > > +           maybe_drop_file_rmap_lock(&map, vma);
> > > > > >             if (error)
> > > > > >                     return error;
> > > > > >     }
> > > > > > diff --git a/tools/testing/vma/include/dup.h b/tools/testing/vma/include/dup.h
> > > > > > index 908beb263307..47d8db809f31 100644
> > > > > > --- a/tools/testing/vma/include/dup.h
> > > > > > +++ b/tools/testing/vma/include/dup.h
> > > > > > @@ -606,12 +606,34 @@ struct vm_area_struct {
> > > > > >  } __randomize_layout;
> > > > > >
> > > > > >  struct vm_operations_struct {
> > > > > > -   void (*open)(struct vm_area_struct * area);
> > > > > > +   /**
> > > > > > +    * @open: Called when a VMA is remapped or split. Not called upon first
> > > > > > +    * mapping a VMA.
> > > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > > +    */
> > >
> > > This comment should have been introduced in the previous patch.
> >
> > It's the testing code, it's not really important. But if I respin I'll fix... :)
>
> Thanks!
>
> >
> > >
> > > > > > +   void (*open)(struct vm_area_struct *vma);
> > > > > >     /**
> > > > > >      * @close: Called when the VMA is being removed from the MM.
> > > > > >      * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > >      */
> > > > > > -   void (*close)(struct vm_area_struct * area);
> > > > > > +   void (*close)(struct vm_area_struct *vma);
> > > > > > +   /**
> > > > > > +    * @mapped: Called when the VMA is first mapped in the MM. Not called if
> > > > > > +    * the new VMA is merged with an adjacent VMA.
> > > > > > +    *
> > > > > > +    * The @vm_private_data field is an output field allowing the user to
> > > > > > +    * modify vma->vm_private_data as necessary.
> > > > > > +    *
> > > > > > +    * ONLY valid if set from f_op->mmap_prepare. Will result in an error if
> > > > > > +    * set from f_op->mmap.
> > > > > > +    *
> > > > > > +    * Returns %0 on success, or an error otherwise. On error, the VMA will
> > > > > > +    * be unmapped.
> > > > > > +    *
> > > > > > +    * Context: User context.  May sleep.  Caller holds mmap_lock.
> > > > > > +    */
> > > > > > +   int (*mapped)(unsigned long start, unsigned long end, pgoff_t pgoff,
> > > > > > +                 const struct file *file, void **vm_private_data);
> > > > > >     /* Called any time before splitting to check if it's allowed */
> > > > > >     int (*may_split)(struct vm_area_struct *area, unsigned long addr);
> > > > > >     int (*mremap)(struct vm_area_struct *area);
> > > > > > @@ -1345,3 +1367,11 @@ static inline void vma_set_file(struct vm_area_struct *vma, struct file *file)
> > > > > >     swap(vma->vm_file, file);
> > > > > >     fput(file);
> > > > > >  }
> > > > > > +
> > > > > > +static inline void unmap_vma_locked(struct vm_area_struct *vma)
> > > > > > +{
> > > > > > +   const size_t len = vma_pages(vma) << PAGE_SHIFT;
> > > > > > +
> > > > > > +   mmap_assert_locked(vma->vm_mm);
> > > > > > +   do_munmap(vma->vm_mm, vma->vm_start, len, NULL);
> > > > > > +}
> > > > > > --
> > > > > > 2.53.0
> > > > > >
> > > > > >
> > > >
> > > > Cheers, Lorenzo

Cheers, Lorenzo