Update the mem char driver (backing /dev/mem and /dev/zero) to use the
f_op->mmap_prepare hook rather than the deprecated f_op->mmap hook.
The /dev/zero implementation has a unique and rather concerning
characteristic in that it marks MAP_PRIVATE mmap() mappings as anonymous
when they are, in fact, not.
The new f_op->mmap_prepare() can support this, but rather than introducing
a helper function to perform this hack (and risk introducing other users),
simply set desc->vm_ops to NULL here and add a comment describing what's
going on.
We also introduce shmem_zero_setup_desc() to allow for the shared mapping
case via an f_op->mmap_prepare() hook, and generalise the code between this
and shmem_zero_setup().
We also use the desc->action.error_hook callback to filter the remap error
to -EAGAIN, keeping behaviour consistent with the previous implementation.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
drivers/char/mem.c | 75 ++++++++++++++++++++++------------------
include/linux/shmem_fs.h | 3 +-
mm/shmem.c | 40 ++++++++++++++++-----
3 files changed, 76 insertions(+), 42 deletions(-)
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 34b815901b20..23194788ee41 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -304,13 +304,13 @@ static unsigned zero_mmap_capabilities(struct file *file)
}
/* can't do an in-place private mapping if there's no MMU */
-static inline int private_mapping_ok(struct vm_area_struct *vma)
+static inline int private_mapping_ok(struct vm_area_desc *desc)
{
- return is_nommu_shared_mapping(vma->vm_flags);
+ return is_nommu_shared_mapping(desc->vm_flags);
}
#else
-static inline int private_mapping_ok(struct vm_area_struct *vma)
+static inline int private_mapping_ok(struct vm_area_desc *desc)
{
return 1;
}
@@ -322,46 +322,50 @@ static const struct vm_operations_struct mmap_mem_ops = {
#endif
};
-static int mmap_mem(struct file *file, struct vm_area_struct *vma)
+static int mmap_filter_error(int err)
{
- size_t size = vma->vm_end - vma->vm_start;
- phys_addr_t offset = (phys_addr_t)vma->vm_pgoff << PAGE_SHIFT;
+ return -EAGAIN;
+}
+
+static int mmap_mem_prepare(struct vm_area_desc *desc)
+{
+ struct file *file = desc->file;
+ const size_t size = vma_desc_size(desc);
+ const phys_addr_t offset = (phys_addr_t)desc->pgoff << PAGE_SHIFT;
/* Does it even fit in phys_addr_t? */
- if (offset >> PAGE_SHIFT != vma->vm_pgoff)
+ if (offset >> PAGE_SHIFT != desc->pgoff)
return -EINVAL;
/* It's illegal to wrap around the end of the physical address space. */
if (offset + (phys_addr_t)size - 1 < offset)
return -EINVAL;
- if (!valid_mmap_phys_addr_range(vma->vm_pgoff, size))
+ if (!valid_mmap_phys_addr_range(desc->pgoff, size))
return -EINVAL;
- if (!private_mapping_ok(vma))
+ if (!private_mapping_ok(desc))
return -ENOSYS;
- if (!range_is_allowed(vma->vm_pgoff, size))
+ if (!range_is_allowed(desc->pgoff, size))
return -EPERM;
- if (!phys_mem_access_prot_allowed(file, vma->vm_pgoff, size,
- &vma->vm_page_prot))
+ if (!phys_mem_access_prot_allowed(file, desc->pgoff, size,
+ &desc->page_prot))
return -EINVAL;
- vma->vm_page_prot = phys_mem_access_prot(file, vma->vm_pgoff,
- size,
- vma->vm_page_prot);
+ desc->page_prot = phys_mem_access_prot(file, desc->pgoff,
+ size,
+ desc->page_prot);
- vma->vm_ops = &mmap_mem_ops;
+ desc->vm_ops = &mmap_mem_ops;
/* Remap-pfn-range will mark the range VM_IO */
- if (remap_pfn_range(vma,
- vma->vm_start,
- vma->vm_pgoff,
- size,
- vma->vm_page_prot)) {
- return -EAGAIN;
- }
+ mmap_action_remap(&desc->action, desc->start, desc->pgoff, size,
+ desc->page_prot);
+ /* We filter remap errors to -EAGAIN. */
+ desc->action.error_hook = mmap_filter_error;
+
return 0;
}
@@ -501,14 +505,18 @@ static ssize_t read_zero(struct file *file, char __user *buf,
return cleared;
}
-static int mmap_zero(struct file *file, struct vm_area_struct *vma)
+static int mmap_prepare_zero(struct vm_area_desc *desc)
{
#ifndef CONFIG_MMU
return -ENOSYS;
#endif
- if (vma->vm_flags & VM_SHARED)
- return shmem_zero_setup(vma);
- vma_set_anonymous(vma);
+ if (desc->vm_flags & VM_SHARED)
+ return shmem_zero_setup_desc(desc);
+ /*
+ * This is a highly unique situation where we mark a MAP_PRIVATE mapping
+ * of /dev/zero anonymous, despite it not being.
+ */
+ desc->vm_ops = NULL;
return 0;
}
@@ -526,10 +534,11 @@ static unsigned long get_unmapped_area_zero(struct file *file,
{
if (flags & MAP_SHARED) {
/*
- * mmap_zero() will call shmem_zero_setup() to create a file,
- * so use shmem's get_unmapped_area in case it can be huge;
- * and pass NULL for file as in mmap.c's get_unmapped_area(),
- * so as not to confuse shmem with our handle on "/dev/zero".
+ * mmap_prepare_zero() will call shmem_zero_setup_desc() to create
+ * a file, so use shmem's get_unmapped_area in case it can be
+ * huge; and pass NULL for file as in mmap.c's
+ * get_unmapped_area(), so as not to confuse shmem with our
+ * handle on "/dev/zero".
*/
return shmem_get_unmapped_area(NULL, addr, len, pgoff, flags);
}
@@ -632,7 +641,7 @@ static const struct file_operations __maybe_unused mem_fops = {
.llseek = memory_lseek,
.read = read_mem,
.write = write_mem,
- .mmap = mmap_mem,
+ .mmap_prepare = mmap_mem_prepare,
.open = open_mem,
#ifndef CONFIG_MMU
.get_unmapped_area = get_unmapped_area_mem,
@@ -668,7 +677,7 @@ static const struct file_operations zero_fops = {
.write_iter = write_iter_zero,
.splice_read = copy_splice_read,
.splice_write = splice_write_zero,
- .mmap = mmap_zero,
+ .mmap_prepare = mmap_prepare_zero,
.get_unmapped_area = get_unmapped_area_zero,
#ifndef CONFIG_MMU
.mmap_capabilities = zero_mmap_capabilities,
diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
index 0e47465ef0fd..5b368f9549d6 100644
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -94,7 +94,8 @@ extern struct file *shmem_kernel_file_setup(const char *name, loff_t size,
unsigned long flags);
extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
const char *name, loff_t size, unsigned long flags);
-extern int shmem_zero_setup(struct vm_area_struct *);
+int shmem_zero_setup(struct vm_area_struct *vma);
+int shmem_zero_setup_desc(struct vm_area_desc *desc);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
diff --git a/mm/shmem.c b/mm/shmem.c
index 990e33c6a776..cb6ff00eb4cb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5893,14 +5893,9 @@ struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, const char *name,
}
EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
-/**
- * shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap
- */
-int shmem_zero_setup(struct vm_area_struct *vma)
+static struct file *__shmem_zero_setup(unsigned long start, unsigned long end, vm_flags_t vm_flags)
{
- struct file *file;
- loff_t size = vma->vm_end - vma->vm_start;
+ loff_t size = end - start;
/*
* Cloning a new file under mmap_lock leads to a lock ordering conflict
@@ -5908,7 +5903,17 @@ int shmem_zero_setup(struct vm_area_struct *vma)
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
*/
- file = shmem_kernel_file_setup("dev/zero", size, vma->vm_flags);
+ return shmem_kernel_file_setup("dev/zero", size, vm_flags);
+}
+
+/**
+ * shmem_zero_setup - setup a shared anonymous mapping
+ * @vma: the vma to be mmapped is prepared by do_mmap
+ */
+int shmem_zero_setup(struct vm_area_struct *vma)
+{
+ struct file *file = __shmem_zero_setup(vma->vm_start, vma->vm_end, vma->vm_flags);
+
if (IS_ERR(file))
return PTR_ERR(file);
@@ -5920,6 +5925,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
return 0;
}
+/**
+ * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
+ * descriptor for convenience.
+ * @desc: Describes VMA
+ * Returns: 0 on success, or error
+ */
+int shmem_zero_setup_desc(struct vm_area_desc *desc)
+{
+ struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ desc->vm_file = file;
+ desc->vm_ops = &shmem_anon_vm_ops;
+
+ return 0;
+}
+
/**
* shmem_read_folio_gfp - read into page cache, using specified page allocation flags.
* @mapping: the folio's address_space
--
2.51.0
On Wed, 10 Sep 2025 21:22:06 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
> Update the mem char driver (backing /dev/mem and /dev/zero) to use
> f_op->mmap_prepare hook rather than the deprecated f_op->mmap.
>
> The /dev/zero implementation has a very unique and rather concerning
> characteristic in that it converts MAP_PRIVATE mmap() mappings anonymous
> when they are, in fact, not.
>
> The new f_op->mmap_prepare() can support this, but rather than introducing
> a helper function to perform this hack (and risk introducing other users),
> simply set desc->vm_op to NULL here and add a comment describing what's
> going on.
>
> We also introduce shmem_zero_setup_desc() to allow for the shared mapping
> case via an f_op->mmap_prepare() hook, and generalise the code between this
> and shmem_zero_setup().
>
> We also use the desc->action_error_hook to filter the remap error to
> -EAGAIN to keep behaviour consistent.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> ---
> drivers/char/mem.c | 75 ++++++++++++++++++++++------------------
> include/linux/shmem_fs.h | 3 +-
> mm/shmem.c | 40 ++++++++++++++++-----
> 3 files changed, 76 insertions(+), 42 deletions(-)
>
[ ... ]
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 990e33c6a776..cb6ff00eb4cb 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
[ ... ]
> @@ -5920,6 +5925,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
> return 0;
> }
>
> +/**
> + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
> + * descriptor for convenience.
> + * @desc: Describes VMA
> + * Returns: 0 on success, or error
> + */
> +int shmem_zero_setup_desc(struct vm_area_desc *desc)
> +{
> + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
> +
> + if (IS_ERR(file))
> + return PTR_ERR(file);
> +
> + desc->vm_file = file;
> + desc->vm_ops = &shmem_anon_vm_ops;
> +
> + return 0;
> +}
> +
Hi Lorenzo,
shmem_zero_setup() does an if (vma->vm_file) fput(vma->vm_file) dance.
It looks like we need one here too?
-chris
On Thu, Sep 18, 2025 at 12:11:05PM -0700, Chris Mason wrote:
> On Wed, 10 Sep 2025 21:22:06 +0100 Lorenzo Stoakes <lorenzo.stoakes@oracle.com> wrote:
>
> > Update the mem char driver (backing /dev/mem and /dev/zero) to use
> > f_op->mmap_prepare hook rather than the deprecated f_op->mmap.
> >
> > The /dev/zero implementation has a very unique and rather concerning
> > characteristic in that it converts MAP_PRIVATE mmap() mappings anonymous
> > when they are, in fact, not.
> >
> > The new f_op->mmap_prepare() can support this, but rather than introducing
> > a helper function to perform this hack (and risk introducing other users),
> > simply set desc->vm_op to NULL here and add a comment describing what's
> > going on.
> >
> > We also introduce shmem_zero_setup_desc() to allow for the shared mapping
> > case via an f_op->mmap_prepare() hook, and generalise the code between this
> > and shmem_zero_setup().
> >
> > We also use the desc->action_error_hook to filter the remap error to
> > -EAGAIN to keep behaviour consistent.
> >
> > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
> > ---
> > drivers/char/mem.c | 75 ++++++++++++++++++++++------------------
> > include/linux/shmem_fs.h | 3 +-
> > mm/shmem.c | 40 ++++++++++++++++-----
> > 3 files changed, 76 insertions(+), 42 deletions(-)
> >
>
> [ ... ]
>
> > diff --git a/mm/shmem.c b/mm/shmem.c
> > index 990e33c6a776..cb6ff00eb4cb 100644
> > --- a/mm/shmem.c
> > +++ b/mm/shmem.c
>
> [ ... ]
>
> > @@ -5920,6 +5925,25 @@ int shmem_zero_setup(struct vm_area_struct *vma)
> > return 0;
> > }
> >
> > +/**
> > + * shmem_zero_setup_desc - same as shmem_zero_setup, but determined by VMA
> > + * descriptor for convenience.
> > + * @desc: Describes VMA
> > + * Returns: 0 on success, or error
> > + */
> > +int shmem_zero_setup_desc(struct vm_area_desc *desc)
> > +{
> > + struct file *file = __shmem_zero_setup(desc->start, desc->end, desc->vm_flags);
> > +
> > + if (IS_ERR(file))
> > + return PTR_ERR(file);
> > +
> > + desc->vm_file = file;
> > + desc->vm_ops = &shmem_anon_vm_ops;
> > +
> > + return 0;
> > +}
> > +
>
> Hi Lorenzo,
>
> shmem_zero_setup() does a if (vma->vm_file) fput(vma->vm_file) dance.
>
> It looks like we need one here too?
No, we don't. mmap_prepare is intentionally designed to avoid this, because it
runs before the file pointer has been pinned like this.
The dance is necessary in mmap() but not in mmap_prepare(). Equally, you can just
assign VMA flags or any other field without any need for special helpers or
lock/refcount dances etc.
>
> -chris
Cheers, Lorenzo
© 2016 - 2026 Red Hat, Inc.