MAINTAINERS | 1 + include/uapi/linux/process_vm.h | 9 +++++++++ mm/process_vm_access.c | 20 +++++++++++++++----- 3 files changed, 25 insertions(+), 5 deletions(-) create mode 100644 include/uapi/linux/process_vm.h
From: Alban Crequy <albancrequy@microsoft.com>
- PROCESS_VM_PIDFD: refer to the remote process via PID file descriptor
instead of PID. Such a file descriptor can be obtained with
pidfd_open(2).
- PROCESS_VM_NOWAIT: do not block on IO if the memory access causes a
page fault.
If a given flag is unsupported, the syscall returns the error EINVAL
without checking the buffers. This gives a way to userspace to detect
whether the current kernel supports a specific flag:
process_vm_readv(pid, NULL, 1, NULL, 1, PROCESS_VM_PIDFD)
-> EINVAL if the kernel does not support the flag PROCESS_VM_PIDFD
(before this patch)
-> EFAULT if the kernel supports the flag (after this patch)
Signed-off-by: Alban Crequy <albancrequy@microsoft.com>
---
MAINTAINERS | 1 +
include/uapi/linux/process_vm.h | 9 +++++++++
mm/process_vm_access.c | 20 +++++++++++++++-----
3 files changed, 25 insertions(+), 5 deletions(-)
create mode 100644 include/uapi/linux/process_vm.h
diff --git a/MAINTAINERS b/MAINTAINERS
index e64b94e6b5a9..91b4647cf761 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16272,6 +16272,7 @@ F: include/linux/pgtable.h
F: include/linux/ptdump.h
F: include/linux/vmpressure.h
F: include/linux/vmstat.h
+F: include/uapi/linux/process_vm.h
F: kernel/fork.c
F: mm/Kconfig
F: mm/debug.c
diff --git a/include/uapi/linux/process_vm.h b/include/uapi/linux/process_vm.h
new file mode 100644
index 000000000000..4168e09f3f4e
--- /dev/null
+++ b/include/uapi/linux/process_vm.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_PROCESS_VM_H
+#define _UAPI_LINUX_PROCESS_VM_H
+
+/* Flags for process_vm_readv/process_vm_writev */
+#define PROCESS_VM_PIDFD (1UL << 0)
+#define PROCESS_VM_NOWAIT (1UL << 1)
+
+#endif /* _UAPI_LINUX_PROCESS_VM_H */
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 656d3e88755b..b5eac870ef24 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -14,6 +14,7 @@
#include <linux/ptrace.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
+#include <linux/process_vm.h>
/**
* process_vm_rw_pages - read/write pages from task specified
@@ -68,6 +69,7 @@ static int process_vm_rw_pages(struct page **pages,
* @mm: mm for task
* @task: task to read/write from
* @vm_write: 0 means copy from, 1 means copy to
+ * @pvm_flags: PROCESS_VM_* flags
* Returns 0 on success or on failure error code
*/
static int process_vm_rw_single_vec(unsigned long addr,
@@ -76,7 +78,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
struct page **process_pages,
struct mm_struct *mm,
struct task_struct *task,
- int vm_write)
+ int vm_write,
+ unsigned int pvm_flags)
{
unsigned long pa = addr & PAGE_MASK;
unsigned long start_offset = addr - pa;
@@ -91,6 +94,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
if (vm_write)
flags |= FOLL_WRITE;
+ if (pvm_flags & PROCESS_VM_NOWAIT)
+ flags |= FOLL_NOWAIT;
while (!rc && nr_pages && iov_iter_count(iter)) {
int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES);
@@ -141,7 +146,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
* @iter: where to copy to/from locally
* @rvec: iovec array specifying where to copy to/from in the other process
* @riovcnt: size of rvec array
- * @flags: currently unused
+ * @flags: process_vm_readv/writev flags
* @vm_write: 0 if reading from other process, 1 if writing to other process
*
* Returns the number of bytes read/written or error code. May
@@ -163,6 +168,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
unsigned long nr_pages_iov;
ssize_t iov_len;
size_t total_len = iov_iter_count(iter);
+ unsigned int f_flags;
/*
* Work out how many pages of struct pages we're going to need
@@ -194,7 +200,11 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
}
/* Get process information */
- task = find_get_task_by_vpid(pid);
+ if (flags & PROCESS_VM_PIDFD)
+ task = pidfd_get_task(pid, &f_flags);
+ else
+ task = find_get_task_by_vpid(pid);
+
if (!task) {
rc = -ESRCH;
goto free_proc_pages;
@@ -215,7 +225,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
rc = process_vm_rw_single_vec(
(unsigned long)rvec[i].iov_base, rvec[i].iov_len,
- iter, process_pages, mm, task, vm_write);
+ iter, process_pages, mm, task, vm_write, flags);
/* copied = space before - space after */
total_len -= iov_iter_count(iter);
@@ -266,7 +276,7 @@ static ssize_t process_vm_rw(pid_t pid,
ssize_t rc;
int dir = vm_write ? ITER_SOURCE : ITER_DEST;
- if (flags != 0)
+ if (flags & ~(PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD))
return -EINVAL;
/* Check iovecs */
--
2.45.0
On Tue, Nov 18, 2025 at 02:23:48PM +0100, Alban Crequy wrote:
> From: Alban Crequy <albancrequy@microsoft.com>
>
> - PROCESS_VM_PIDFD: refer to the remote process via PID file descriptor
> instead of PID. Such a file descriptor can be obtained with
> pidfd_open(2).
>
> - PROCESS_VM_NOWAIT: do not block on IO if the memory access causes a
> page fault.
>
> If a given flag is unsupported, the syscall returns the error EINVAL
> without checking the buffers. This gives a way to userspace to detect
> whether the current kernel supports a specific flag:
>
> process_vm_readv(pid, NULL, 1, NULL, 1, PROCESS_VM_PIDFD)
> -> EINVAL if the kernel does not support the flag PROCESS_VM_PIDFD
> (before this patch)
> -> EFAULT if the kernel supports the flag (after this patch)
>
> Signed-off-by: Alban Crequy <albancrequy@microsoft.com>
> ---
Looks good to me overall,
Reviewed-by: Christian Brauner <brauner@kernel.org>
I do want to note an inconsistency though we should address - unrelated
to this patchset.
pidfd_get_task() supports PIDFD_SELF_THREAD and PIDFD_SELF_THREAD_GROUP.
This means it allows to operate on an individual thread and not the
thread-group leader.
This is highly relevant for permission checking as stuff like
ptrace_may_access() (via mm_access()) takes the creds of the task into
account and individual threads may have different creds than the
thread-group leader.
So suddently process_madvise(), process_mrelease(), and with this patch
now also process_vm_readv() can operate on individual threads...
except that's not completely true as they only allow to operate on the
individual thread of the caller. Arbitrary non-constant pidfds will
enforce that it's a thread-group leader that was specified and otherwise
return NULL...
We should make this consistent:
(1) Enforce thread-group leader based operations only
(2) Support threads for arbitrary pidfds
Someone needs to tell me what is wanted or needed and then I can fix
pidfd_get_task() accordingly.
The current state is not actively harmful afaict but it's inconsistent.
> MAINTAINERS | 1 +
> include/uapi/linux/process_vm.h | 9 +++++++++
> mm/process_vm_access.c | 20 +++++++++++++++-----
> 3 files changed, 25 insertions(+), 5 deletions(-)
> create mode 100644 include/uapi/linux/process_vm.h
>
> diff --git a/MAINTAINERS b/MAINTAINERS
> index e64b94e6b5a9..91b4647cf761 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -16272,6 +16272,7 @@ F: include/linux/pgtable.h
> F: include/linux/ptdump.h
> F: include/linux/vmpressure.h
> F: include/linux/vmstat.h
> +F: include/uapi/linux/process_vm.h
> F: kernel/fork.c
> F: mm/Kconfig
> F: mm/debug.c
> diff --git a/include/uapi/linux/process_vm.h b/include/uapi/linux/process_vm.h
> new file mode 100644
> index 000000000000..4168e09f3f4e
> --- /dev/null
> +++ b/include/uapi/linux/process_vm.h
> @@ -0,0 +1,9 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_PROCESS_VM_H
> +#define _UAPI_LINUX_PROCESS_VM_H
> +
> +/* Flags for process_vm_readv/process_vm_writev */
> +#define PROCESS_VM_PIDFD (1UL << 0)
> +#define PROCESS_VM_NOWAIT (1UL << 1)
> +
> +#endif /* _UAPI_LINUX_PROCESS_VM_H */
> diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
> index 656d3e88755b..b5eac870ef24 100644
> --- a/mm/process_vm_access.c
> +++ b/mm/process_vm_access.c
> @@ -14,6 +14,7 @@
> #include <linux/ptrace.h>
> #include <linux/slab.h>
> #include <linux/syscalls.h>
> +#include <linux/process_vm.h>
>
> /**
> * process_vm_rw_pages - read/write pages from task specified
> @@ -68,6 +69,7 @@ static int process_vm_rw_pages(struct page **pages,
> * @mm: mm for task
> * @task: task to read/write from
> * @vm_write: 0 means copy from, 1 means copy to
> + * @pvm_flags: PROCESS_VM_* flags
> * Returns 0 on success or on failure error code
> */
> static int process_vm_rw_single_vec(unsigned long addr,
> @@ -76,7 +78,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
> struct page **process_pages,
> struct mm_struct *mm,
> struct task_struct *task,
> - int vm_write)
> + int vm_write,
> + unsigned int pvm_flags)
> {
> unsigned long pa = addr & PAGE_MASK;
> unsigned long start_offset = addr - pa;
> @@ -91,6 +94,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
>
> if (vm_write)
> flags |= FOLL_WRITE;
> + if (pvm_flags & PROCESS_VM_NOWAIT)
> + flags |= FOLL_NOWAIT;
>
> while (!rc && nr_pages && iov_iter_count(iter)) {
> int pinned_pages = min_t(unsigned long, nr_pages, PVM_MAX_USER_PAGES);
> @@ -141,7 +146,7 @@ static int process_vm_rw_single_vec(unsigned long addr,
> * @iter: where to copy to/from locally
> * @rvec: iovec array specifying where to copy to/from in the other process
> * @riovcnt: size of rvec array
> - * @flags: currently unused
> + * @flags: process_vm_readv/writev flags
> * @vm_write: 0 if reading from other process, 1 if writing to other process
> *
> * Returns the number of bytes read/written or error code. May
> @@ -163,6 +168,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
> unsigned long nr_pages_iov;
> ssize_t iov_len;
> size_t total_len = iov_iter_count(iter);
> + unsigned int f_flags;
>
> /*
> * Work out how many pages of struct pages we're going to need
> @@ -194,7 +200,11 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
> }
>
> /* Get process information */
> - task = find_get_task_by_vpid(pid);
> + if (flags & PROCESS_VM_PIDFD)
> + task = pidfd_get_task(pid, &f_flags);
> + else
> + task = find_get_task_by_vpid(pid);
> +
> if (!task) {
> rc = -ESRCH;
> goto free_proc_pages;
> @@ -215,7 +225,7 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter,
> for (i = 0; i < riovcnt && iov_iter_count(iter) && !rc; i++)
> rc = process_vm_rw_single_vec(
> (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
> - iter, process_pages, mm, task, vm_write);
> + iter, process_pages, mm, task, vm_write, flags);
>
> /* copied = space before - space after */
> total_len -= iov_iter_count(iter);
> @@ -266,7 +276,7 @@ static ssize_t process_vm_rw(pid_t pid,
> ssize_t rc;
> int dir = vm_write ? ITER_SOURCE : ITER_DEST;
>
> - if (flags != 0)
> + if (flags & ~(PROCESS_VM_NOWAIT | PROCESS_VM_PIDFD))
> return -EINVAL;
>
> /* Check iovecs */
> --
> 2.45.0
>
© 2016 - 2025 Red Hat, Inc.