mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-)
The process_madvise() call was introduced in commit ecb8ac8b1f14
("mm/madvise: introduce process_madvise() syscall: an external memory
hinting API") as a means of performing madvise() operations on another
process.
However, as it provides the means by which to perform multiple madvise()
operations in a batch via an iovec, it is useful to utilise the same
interface for performing operations on the current process rather than a
remote one.
Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check
if same mm") removed the need for a caller invoking process_madvise() on
its own pidfd to possess the CAP_SYS_NICE capability; however, this leaves
in place the restrictions on which operations may be performed.
Resolve this by only applying the restriction on operations when accessing
a remote process.
Moving forward we plan to implement a simpler means of specifying this
condition than needing to establish a self pidfd, perhaps in the form
of a sentinel pidfd.
Also take the opportunity to refactor the system call implementation,
abstracting the vectorised operation into its own function.
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
v3:
* Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later.
v2:
* Fix silly mistake referencing unassigned mm variable.
* Add PR_MADV_SELF to architecture-specific mman headers.
https://lore.kernel.org/linux-mm/9cf0e96d-287f-4749-ba85-8414c06aa54c@lucifer.local/
v1:
https://lore.kernel.org/all/cover.1727106751.git.lorenzo.stoakes@oracle.com/
mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 36 insertions(+), 19 deletions(-)
diff --git a/mm/madvise.c b/mm/madvise.c
index 50d223ab3894..e871a72a6c32 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior)
}
}
-static bool process_madvise_behavior_valid(int behavior)
+/* Can we invoke process_madvise() on a remote mm for the specified behavior? */
+static bool process_madvise_remote_valid(int behavior)
{
switch (behavior) {
case MADV_COLD:
@@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
return do_madvise(current->mm, start, len_in, behavior);
}
+/* Perform an madvise operation over a vector of addresses and lengths. */
+static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
+ int behavior)
+{
+ ssize_t ret = 0;
+ size_t total_len;
+
+ total_len = iov_iter_count(iter);
+
+ while (iov_iter_count(iter)) {
+ ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter),
+ iter_iov_len(iter), behavior);
+ if (ret < 0)
+ break;
+ iov_iter_advance(iter, iter_iov_len(iter));
+ }
+
+ ret = (total_len - iov_iter_count(iter)) ? : ret;
+
+ return ret;
+}
+
SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
size_t, vlen, int, behavior, unsigned int, flags)
{
@@ -1486,7 +1509,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iov_iter iter;
struct task_struct *task;
struct mm_struct *mm;
- size_t total_len;
unsigned int f_flags;
if (flags != 0) {
@@ -1504,11 +1526,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto free_iov;
}
- if (!process_madvise_behavior_valid(behavior)) {
- ret = -EINVAL;
- goto release_task;
- }
-
/* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */
mm = mm_access(task, PTRACE_MODE_READ_FSCREDS);
if (IS_ERR(mm)) {
@@ -1516,26 +1533,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto release_task;
}
+ /*
+ * We need only perform this check if we are attempting to manipulate a
+ * remote process's address space.
+ */
+ if (mm != current->mm && !process_madvise_remote_valid(behavior)) {
+ ret = -EINVAL;
+ goto release_mm;
+ }
+
/*
* Require CAP_SYS_NICE for influencing process performance. Note that
- * only non-destructive hints are currently supported.
+ * only non-destructive hints are currently supported for remote
+ * processes.
*/
if (mm != current->mm && !capable(CAP_SYS_NICE)) {
ret = -EPERM;
goto release_mm;
}
- total_len = iov_iter_count(&iter);
-
- while (iov_iter_count(&iter)) {
- ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
- iter_iov_len(&iter), behavior);
- if (ret < 0)
- break;
- iov_iter_advance(&iter, iter_iov_len(&iter));
- }
-
- ret = (total_len - iov_iter_count(&iter)) ? : ret;
+ ret = vector_madvise(mm, &iter, behavior);
release_mm:
mmput(mm);
--
2.46.0
On 9/26/24 17:10, Lorenzo Stoakes wrote:
> The process_madvise() call was introduced in commit ecb8ac8b1f14
> ("mm/madvise: introduce process_madvise() syscall: an external memory
> hinting API") as a means of performing madvise() operations on another
> process.
>
> However, as it provides the means by which to perform multiple madvise()
> operations in a batch via an iovec, it is useful to utilise the same
> interface for performing operations on the current process rather than a
> remote one.
>
> Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check
> if same mm") removed the need for a caller invoking process_madvise() on
> its own pidfd to possess the CAP_SYS_NICE capability, however this leaves
> the restrictions on operation in place.
>
> Resolve this by only applying the restriction on operations when accessing
> a remote process.
>
> Moving forward we plan to implement a simpler means of specifying this
> condition other than needing to establish a self pidfd, perhaps in the form
> of a sentinel pidfd.
>
> Also take the opportunity to refactor the system call implementation
> abstracting the vectorised operation.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Acked-by: Vlastimil Babka <vbabka@suse.cz>

Looks like the destructive modes should work with the vectorized version
too, and with how it returns a partial success.

We'll need a man page update though, right?
On Fri, Sep 27, 2024 at 10:12:33AM GMT, Vlastimil Babka wrote: > On 9/26/24 17:10, Lorenzo Stoakes wrote: > > The process_madvise() call was introduced in commit ecb8ac8b1f14 > > ("mm/madvise: introduce process_madvise() syscall: an external memory > > hinting API") as a means of performing madvise() operations on another > > process. > > > > However, as it provides the means by which to perform multiple madvise() > > operations in a batch via an iovec, it is useful to utilise the same > > interface for performing operations on the current process rather than a > > remote one. > > > > Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check > > if same mm") removed the need for a caller invoking process_madvise() on > > its own pidfd to possess the CAP_SYS_NICE capability, however this leaves > > the restrictions on operation in place. > > > > Resolve this by only applying the restriction on operations when accessing > > a remote process. > > > > Moving forward we plan to implement a simpler means of specifying this > > condition other than needing to establish a self pidfd, perhaps in the form > > of a sentinel pidfd. > > > > Also take the opportunity to refactor the system call implementation > > abstracting the vectorised operation. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Acked-by: Vlastimil Babka <vbabka@suse.cz> > > Looks like the destructive modes should work with the vectorized version > too, and with how it returns a partial success. > Right yeah, intent is to allow those modes when referring to the current process. Partial success logic across ranges mirrors how we generally handle partial success in destructive operations so should all be good. > We'll need a man page update though, right? > Ack, I intend to send a patch for this separately.
On Thu, Sep 26, 2024 at 04:10:19PM GMT, Lorenzo Stoakes wrote:
> The process_madvise() call was introduced in commit ecb8ac8b1f14
> ("mm/madvise: introduce process_madvise() syscall: an external memory
> hinting API") as a means of performing madvise() operations on another
> process.
>
> However, as it provides the means by which to perform multiple madvise()
> operations in a batch via an iovec, it is useful to utilise the same
> interface for performing operations on the current process rather than a
> remote one.
>
> Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check
> if same mm") removed the need for a caller invoking process_madvise() on
> its own pidfd to possess the CAP_SYS_NICE capability, however this leaves
> the restrictions on operation in place.
>
> Resolve this by only applying the restriction on operations when accessing
> a remote process.
>
> Moving forward we plan to implement a simpler means of specifying this
> condition other than needing to establish a self pidfd, perhaps in the form
> of a sentinel pidfd.
>
> Also take the opportunity to refactor the system call implementation
> abstracting the vectorised operation.
>
> Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>

Acked-by: Shakeel Butt <shakeel.butt@linux.dev>

> ---
> v3:
> * Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later.

Seems like a good plan to decouple this patch from PR_MADV_SELF vs
PIDFD_SELF decision. I am hoping to see the follow up patch as well.

thanks,
Shakeel
On Thu, Sep 26, 2024 at 08:52:32AM GMT, Shakeel Butt wrote: > On Thu, Sep 26, 2024 at 04:10:19PM GMT, Lorenzo Stoakes wrote: > > The process_madvise() call was introduced in commit ecb8ac8b1f14 > > ("mm/madvise: introduce process_madvise() syscall: an external memory > > hinting API") as a means of performing madvise() operations on another > > process. > > > > However, as it provides the means by which to perform multiple madvise() > > operations in a batch via an iovec, it is useful to utilise the same > > interface for performing operations on the current process rather than a > > remote one. > > > > Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check > > if same mm") removed the need for a caller invoking process_madvise() on > > its own pidfd to possess the CAP_SYS_NICE capability, however this leaves > > the restrictions on operation in place. > > > > Resolve this by only applying the restriction on operations when accessing > > a remote process. > > > > Moving forward we plan to implement a simpler means of specifying this > > condition other than needing to establish a self pidfd, perhaps in the form > > of a sentinel pidfd. > > > > Also take the opportunity to refactor the system call implementation > > abstracting the vectorised operation. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev> > > > --- > > v3: > > * Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later. > > Seems like a good plan to decouple this patch from PR_MADV_SELF vs > PIDFD_SELF decision. I am hoping to see the follow up patch as well. PIDFD_SELF should absolutely not be a per-system call thing. It should be generic across all pidfd based system calls similar to AT_FDCWD. IOW, that should be in: include/uapi/linux/pidfd.h #define PIDFD_SELF -200
On Fri, Sep 27, 2024 at 10:04:25AM GMT, Christian Brauner wrote: > On Thu, Sep 26, 2024 at 08:52:32AM GMT, Shakeel Butt wrote: > > On Thu, Sep 26, 2024 at 04:10:19PM GMT, Lorenzo Stoakes wrote: > > > The process_madvise() call was introduced in commit ecb8ac8b1f14 > > > ("mm/madvise: introduce process_madvise() syscall: an external memory > > > hinting API") as a means of performing madvise() operations on another > > > process. > > > > > > However, as it provides the means by which to perform multiple madvise() > > > operations in a batch via an iovec, it is useful to utilise the same > > > interface for performing operations on the current process rather than a > > > remote one. > > > > > > Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check > > > if same mm") removed the need for a caller invoking process_madvise() on > > > its own pidfd to possess the CAP_SYS_NICE capability, however this leaves > > > the restrictions on operation in place. > > > > > > Resolve this by only applying the restriction on operations when accessing > > > a remote process. > > > > > > Moving forward we plan to implement a simpler means of specifying this > > > condition other than needing to establish a self pidfd, perhaps in the form > > > of a sentinel pidfd. > > > > > > Also take the opportunity to refactor the system call implementation > > > abstracting the vectorised operation. > > > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev> > > > > > --- > > > v3: > > > * Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later. > > > > Seems like a good plan to decouple this patch from PR_MADV_SELF vs > > PIDFD_SELF decision. I am hoping to see the follow up patch as well. > > PIDFD_SELF should absolutely not be a per-system call thing. It should > be generic across all pidfd based system calls similar to AT_FDCWD. 
>
> IOW, that should be in:
>
> include/uapi/linux/pidfd.h
>
> #define PIDFD_SELF -200

Yes this is what I was saying elsewhere in the thread :) this is why it's
important to have this as a separate enterprise.

And indeed this is the intent, I will be working on a separate patch series
to this effect.

It also gives us the space to implement it in calls which use pidfd where it
makes sense and to extend testing accordingly.
On Thu, Sep 26, 2024 at 08:52:32AM GMT, Shakeel Butt wrote: > On Thu, Sep 26, 2024 at 04:10:19PM GMT, Lorenzo Stoakes wrote: > > The process_madvise() call was introduced in commit ecb8ac8b1f14 > > ("mm/madvise: introduce process_madvise() syscall: an external memory > > hinting API") as a means of performing madvise() operations on another > > process. > > > > However, as it provides the means by which to perform multiple madvise() > > operations in a batch via an iovec, it is useful to utilise the same > > interface for performing operations on the current process rather than a > > remote one. > > > > Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check > > if same mm") removed the need for a caller invoking process_madvise() on > > its own pidfd to possess the CAP_SYS_NICE capability, however this leaves > > the restrictions on operation in place. > > > > Resolve this by only applying the restriction on operations when accessing > > a remote process. > > > > Moving forward we plan to implement a simpler means of specifying this > > condition other than needing to establish a self pidfd, perhaps in the form > > of a sentinel pidfd. > > > > Also take the opportunity to refactor the system call implementation > > abstracting the vectorised operation. > > > > Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Acked-by: Shakeel Butt <shakeel.butt@linux.dev> Thanks! > > > --- > > v3: > > * Avoid introducing PR_MADV_SELF and defer a non-pidfd version until later. > > Seems like a good plan to decouple this patch from PR_MADV_SELF vs > PIDFD_SELF decision. I am hoping to see the follow up patch as well. > > thanks, > Shakeel > Yes agreed, this gets an important part of the change in, and gives us room to take our time on that side of things. Plan right now is I will work on a sentinel solution in parallel to other stuff and see how that goes, if it integrates well can bring it in in a separate patch series. Cheers, Lorenzo
© 2016 - 2024 Red Hat, Inc.