Implement ptrace getter/setter interface for syscall user dispatch.
These prctl settings are presently write-only, making it impossible to
implement transparent checkpoint/restore via software like CRIU.
'on_dispatch' field is not exposed because it is a kernel-internal
only field that cannot be 'true' when returning to userland.
Signed-off-by: Gregory Price <gregory.price@memverge.com>
---
.../admin-guide/syscall-user-dispatch.rst | 4 ++
include/linux/syscall_user_dispatch.h | 18 ++++++++
include/uapi/linux/ptrace.h | 29 +++++++++++++
kernel/entry/syscall_user_dispatch.c | 42 +++++++++++++++++++
kernel/ptrace.c | 9 ++++
5 files changed, 102 insertions(+)
diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst
index 60314953c728..f7648c08297e 100644
--- a/Documentation/admin-guide/syscall-user-dispatch.rst
+++ b/Documentation/admin-guide/syscall-user-dispatch.rst
@@ -73,6 +73,10 @@ thread-wide, without the need to invoke the kernel directly. selector
can be set to SYSCALL_DISPATCH_FILTER_ALLOW or SYSCALL_DISPATCH_FILTER_BLOCK.
Any other value should terminate the program with a SIGSYS.
+Additionally, a task's syscall user dispatch configuration can be peeked
+and poked via the PTRACE_(GET|SET)_SYSCALL_USER_DISPATCH_CONFIG ptrace
+requests. This is useful for checkpoint/restart software.
+
Security Notes
--------------
diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h
index a0ae443fb7df..641ca8880995 100644
--- a/include/linux/syscall_user_dispatch.h
+++ b/include/linux/syscall_user_dispatch.h
@@ -22,6 +22,12 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
#define clear_syscall_work_syscall_user_dispatch(tsk) \
clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH)
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+ void __user *data);
+
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+ void __user *data);
+
#else
struct syscall_user_dispatch {};
@@ -35,6 +41,18 @@ static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *
{
}
+static inline int syscall_user_dispatch_get_config(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ return -EINVAL; /* stub used when CONFIG_GENERIC_ENTRY is off: request is always rejected */
+}
+
+static inline int syscall_user_dispatch_set_config(struct task_struct *task,
+ unsigned long size, void __user *data)
+{
+ return -EINVAL; /* stub used when CONFIG_GENERIC_ENTRY is off: request is always rejected */
+}
+
#endif /* CONFIG_GENERIC_ENTRY */
#endif /* _SYSCALL_USER_DISPATCH_H */
diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h
index 195ae64a8c87..1e77b02344c3 100644
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -112,6 +112,35 @@ struct ptrace_rseq_configuration {
__u32 pad;
};
+#define PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG 0x4210
+#define PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG 0x4211
+
+/*
+ * struct ptrace_sud_config - Per-task configuration for SUD
+ * @mode: One of PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF
+ * @selector: Tracee's user virtual address of SUD selector
+ * @offset: SUD exclusion area (virtual address)
+ * @len: Length of SUD exclusion area
+ *
+ * Used to get/set the syscall user dispatch configuration for a tracee
+ * process. Selector is optional (may be NULL), and if invalid will produce
+ * a SIGSEGV in the tracee upon first access.
+ *
+ * If mode is PR_SYS_DISPATCH_ON, syscall dispatch will be enabled. If
+ * PR_SYS_DISPATCH_OFF, syscall dispatch will be disabled and all other
+ * parameters must be 0. The value in *selector (if not NULL) also determines
+ * whether syscall dispatch will occur.
+ *
+ * The SUD exclusion area described by offset/len is the virtual address space
+ * from which syscalls will not produce a user dispatch.
+ */
+struct ptrace_sud_config {
+ __u64 mode;
+ __u64 selector;
+ __u64 offset;
+ __u64 len;
+};
+
/*
* These values are stored in task->ptrace_message
* by ptrace_stop to describe the current syscall-stop.
diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c
index 22396b234854..95b7218be71d 100644
--- a/kernel/entry/syscall_user_dispatch.c
+++ b/kernel/entry/syscall_user_dispatch.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
#include <linux/prctl.h>
+#include <linux/ptrace.h>
#include <linux/syscall_user_dispatch.h>
#include <linux/uaccess.h>
#include <linux/signal.h>
@@ -113,3 +114,44 @@ int set_syscall_user_dispatch(unsigned long mode, unsigned long offset,
{
return task_set_syscall_user_dispatch(current, mode, offset, len, selector);
}
+
+/*
+ * Copy @task's SUD settings to the tracer buffer @data; mode is taken from the
+ * SYSCALL_USER_DISPATCH work flag. Returns 0, or -EINVAL (bad @size) / -EFAULT.
+ */
+int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size,
+				     void __user *data)
+{
+	struct syscall_user_dispatch *sd = &task->syscall_dispatch;
+	struct ptrace_sud_config cfg;
+
+	if (size != sizeof(cfg))
+		return -EINVAL;
+
+	cfg.mode = test_task_syscall_work(task, SYSCALL_USER_DISPATCH) ?
+		   PR_SYS_DISPATCH_ON : PR_SYS_DISPATCH_OFF;
+	cfg.offset = sd->offset;
+	cfg.len = sd->len;
+	cfg.selector = (__u64)(uintptr_t)sd->selector;
+
+	if (copy_to_user(data, &cfg, sizeof(cfg)))
+		return -EFAULT;
+	return 0;
+}
+
+/*
+ * Apply the tracer-supplied SUD settings at @data to @task. Fails with -EINVAL
+ * on a @size mismatch or -EFAULT on a bad copy before touching the task.
+ */
+int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size,
+				     void __user *data)
+{
+	struct ptrace_sud_config cfg;
+
+	if (size != sizeof(cfg))
+		return -EINVAL;
+	if (copy_from_user(&cfg, data, sizeof(cfg)))
+		return -EFAULT;
+	return task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len,
+					      (char __user *)(uintptr_t)cfg.selector);
+}
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0786450074c1..443057bee87c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -32,6 +32,7 @@
#include <linux/compat.h>
#include <linux/sched/signal.h>
#include <linux/minmax.h>
+#include <linux/syscall_user_dispatch.h>
#include <asm/syscall.h> /* for syscall_get_* */
@@ -1259,6 +1260,14 @@ int ptrace_request(struct task_struct *child, long request,
break;
#endif
+ case PTRACE_SET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_set_config(child, addr, datavp);
+ break;
+
+ case PTRACE_GET_SYSCALL_USER_DISPATCH_CONFIG:
+ ret = syscall_user_dispatch_get_config(child, addr, datavp);
+ break;
+
default:
break;
}
--
2.39.1
Gregory, I can't resist, I have a couple of cosmetic nits. On 02/24, Gregory Price wrote: > > +int syscall_user_dispatch_get_config(struct task_struct *task, unsigned long size, > + void __user *data) > +{ > + struct syscall_user_dispatch *sd = &task->syscall_dispatch; > + struct ptrace_sud_config config; > + > + if (size != sizeof(struct ptrace_sud_config)) > + return -EINVAL; > + > + if (test_task_syscall_work(task, SYSCALL_USER_DISPATCH)) > + config.mode = PR_SYS_DISPATCH_ON; > + else > + config.mode = PR_SYS_DISPATCH_OFF; > + > + config.offset = sd->offset; > + config.len = sd->len; > + config.selector = (__u64)(uintptr_t)sd->selector; > + > + if (copy_to_user(data, &config, sizeof(config))) Let me repeat, do not mix sizeof(struct ptrace_sud_config) and sizeof(config). Perhaps this is just me, but this looks confusing to me. Please use sizeof(config) both times. > +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, > + void __user *data) > +{ > + int rc; > + struct ptrace_sud_config cfg; ^^^ Again, this is cosmetic but a bit annoying. Please use either "config" or "cfg" in both functions to make the naming more consistent. > + rc = task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, > + cfg.len, (char __user *)(uintptr_t)cfg.selector); rc = task_set_syscall_user_dispatch(task, cfg.mode, cfg.offset, cfg.len, (char __user *)(uintptr_t)cfg.selector); looks a bit better to me. Oleg.
Hi Gregory, On Fri, 24 Feb 2023 at 23:40, Gregory Price <gourry.memverge@gmail.com> wrote: > > Implement ptrace getter/setter interface for syscall user dispatch. > > These prctl settings are presently write-only, making it impossible to > implement transparent checkpoint/restore via software like CRIU. > > 'on_dispatch' field is not exposed because it is a kernel-internal > only field that cannot be 'true' when returning to userland. > > Signed-off-by: Gregory Price <gregory.price@memverge.com> > --- [..] > +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, > + void __user *data) > +{ > + int rc; > + struct ptrace_sud_config cfg; > + > + if (size != sizeof(struct ptrace_sud_config)) > + return -EINVAL; > + > + if (copy_from_user(&cfg, data, sizeof(struct ptrace_sud_config))) > + return -EFAULT; It seems that the tool you want here would be copy_struct_from_user(), which is designed for extendable syscalls. Thanks, Dmitry
On 02/27, Dmitry Safonov wrote: > > > +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, > > + void __user *data) > > +{ > > + int rc; > > + struct ptrace_sud_config cfg; > > + > > + if (size != sizeof(struct ptrace_sud_config)) > > + return -EINVAL; > > + > > + if (copy_from_user(&cfg, data, sizeof(struct ptrace_sud_config))) > > + return -EFAULT; > > It seems that the tool you want here would be copy_struct_from_user(), > which is designed for extendable syscalls. Hmm. Why? In this case ksize == usize, so why do we need copy_struct_from_user ? Oleg.
On 2/28/23 16:52, Oleg Nesterov wrote: > On 02/27, Dmitry Safonov wrote: >> >>> +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, >>> + void __user *data) >>> +{ >>> + int rc; >>> + struct ptrace_sud_config cfg; >>> + >>> + if (size != sizeof(struct ptrace_sud_config)) >>> + return -EINVAL; >>> + >>> + if (copy_from_user(&cfg, data, sizeof(struct ptrace_sud_config))) >>> + return -EFAULT; >> >> It seems that the tool you want here would be copy_struct_from_user(), >> which is designed for extendable syscalls. > > Hmm. Why? > > In this case ksize == usize, so why do we need copy_struct_from_user ? In case the structure extends in future, that will let newer userspace run on an older kernel (as long as it doesn't use [set] any new fields). With regular sizeof(struct ptrace_sud_config) instead of adding size-related defines. It was Christian's idea how-to add/design new syscalls in an "extensible" manner. Here are his LPC slides: https://lpc.events/event/7/contributions/657/attachments/639/1159/extensible_syscalls.pdf [7/18 slide on checks] And an LWN article: https://lwn.net/Articles/830666/ Thanks, Dmitry
On 02/28, Dmitry Safonov wrote: > > On 2/28/23 16:52, Oleg Nesterov wrote: > > On 02/27, Dmitry Safonov wrote: > >> > >>> +int syscall_user_dispatch_set_config(struct task_struct *task, unsigned long size, > >>> + void __user *data) > >>> +{ > >>> + int rc; > >>> + struct ptrace_sud_config cfg; > >>> + > >>> + if (size != sizeof(struct ptrace_sud_config)) > >>> + return -EINVAL; > >>> + > >>> + if (copy_from_user(&cfg, data, sizeof(struct ptrace_sud_config))) > >>> + return -EFAULT; > >> > >> It seems that the tool you want here would be copy_struct_from_user(), > >> which is designed for extendable syscalls. > > > > Hmm. Why? > > > > In this case ksize == usize, so why do we need copy_struct_from_user ? > > In case the structure extends in future, that will let newer userspace > run on an older kernel (as long as it doesn't use [set] any new fields). Sure, I understand that, but I don't think it's worth the trouble in this case. If (unlikely, I think) this structure ever extends we can switch to copy_struct_from_user() or do something else if check_zeroed_user() makes no real sense for the new fields. Right now I think it is more important to ensure that the new users of this API use the correct size. Oleg.
© 2016 - 2025 Red Hat, Inc.