seccomp_unotify allows more privileged processes to perform actions on
behalf of less privileged processes.
In many cases, the workflow is fully synchronous. It means a target
process triggers a system call and passes control to a supervisor
process that handles the system call and returns control to the target
process. In this context, "synchronous" means that only one process is
running while the other one is waiting.
There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
move the wakee to the current CPU. For such synchronous workflows, it
makes context switches a few times faster.
Right now, each interaction takes 12µs. With this patch, it takes about
3µs.
This change introduces the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag, which
is used to enable the sync mode.
Signed-off-by: Andrei Vagin <avagin@gmail.com>
---
include/uapi/linux/seccomp.h | 4 ++++
kernel/seccomp.c | 35 +++++++++++++++++++++++++++++++++--
2 files changed, 37 insertions(+), 2 deletions(-)
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index 0fdc6ef02b94..dbfc9b37fcae 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -115,6 +115,8 @@ struct seccomp_notif_resp {
__u32 flags;
};
+#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
+
/* valid flags for seccomp_notif_addfd */
#define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
#define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
@@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
#define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
struct seccomp_notif_addfd)
+#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
+
#endif /* _UAPI_LINUX_SECCOMP_H */
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 667fd2d89464..c24900eb8ced 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -143,11 +143,14 @@ struct seccomp_kaddfd {
* filter->notify_lock.
* @next_id: The id of the next request.
* @notifications: A list of struct seccomp_knotif elements.
+ * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
*/
+
struct notification {
atomic_t requests;
u64 next_id;
struct list_head notifications;
+ int flags;
};
#ifdef SECCOMP_ARCH_NATIVE
@@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
INIT_LIST_HEAD(&n.addfd);
atomic_add(1, &match->notif->requests);
- wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
+ else
+ wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
/*
* This is where we wait for a reply from userspace.
@@ -1574,7 +1580,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
knotif->error = resp.error;
knotif->val = resp.val;
knotif->flags = resp.flags;
- complete(&knotif->ready);
+ if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ complete_on_current_cpu(&knotif->ready);
+ else
+ complete(&knotif->ready);
out:
mutex_unlock(&filter->notify_lock);
return ret;
@@ -1604,6 +1613,26 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
return ret;
}
+static long seccomp_notify_set_flags(struct seccomp_filter *filter,
+ void __user *buf)
+{
+ u64 flags;
+ long ret;
+
+ if (copy_from_user(&flags, buf, sizeof(flags)))
+ return -EFAULT;
+
+ if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&filter->notify_lock);
+ if (ret < 0)
+ return ret;
+ filter->notif->flags = flags;
+ mutex_unlock(&filter->notify_lock);
+ return ret;
+}
+
static long seccomp_notify_addfd(struct seccomp_filter *filter,
struct seccomp_notif_addfd __user *uaddfd,
unsigned int size)
@@ -1733,6 +1762,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
case SECCOMP_IOCTL_NOTIF_ID_VALID:
return seccomp_notify_id_valid(filter, buf);
+ case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
+ return seccomp_notify_set_flags(filter, buf);
}
/* Extensible Argument ioctls */
--
2.37.2
On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> seccomp_unotify allows more privileged processes does actions on behalf
> of less privileged processes.
>
> In many cases, the workflow is fully synchronous. It means a target
> process triggers a system call and passes controls to a supervisor
> process that handles the system call and returns controls to the target
> process. In this context, "synchronous" means that only one process is
> running and another one is waiting.
>
> There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> move the wakee to the current CPU. For such synchronous workflows, it
> makes context switches a few times faster.
>
> Right now, each interaction takes 12µs. With this patch, it takes about
> 3µs.
Seems like a nice idea though I leave it to the sched people to judge
whether this is sane or not. So the supervisor which gets woken will be
moved to the current cpu in this synchronous scenario.
I have no strong opinions on this patch. There are two things I wonder
about. First, how meaningful is that speed up given that the supervisor
will most often do a lot of heavy-handed things anyway.
Second, this flag is a very specific thing and I wonder how much
userspace will really use this and, what's more, use it correctly.
Just to note that LXD - one of the biggest users of this feature - isn't
synchronous iiuc for example. Each container gets a separate seccomp
supervisor thread (well, go routine but whatever) which exposes a socket
that the container manager connects to and sends the seccomp
notifications it received from its payload according to an api we
established. And each notification is handled in a separate thread
(again, go routine but whatever).
>
> This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> it used to enable the sync mode.
>
> Signed-off-by: Andrei Vagin <avagin@gmail.com>
> ---
> include/uapi/linux/seccomp.h | 4 ++++
> kernel/seccomp.c | 35 +++++++++++++++++++++++++++++++++--
> 2 files changed, 37 insertions(+), 2 deletions(-)
>
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 0fdc6ef02b94..dbfc9b37fcae 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
> __u32 flags;
> };
>
> +#define SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP (1UL << 0)
> +
> /* valid flags for seccomp_notif_addfd */
> #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */
> #define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */
> @@ -150,4 +152,6 @@ struct seccomp_notif_addfd {
> #define SECCOMP_IOCTL_NOTIF_ADDFD SECCOMP_IOW(3, \
> struct seccomp_notif_addfd)
>
> +#define SECCOMP_IOCTL_NOTIF_SET_FLAGS SECCOMP_IOW(4, __u64)
> +
> #endif /* _UAPI_LINUX_SECCOMP_H */
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 667fd2d89464..c24900eb8ced 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -143,11 +143,14 @@ struct seccomp_kaddfd {
> * filter->notify_lock.
> * @next_id: The id of the next request.
> * @notifications: A list of struct seccomp_knotif elements.
> + * @flags: A set of SECCOMP_USER_NOTIF_FD_* flags.
> */
> +
> struct notification {
> atomic_t requests;
> u64 next_id;
> struct list_head notifications;
> + int flags;
> };
>
> #ifdef SECCOMP_ARCH_NATIVE
> @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
> INIT_LIST_HEAD(&n.addfd);
>
> atomic_add(1, &match->notif->requests);
> - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> + else
> + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
(We're accumulating a lot of conditional wake primitives in the notifier.)
>
> /*
> * This is where we wait for a reply from userspace.
> @@ -1574,7 +1580,10 @@ static long seccomp_notify_send(struct seccomp_filter *filter,
> knotif->error = resp.error;
> knotif->val = resp.val;
> knotif->flags = resp.flags;
> - complete(&knotif->ready);
> + if (filter->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> + complete_on_current_cpu(&knotif->ready);
> + else
> + complete(&knotif->ready);
> out:
> mutex_unlock(&filter->notify_lock);
> return ret;
> @@ -1604,6 +1613,26 @@ static long seccomp_notify_id_valid(struct seccomp_filter *filter,
> return ret;
> }
>
> +static long seccomp_notify_set_flags(struct seccomp_filter *filter,
> + void __user *buf)
> +{
> + u64 flags;
> + long ret;
> +
> + if (copy_from_user(&flags, buf, sizeof(flags)))
> + return -EFAULT;
> +
> + if (flags & ~SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> + return -EINVAL;
> +
> + ret = mutex_lock_interruptible(&filter->notify_lock);
> + if (ret < 0)
> + return ret;
> + filter->notif->flags = flags;
Might be better to just keep the uapi type and the in-kernel type in sync.
> + mutex_unlock(&filter->notify_lock);
> + return ret;
> +}
> +
> static long seccomp_notify_addfd(struct seccomp_filter *filter,
> struct seccomp_notif_addfd __user *uaddfd,
> unsigned int size)
> @@ -1733,6 +1762,8 @@ static long seccomp_notify_ioctl(struct file *file, unsigned int cmd,
> case SECCOMP_IOCTL_NOTIF_ID_VALID_WRONG_DIR:
> case SECCOMP_IOCTL_NOTIF_ID_VALID:
> return seccomp_notify_id_valid(filter, buf);
> + case SECCOMP_IOCTL_NOTIF_SET_FLAGS:
> + return seccomp_notify_set_flags(filter, buf);
> }
>
> /* Extensible Argument ioctls */
> --
> 2.37.2
>
>
On Tue, Aug 30, 2022 at 3:43 AM Christian Brauner <brauner@kernel.org> wrote:
>
> On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> > seccomp_unotify allows more privileged processes does actions on behalf
> > of less privileged processes.
> >
> > In many cases, the workflow is fully synchronous. It means a target
> > process triggers a system call and passes controls to a supervisor
> > process that handles the system call and returns controls to the target
> > process. In this context, "synchronous" means that only one process is
> > running and another one is waiting.
> >
> > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> > move the wakee to the current CPU. For such synchronous workflows, it
> > makes context switches a few times faster.
> >
> > Right now, each interaction takes 12µs. With this patch, it takes about
> > 3µs.
>
> Seems like a nice idea though I leave it to the sched people to judge
> whether this is sane or not. So the supervisor which gets woken will be
> moved to the current cpu in this synchronous scenario.
>
> I have no strong opinions on this patch. There are two things I wonder
> about. First, how meaningful is that speed up given that the supervisor
> will most often do a lot of heavy-handed things anyway.
I would not use the "most often" phrase in this case ;). It is true for LXC-like
use cases when we need to handle rare syscalls. In this case, the performance
of this interface doesn't play a big role. But my use case is very different. I
have a prototype of the gVisor platform, where seccomp is used to trap
guest system calls. In this case, the difference between 12µs and 3µs is
tremendous.
The idea of WF_CURRENT_CPU is not mine. I spied it from the umcg series.
I took the second patch from that series without any changes.
>
> Second, this flag is a very specific thing and I wonder how much
> userspace will really use this and what's more use this correctly.
>
> Just to note that LXD - one of the biggest user of this feature - isn't
> synchronous iiuc for example. Each container gets a separate seccomp
> supervisor thread (well, go routine but whatever) which exposes a socket
> that the container manager connects to and sends the seccomp
> notifications it received from its payload according to an api we
> established. And each notification is handled in a separate thread
> (again, go routine but whatever).
It could be synchronous if seccomp events had been handled in [lxc monitor]. But
right now, [lxc monitor] is just a proxy. In this case, you are right, lxc will
not get any benefits by setting this flag. But we can look at this from another
side. If we add these changes, we will have another big user of the interface. I
think the number of gVisor containers that are started each day is comparable
with the number of LXC/LXD containers.
>
> >
> > This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> > it used to enable the sync mode.
> >
> > Signed-off-by: Andrei Vagin <avagin@gmail.com>
> > ---
> > include/uapi/linux/seccomp.h | 4 ++++
> > kernel/seccomp.c | 35 +++++++++++++++++++++++++++++++++--
> > 2 files changed, 37 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > index 0fdc6ef02b94..dbfc9b37fcae 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
> > __u32 flags;
> > };
> >
<snip>
> >
> > #ifdef SECCOMP_ARCH_NATIVE
> > @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
> > INIT_LIST_HEAD(&n.addfd);
> >
> > atomic_add(1, &match->notif->requests);
> > - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> > + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > + else
> > + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
>
> (We're accumulating a lot of conditional wake primitives in the notifier.)
>
I am not sure that I understand what you mean here.
Thanks,
Andrei.
On Tue, Aug 30, 2022 at 02:23:24PM -0700, Andrei Vagin wrote:
> On Tue, Aug 30, 2022 at 3:43 AM Christian Brauner <brauner@kernel.org> wrote:
> >
> > On Mon, Aug 29, 2022 at 06:43:56PM -0700, Andrei Vagin wrote:
> > > seccomp_unotify allows more privileged processes does actions on behalf
> > > of less privileged processes.
> > >
> > > In many cases, the workflow is fully synchronous. It means a target
> > > process triggers a system call and passes controls to a supervisor
> > > process that handles the system call and returns controls to the target
> > > process. In this context, "synchronous" means that only one process is
> > > running and another one is waiting.
> > >
> > > There is the WF_CURRENT_CPU flag that is used to advise the scheduler to
> > > move the wakee to the current CPU. For such synchronous workflows, it
> > > makes context switches a few times faster.
> > >
> > > Right now, each interaction takes 12µs. With this patch, it takes about
> > > 3µs.
> >
> > Seems like a nice idea though I leave it to the sched people to judge
> > whether this is sane or not. So the supervisor which gets woken will be
> > moved to the current cpu in this synchronous scenario.
> >
> > I have no strong opinions on this patch. There are two things I wonder
> > about. First, how meaningful is that speed up given that the supervisor
> > will most often do a lot of heavy-handed things anyway.
>
> I would not use the "most often" phrase in this case;). It is true for LXC-like
> use cases when we need to handle rare syscalls. In this case, the performance
> of this interface doesn't play a big role. But my use case is very different. I
> have a prototype of the gVisor platform, where seccomp is used to trap
> guest system calls. In this case, the difference between 12µs and 3µs is
> tremendous.
Oh yeah, makes sense. I don't know enough about gVisor but I know we can
trust your word! :)
>
> The idea of WF_CURRENT_CPU is not mine. I spied it from the umcg series.
> I took the second patch from that series without any changes.
>
> >
> > Second, this flag is a very specific thing and I wonder how much
> > userspace will really use this and what's more use this correctly.
> >
> > Just to note that LXD - one of the biggest user of this feature - isn't
> > synchronous iiuc for example. Each container gets a separate seccomp
> > supervisor thread (well, go routine but whatever) which exposes a socket
> > that the container manager connects to and sends the seccomp
> > notifications it received from its payload according to an api we
> > established. And each notification is handled in a separate thread
> > (again, go routine but whatever).
>
> It could be synchronous if seccomp events had been handled in [lxc monitor]. But
> right now, [lxc monitor] is just a proxy. In this case, you are right, lxc will
Yep.
> not get any benefits by setting this flag. But we can look at this from another
> side. If we add these changes, we will have another big user of the interface. I
> think the number of gVisor containers that are started each day is comparable
> with the number of LXC/LXD containers.
Sure, if there are users that would benefit from this then no reason to
not consider it. It's just a lot of low-level knobs we give userspace
here but I guess for the notifier it makes sense.
>
> >
> > >
> > > This change introduce the SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP flag that
> > > it used to enable the sync mode.
> > >
> > > Signed-off-by: Andrei Vagin <avagin@gmail.com>
> > > ---
> > > include/uapi/linux/seccomp.h | 4 ++++
> > > kernel/seccomp.c | 35 +++++++++++++++++++++++++++++++++--
> > > 2 files changed, 37 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> > > index 0fdc6ef02b94..dbfc9b37fcae 100644
> > > --- a/include/uapi/linux/seccomp.h
> > > +++ b/include/uapi/linux/seccomp.h
> > > @@ -115,6 +115,8 @@ struct seccomp_notif_resp {
> > > __u32 flags;
> > > };
> > >
>
> <snip>
>
> > >
> > > #ifdef SECCOMP_ARCH_NATIVE
> > > @@ -1117,7 +1120,10 @@ static int seccomp_do_user_notification(int this_syscall,
> > > INIT_LIST_HEAD(&n.addfd);
> > >
> > > atomic_add(1, &match->notif->requests);
> > > - wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > > + if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
> > > + wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
> > > + else
> > > + wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
> >
> > (We're accumulating a lot of conditional wake primitives in the notifier.)
> >
>
> I am not sure that I understand what you mean here.
I just meant that we have
if (wait_killable)
err = wait_for_completion_killable(&n.ready);
else
err = wait_for_completion_interruptible(&n.ready);
and now also
if (match->notif->flags & SECCOMP_USER_NOTIF_FD_SYNC_WAKE_UP)
wake_up_poll_on_current_cpu(&match->wqh, EPOLLIN | EPOLLRDNORM);
else
wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM);
which is a bit unpleasant but nothing that would mean we can't do this.
© 2016 - 2026 Red Hat, Inc.