[v3] seccomp: support nested listeners

[PATCH v3 6/7] seccomp: allow nested listeners

Posted by Alexander Mikhalitsyn 2 months ago

Now everything is ready to get rid of the "only one listener per tree"
limitation.

Let's introduce a new uAPI flag
SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
allow nested listeners when installing a listener.

Note that to install the n-th listener, this flag must be set on all
the listeners up the tree.

Cc: linux-doc@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: bpf@vger.kernel.org
Cc: Kees Cook <kees@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will Drewry <wad@chromium.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Aleksa Sarai <cyphar@cyphar.com>
Cc: Tycho Andersen <tycho@tycho.pizza>
Cc: Andrei Vagin <avagin@gmail.com>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Stéphane Graber <stgraber@stgraber.org>
Signed-off-by: Alexander Mikhalitsyn <aleksandr.mikhalitsyn@canonical.com>
---
 .../userspace-api/seccomp_filter.rst          |  6 +++++
 include/linux/seccomp.h                       |  3 ++-
 include/uapi/linux/seccomp.h                  | 13 ++++++-----
 kernel/seccomp.c                              | 22 +++++++++++++++----
 tools/include/uapi/linux/seccomp.h            | 13 ++++++-----
 5 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
index cff0fa7f3175..b9633ab1ed47 100644
--- a/Documentation/userspace-api/seccomp_filter.rst
+++ b/Documentation/userspace-api/seccomp_filter.rst
@@ -210,6 +210,12 @@ notifications from both tasks will appear on the same filter fd. Reads and
 writes to/from a filter fd are also synchronized, so a filter fd can safely
 have many readers.
 
+By default, only one listener is allowed within a seccomp filter tree. An
+attempt to add a new listener when one already exists in the filter tree makes
+the ``seccomp()`` call fail with ``-EBUSY``. To permit filters installed later
+to add their own listeners, pass ``SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS``
+with ``SECCOMP_FILTER_FLAG_NEW_LISTENER`` on every listener up the tree.
+
 The interface for a seccomp notification fd consists of two structures:
 
 .. code-block:: c
diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
index 9b959972bf4a..9b060946019d 100644
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -10,7 +10,8 @@
 					 SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
 					 SECCOMP_FILTER_FLAG_NEW_LISTENER | \
 					 SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
-					 SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
+					 SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV | \
+					 SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
 
 /* sizeof() the first published struct seccomp_notif_addfd */
 #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
index dbfc9b37fcae..de78d8e7a70b 100644
--- a/include/uapi/linux/seccomp.h
+++ b/include/uapi/linux/seccomp.h
@@ -18,13 +18,14 @@
 #define SECCOMP_GET_NOTIF_SIZES		3
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
-#define SECCOMP_FILTER_FLAG_LOG			(1UL << 1)
-#define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
-#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
-#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH		(1UL << 4)
+#define SECCOMP_FILTER_FLAG_TSYNC			(1UL << 0)
+#define SECCOMP_FILTER_FLAG_LOG				(1UL << 1)
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW			(1UL << 2)
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER		(1UL << 3)
+#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH			(1UL << 4)
 /* Received notifications wait in killable state (only respond to fatal signals) */
-#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV	(1UL << 5)
+#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV		(1UL << 5)
+#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS	(1UL << 6)
 
 /*
  * All BPF programs must return a 32-bit value.
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 51d0d8adaffb..7667f443ff6c 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -206,6 +206,7 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
  * @wait_killable_recv: Put notifying process in killable state once the
  *			notification is received by the userspace listener.
  * @first_listener: true if this is the first seccomp listener installed in the tree.
+ * @allow_nested_listeners: Allow nested seccomp listeners.
  * @prev: points to a previously installed, or inherited, filter
  * @prog: the BPF program to evaluate
  * @notif: the struct that holds all notification related information
@@ -228,6 +229,7 @@ struct seccomp_filter {
 	bool log : 1;
 	bool wait_killable_recv : 1;
 	bool first_listener : 1;
+	bool allow_nested_listeners : 1;
 	struct action_cache cache;
 	struct seccomp_filter *prev;
 	struct bpf_prog *prog;
@@ -956,6 +958,10 @@ static long seccomp_attach_filter(unsigned int flags,
 	if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
 		filter->wait_killable_recv = true;
 
+	/* Set nested listeners allow flag, if present. */
+	if (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
+		filter->allow_nested_listeners = true;
+
 	/*
 	 * If there is an existing filter, make it the prev and don't drop its
 	 * task reference.
@@ -1997,7 +2003,8 @@ static struct file *init_listener(struct seccomp_filter *filter)
 }
 
 /*
- * Does @new_child have a listener while an ancestor also has a listener?
+ * Does @new_child have a listener while an ancestor also has a listener
+ * and hasn't allowed nesting?
  * If so, we'll want to reject this filter.
  * This only has to be tested for the current process, even in the TSYNC case,
  * because TSYNC installs @child with the same parent on all threads.
@@ -2015,7 +2022,12 @@ static bool check_duplicate_listener(struct seccomp_filter *new_child)
 		return false;
 	for (cur = current->seccomp.filter; cur; cur = cur->prev) {
 		if (!IS_ERR_OR_NULL(cur->notif))
-			return true;
+			/*
+			 * We don't need to go up further, because if there is a
+			 * listener with nesting allowed, then all the listeners
+			 * up the tree have allowed nesting as well.
+			 */
+			return !cur->allow_nested_listeners;
 	}
 
 	/* Mark first listener in the tree. */
@@ -2062,10 +2074,12 @@ static long seccomp_set_mode_filter(unsigned int flags,
 		return -EINVAL;
 
 	/*
-	 * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
+	 * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV and
+	 * SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS flags don't make sense
 	 * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
 	 */
-	if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
+	if (((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) ||
+	     (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)) &&
 	    ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
 		return -EINVAL;
 
diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
index dbfc9b37fcae..de78d8e7a70b 100644
--- a/tools/include/uapi/linux/seccomp.h
+++ b/tools/include/uapi/linux/seccomp.h
@@ -18,13 +18,14 @@
 #define SECCOMP_GET_NOTIF_SIZES		3
 
 /* Valid flags for SECCOMP_SET_MODE_FILTER */
-#define SECCOMP_FILTER_FLAG_TSYNC		(1UL << 0)
-#define SECCOMP_FILTER_FLAG_LOG			(1UL << 1)
-#define SECCOMP_FILTER_FLAG_SPEC_ALLOW		(1UL << 2)
-#define SECCOMP_FILTER_FLAG_NEW_LISTENER	(1UL << 3)
-#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH		(1UL << 4)
+#define SECCOMP_FILTER_FLAG_TSYNC			(1UL << 0)
+#define SECCOMP_FILTER_FLAG_LOG				(1UL << 1)
+#define SECCOMP_FILTER_FLAG_SPEC_ALLOW			(1UL << 2)
+#define SECCOMP_FILTER_FLAG_NEW_LISTENER		(1UL << 3)
+#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH			(1UL << 4)
 /* Received notifications wait in killable state (only respond to fatal signals) */
-#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV	(1UL << 5)
+#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV		(1UL << 5)
+#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS	(1UL << 6)
 
 /*
  * All BPF programs must return a 32-bit value.
-- 
2.43.0

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Andrei Vagin 2 weeks, 5 days ago

On Thu, Dec 11, 2025 at 4:46 AM Alexander Mikhalitsyn
<aleksandr.mikhalitsyn@canonical.com> wrote:
>
> Now everything is ready to get rid of "only one listener per tree"
> limitation.
>
> Let's introduce a new uAPI flag
> SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
> allow nested listeners when installing a listener.

I am not sure we really need SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS.
If nested listeners are completely functional, why would we want to
explicitly allow or disallow someone from using them?

Actually, even the current behavior of SECCOMP_RET_USER_NOTIF looks a
bit illogical. I think the following behavior would be more expected:
instead of running all filters and picking the most restrictive result,
the kernel should execute them one by one (most recent first). If a filter
returns USER_NOTIF, the kernel pauses immediately to let the listener
handle the call. If that listener then issues "CONTINUE", the kernel
resumes by running the remaining older filters in the chain.

Thanks,
Andrei

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Aleksa Sarai 2 weeks, 5 days ago

On 2026-01-20, Andrei Vagin <avagin@gmail.com> wrote:
> On Thu, Dec 11, 2025 at 4:46 AM Alexander Mikhalitsyn
> <aleksandr.mikhalitsyn@canonical.com> wrote:
> >
> > Now everything is ready to get rid of "only one listener per tree"
> > limitation.
> >
> > Let's introduce a new uAPI flag
> > SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
> > allow nested listeners when installing a listener.
> 
> I am not sure we really need SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS.
> If nested listeners are completely functional, why would we want to
> implicitly allow or disallow someone from using them?

It can be quite easy to deadlock a process using seccomp-notify (even
in the single-notifier case) so especially in the case of container
managers I can see the argument for wanting this to be an opt-in thing
once container runtimes have verified their notifier won't break
nesting.

Then again, you can also use seccomp to block
SECCOMP_FILTER_FLAG_NEW_LISTENER directly, so you don't really need a
separate flag to allow nested listeners (unless I'm missing something)?
That would make it opt-out but presumably filters that allow seccomp
already use an allow-list for flags.

> Actually, even the current behavior of SECCOMP_RET_USER_NOTIF looks a
> bit illogical. I think the following behavior would be more expected:
> instead of running all filters and picking the most restrictive result,
> the kernel should execute them one by one (most recent fist). If a filter
> returns USER_NOTIF, the kernel pauses immediately to let the listener
> handle the call. If that listener then issues "CONTINUE", the kernel
> resumes by running the remaining older filters in the chain.

I guess there is a philosophical argument that earlier filters are "more
trusted" but the seccomp security model has always been that the
strictest filter return wins and I don't really see a strong argument
for deviating from that for USER_NOTIF.

-- 
Aleksa Sarai
https://www.cyphar.com/

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Andy Lutomirski 2 weeks, 5 days ago

On Wed, Jan 21, 2026 at 7:43 AM Aleksa Sarai <cyphar@cyphar.com> wrote:
>
> On 2026-01-20, Andrei Vagin <avagin@gmail.com> wrote:
> > On Thu, Dec 11, 2025 at 4:46 AM Alexander Mikhalitsyn
> > <aleksandr.mikhalitsyn@canonical.com> wrote:
> > >
> > > Now everything is ready to get rid of "only one listener per tree"
> > > limitation.
> > >
> > > Let's introduce a new uAPI flag
> > > SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
> > > allow nested listeners when installing a listener.
> >
> > I am not sure we really need SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS.
> > If nested listeners are completely functional, why would we want to
> > implicitly allow or disallow someone from using them?
>
> It can be quite easy to deadlock a process using seccomp-notify (even
> in the single-notifier case) so especially in the case of container
> managers I can see the argument for wanting this to be an opt-in thing
> once container runtimes have verified their notifier won't break
> nesting.

Is the deadlock such that a process and its manager can deadlock in a
way that's hard to kill?  Or is there some problem that could
adversely affect an outer manager?  It would be nice for these
features to be automatic instead of opt in.

(I just wasted half an hour yesterday removing use of
unshare(CLONE_FILES) from a program that didn't run under a container
manager that, for some reason, thought that was a sensitive syscall.)

--Andy

>
> > Actually, even the current behavior of SECCOMP_RET_USER_NOTIF looks a
> > bit illogical. I think the following behavior would be more expected:
> > instead of running all filters and picking the most restrictive result,
> > the kernel should execute them one by one (most recent fist). If a filter
> > returns USER_NOTIF, the kernel pauses immediately to let the listener
> > handle the call. If that listener then issues "CONTINUE", the kernel
> > resumes by running the remaining older filters in the chain.
>
> I guess there is a philosophical argument that earlier filters are "more
> trusted" but the seccomp security model has always been that the
> strictest filter return wins and I don't really see a strong argument
> for deviating from that for USER_NOTIF.
>

I don't know if I agree with that philosophy.  I would think the best
philosophy is that, when filters are nested, the innermost filter +
filtered task combination acts as a unit that is filtered by the outer
filter.

Without notifiers and without filters that overwrite errno, I think
strictest-wins is a decent approximation -- the choices are kill or
allow, although one might quibble about the various forms of "kill".

With SECCOMP_RET_ERRNO, I would argue that the behavior would be
superior if we just stopped processing filters after an inner filter
returned SECCOMP_RET_ERRNO.  After all, the effect is to do no syscall
at all, and having a process that didn't do a syscall get killed
because it tried a bad syscall is kind of weird.

With notifiers, this is all rather more complex.  Notifiers can
emulate syscalls, and having an outer notifier somehow process the
syscall that was replaced by an inner notifier seems rather weird.  Or
suppose that an outer filter wants to prevent some operation, but an
inner system wants to emulate it in a way that doesn't do the
offending syscall, why not allow it?

So I'd argue for considering changing the behavior for everything,
maybe optionally?  I'm not really sure where TRACE fits in.

--Andy

-- 
Andy Lutomirski
AMA Capital Management, LLC

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Andrei Vagin 2 weeks, 3 days ago

On Wed, Jan 21, 2026 at 9:59 AM Andy Lutomirski <luto@amacapital.net> wrote:
>
> On Wed, Jan 21, 2026 at 7:43 AM Aleksa Sarai <cyphar@cyphar.com> wrote:
> >
> > On 2026-01-20, Andrei Vagin <avagin@gmail.com> wrote:
> > > On Thu, Dec 11, 2025 at 4:46 AM Alexander Mikhalitsyn
> > > <aleksandr.mikhalitsyn@canonical.com> wrote:
> > > >
> > > > Now everything is ready to get rid of "only one listener per tree"
> > > > limitation.
> > > >
> > > > Let's introduce a new uAPI flag
> > > > SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
> > > > allow nested listeners when installing a listener.
> > >
> > > I am not sure we really need SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS.
> > > If nested listeners are completely functional, why would we want to
> > > implicitly allow or disallow someone from using them?
> >
> > It can be quite easy to deadlock a process using seccomp-notify (even
> > in the single-notifier case) so especially in the case of container
> > managers I can see the argument for wanting this to be an opt-in thing
> > once container runtimes have verified their notifier won't break
> > nesting.
>
> Is the deadlock such that a process and its manager can deadlock in a
> way that's hard to kill?  Or is there some problem that could
> adversely affect an outer manager?  It would be nice for these
> features to be automatic instead of opt in.

Both a process and its manager can always be killed with SIGKILL.
I’m not sure I follow the specific deadlock Aleksa is referring to here.
In my view, an outer manager should not care about any syscalls that
processes are calling and intercepting. The outer manager must be
triggered only when a syscall is going to be executed "natively".
This kind of overlaps with the second part...

BTW: If a user wants to prevent the usage of seccomp notify, they can
always install a seccomp filter that rejects the seccomp syscall called
with SECCOMP_FILTER_FLAG_NEW_LISTENER.

>
> (I just wasted half an hour yesterday removing use of
> unshare(CLONE_FILES) from a program that didn't run under a container
> manager that, for some reason, thought that was a sensitive syscall.)
>
> --Andy
>
> >
> > > Actually, even the current behavior of SECCOMP_RET_USER_NOTIF looks a
> > > bit illogical. I think the following behavior would be more expected:
> > > instead of running all filters and picking the most restrictive result,
> > > the kernel should execute them one by one (most recent fist). If a filter
> > > returns USER_NOTIF, the kernel pauses immediately to let the listener
> > > handle the call. If that listener then issues "CONTINUE", the kernel
> > > resumes by running the remaining older filters in the chain.
> >
> > I guess there is a philosophical argument that earlier filters are "more
> > trusted" but the seccomp security model has always been that the
> > strictest filter return wins and I don't really see a strong argument
> > for deviating from that for USER_NOTIF.
> >
>
> I don't know if I agree with that philosophy.  I would think the best
> philosophy is that, when filters are nested, the innermost filter +
> filtered task combination acts as a unit that is filtered by the outer
> filter.
>
> Without notifiers and without filters that overwrite errno, I think
> strictest-wins is a decent approximation -- the choices are kill or
> allow, although one might quibble about the various forms of "kill".
>
> With SECCOMP_RET_ERRNO, I would argue that the behavior would be
> superior if we just stopped processing filters after an inner filter
> returned SECCOMP_RET_ERROR.  After all, the effect is to do no syscall
> at all, and having a process that didn't do a syscall get killed
> because it tried a bad syscall is kind of weird.
>
> With notifiers, this is all rather more complex.  Notifiers can
> emulate syscalls, and having an outer notifier somehow process the
> syscall that was replaced by an inner notifier seems rather weird.  Or
> suppose that an outer filter wants to prevent some operation, but an
> inner system wants to emulate it in a way that doesn't do the
> offending syscall, why not allow it?
>
> So I'd argue for considering changing the behavior for everything,
> maybe optionally?  I'm not really sure where TRACE fits in.
>

gVisor (a user-mode kernel similar to User-Mode Linux) is a real-world
example that is impacted by the current seccomp behavior. The gVisor
systrap platform uses seccomp to intercept guest syscalls so they can
be handled by the Sentry (the gVisor kernel). All guest syscalls are
managed by the Sentry and are never executed natively.

Thanks,
Andrei

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Andy Lutomirski 1 month, 4 weeks ago

On Thu, Dec 11, 2025 at 8:47 PM Alexander Mikhalitsyn
<aleksandr.mikhalitsyn@canonical.com> wrote:
>
> Now everything is ready to get rid of "only one listener per tree"
> limitation.
>
> Let's introduce a new uAPI flag
> SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may explicitly
> allow nested listeners when installing a listener.
>
> Note, that to install n-th listener, this flag must be set on all
> the listeners up the tree.


> diff --git a/Documentation/userspace-api/seccomp_filter.rst b/Documentation/userspace-api/seccomp_filter.rst
> index cff0fa7f3175..b9633ab1ed47 100644
> --- a/Documentation/userspace-api/seccomp_filter.rst
> +++ b/Documentation/userspace-api/seccomp_filter.rst
> @@ -210,6 +210,12 @@ notifications from both tasks will appear on the same filter fd. Reads and
>  writes to/from a filter fd are also synchronized, so a filter fd can safely
>  have many readers.
>
> +By default, only one listener within seccomp filters tree is allowed. On attempt
> +to add a new listener when one already exists in the filter tree, the
> +``seccomp()`` call will fail with ``-EBUSY``. To allow multiple listeners, the
> +``SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS`` flag can be passed in addition to
> +the ``SECCOMP_FILTER_FLAG_NEW_LISTENER`` flag.
> +

I read this, and I contemplated: does this mean that this permits
additional filters (added later, nested inside) to have listeners or
does it permit applying a listener when there already is one?  I
thought it was surely the former, but I had to read the code to
confirm that.

Maybe clarify the text?

(Yes, I realize it's also in the commit message, but that's not a
great place to hide this info.)


>  The interface for a seccomp notification fd consists of two structures:
>
>  .. code-block:: c
> diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> index 9b959972bf4a..9b060946019d 100644
> --- a/include/linux/seccomp.h
> +++ b/include/linux/seccomp.h
> @@ -10,7 +10,8 @@
>                                          SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
>                                          SECCOMP_FILTER_FLAG_NEW_LISTENER | \
>                                          SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
> -                                        SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
> +                                        SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV | \
> +                                        SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
>
>  /* sizeof() the first published struct seccomp_notif_addfd */
>  #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index dbfc9b37fcae..de78d8e7a70b 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -18,13 +18,14 @@
>  #define SECCOMP_GET_NOTIF_SIZES                3
>
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
> -#define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
> -#define SECCOMP_FILTER_FLAG_LOG                        (1UL << 1)
> -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW         (1UL << 2)
> -#define SECCOMP_FILTER_FLAG_NEW_LISTENER       (1UL << 3)
> -#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                (1UL << 4)
> +#define SECCOMP_FILTER_FLAG_TSYNC                      (1UL << 0)
> +#define SECCOMP_FILTER_FLAG_LOG                                (1UL << 1)
> +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW                 (1UL << 2)
> +#define SECCOMP_FILTER_FLAG_NEW_LISTENER               (1UL << 3)
> +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                        (1UL << 4)
>  /* Received notifications wait in killable state (only respond to fatal signals) */
> -#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
> +#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV         (1UL << 5)
> +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS     (1UL << 6)
>
>  /*
>   * All BPF programs must return a 32-bit value.
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index 51d0d8adaffb..7667f443ff6c 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -206,6 +206,7 @@ static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter)
>   * @wait_killable_recv: Put notifying process in killable state once the
>   *                     notification is received by the userspace listener.
>   * @first_listener: true if this is the first seccomp listener installed in the tree.
> + * @allow_nested_listeners: Allow nested seccomp listeners.
>   * @prev: points to a previously installed, or inherited, filter
>   * @prog: the BPF program to evaluate
>   * @notif: the struct that holds all notification related information
> @@ -228,6 +229,7 @@ struct seccomp_filter {
>         bool log : 1;
>         bool wait_killable_recv : 1;
>         bool first_listener : 1;
> +       bool allow_nested_listeners : 1;
>         struct action_cache cache;
>         struct seccomp_filter *prev;
>         struct bpf_prog *prog;
> @@ -956,6 +958,10 @@ static long seccomp_attach_filter(unsigned int flags,
>         if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
>                 filter->wait_killable_recv = true;
>
> +       /* Set nested listeners allow flag, if present. */
> +       if (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
> +               filter->allow_nested_listeners = true;
> +
>         /*
>          * If there is an existing filter, make it the prev and don't drop its
>          * task reference.
> @@ -1997,7 +2003,8 @@ static struct file *init_listener(struct seccomp_filter *filter)
>  }
>
>  /*
> - * Does @new_child have a listener while an ancestor also has a listener?
> + * Does @new_child have a listener while an ancestor also has a listener
> + * and hasn't allowed nesting?
>   * If so, we'll want to reject this filter.
>   * This only has to be tested for the current process, even in the TSYNC case,
>   * because TSYNC installs @child with the same parent on all threads.
> @@ -2015,7 +2022,12 @@ static bool check_duplicate_listener(struct seccomp_filter *new_child)
>                 return false;
>         for (cur = current->seccomp.filter; cur; cur = cur->prev) {
>                 if (!IS_ERR_OR_NULL(cur->notif))
> -                       return true;
> +                       /*
> +                        * We don't need to go up further, because if there is a
> +                        * listener with nesting allowed, then all the listeners
> +                        * up the tree have allowed nesting as well.
> +                        */
> +                       return !cur->allow_nested_listeners;
>         }
>
>         /* Mark first listener in the tree. */
> @@ -2062,10 +2074,12 @@ static long seccomp_set_mode_filter(unsigned int flags,
>                 return -EINVAL;
>
>         /*
> -        * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't make sense
> +        * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT and
> +        * SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS flags don't make sense
>          * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
>          */
> -       if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
> +       if (((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) ||
> +            (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)) &&
>             ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
>                 return -EINVAL;
>
> diff --git a/tools/include/uapi/linux/seccomp.h b/tools/include/uapi/linux/seccomp.h
> index dbfc9b37fcae..de78d8e7a70b 100644
> --- a/tools/include/uapi/linux/seccomp.h
> +++ b/tools/include/uapi/linux/seccomp.h
> @@ -18,13 +18,14 @@
>  #define SECCOMP_GET_NOTIF_SIZES                3
>
>  /* Valid flags for SECCOMP_SET_MODE_FILTER */
> -#define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
> -#define SECCOMP_FILTER_FLAG_LOG                        (1UL << 1)
> -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW         (1UL << 2)
> -#define SECCOMP_FILTER_FLAG_NEW_LISTENER       (1UL << 3)
> -#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                (1UL << 4)
> +#define SECCOMP_FILTER_FLAG_TSYNC                      (1UL << 0)
> +#define SECCOMP_FILTER_FLAG_LOG                                (1UL << 1)
> +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW                 (1UL << 2)
> +#define SECCOMP_FILTER_FLAG_NEW_LISTENER               (1UL << 3)
> +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                        (1UL << 4)
>  /* Received notifications wait in killable state (only respond to fatal signals) */
> -#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
> +#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV         (1UL << 5)
> +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS     (1UL << 6)
>
>  /*
>   * All BPF programs must return a 32-bit value.
> --
> 2.43.0
>


--
Andy Lutomirski
AMA Capital Management, LLC

Re: [PATCH v3 6/7] seccomp: allow nested listeners

Posted by Alexander Mikhalitsyn 1 week, 5 days ago

On Fri, 2025-12-12 at 21:57 +0800, Andy Lutomirski wrote:
> On Thu, Dec 11, 2025 at 8:47 PM Alexander Mikhalitsyn
> <aleksandr.mikhalitsyn@canonical.com> wrote:
> > 
> > Now everything is ready to get rid of "only one listener per tree"
> > limitation.
> > 
> > Let's introduce a new uAPI flag
> > SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS, so userspace may
> > explicitly
> > allow nested listeners when installing a listener.
> > 
> > Note, that to install n-th listener, this flag must be set on all
> > the listeners up the tree.
> 
> 
> > diff --git a/Documentation/userspace-api/seccomp_filter.rst
> > b/Documentation/userspace-api/seccomp_filter.rst
> > index cff0fa7f3175..b9633ab1ed47 100644
> > --- a/Documentation/userspace-api/seccomp_filter.rst
> > +++ b/Documentation/userspace-api/seccomp_filter.rst
> > @@ -210,6 +210,12 @@ notifications from both tasks will appear on
> > the same filter fd. Reads and
> >  writes to/from a filter fd are also synchronized, so a filter fd
> > can safely
> >  have many readers.
> > 
> > +By default, only one listener within seccomp filters tree is
> > allowed. On attempt
> > +to add a new listener when one already exists in the filter tree,
> > the
> > +``seccomp()`` call will fail with ``-EBUSY``. To allow multiple
> > listeners, the
> > +``SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS`` flag can be passed
> > in addition to
> > +the ``SECCOMP_FILTER_FLAG_NEW_LISTENER`` flag.
> > +

Hi Andy,

thank you for looking into this!

> 
> I read this, and I contemplated: does this mean that this permits
> additional filters (added later, nested inside) to have listeners or
> does it permit applying a listener when there already is one?  I
> thought it was surely it's the former, but I had to read the code to
> confirm that.
> 
> Maybe clarify the text?

Sure, sorry about that! I'll fix that in the next version.

I'm going to do some massive rework on this one, because during LPC [1]
we came to the conclusion that we are going to fix something in the current
seccomp behavior first, and then this series can go on top.

[1] https://www.youtube.com/watch?v=-pSeoN68hP8

Kind regards,
Alex

> 
> (Yes, I realize it's also in the commit message, but that's not a
> great place to hide this info.)
> 
> 
> >  The interface for a seccomp notification fd consists of two
> > structures:
> > 
> >  .. code-block:: c
> > diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h
> > index 9b959972bf4a..9b060946019d 100644
> > --- a/include/linux/seccomp.h
> > +++ b/include/linux/seccomp.h
> > @@ -10,7 +10,8 @@
> >                                         
> > SECCOMP_FILTER_FLAG_SPEC_ALLOW | \
> >                                         
> > SECCOMP_FILTER_FLAG_NEW_LISTENER | \
> >                                         
> > SECCOMP_FILTER_FLAG_TSYNC_ESRCH | \
> > -                                       
> > SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
> > +                                       
> > SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV | \
> > +                                       
> > SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
> > 
> >  /* sizeof() the first published struct seccomp_notif_addfd */
> >  #define SECCOMP_NOTIFY_ADDFD_SIZE_VER0 24
> > diff --git a/include/uapi/linux/seccomp.h
> > b/include/uapi/linux/seccomp.h
> > index dbfc9b37fcae..de78d8e7a70b 100644
> > --- a/include/uapi/linux/seccomp.h
> > +++ b/include/uapi/linux/seccomp.h
> > @@ -18,13 +18,14 @@
> >  #define SECCOMP_GET_NOTIF_SIZES                3
> > 
> >  /* Valid flags for SECCOMP_SET_MODE_FILTER */
> > -#define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
> > -#define SECCOMP_FILTER_FLAG_LOG                        (1UL << 1)
> > -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW         (1UL << 2)
> > -#define SECCOMP_FILTER_FLAG_NEW_LISTENER       (1UL << 3)
> > -#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                (1UL << 4)
> > +#define SECCOMP_FILTER_FLAG_TSYNC                      (1UL << 0)
> > +#define SECCOMP_FILTER_FLAG_LOG                               
> > (1UL << 1)
> > +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW                 (1UL << 2)
> > +#define SECCOMP_FILTER_FLAG_NEW_LISTENER               (1UL << 3)
> > +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                       
> > (1UL << 4)
> >  /* Received notifications wait in killable state (only respond to
> > fatal signals) */
> > -#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
> > +#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV         (1UL << 5)
> > +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS     (1UL << 6)
> > 
> >  /*
> >   * All BPF programs must return a 32-bit value.
> > diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> > index 51d0d8adaffb..7667f443ff6c 100644
> > --- a/kernel/seccomp.c
> > +++ b/kernel/seccomp.c
> > @@ -206,6 +206,7 @@ static inline void seccomp_cache_prepare(struct
> > seccomp_filter *sfilter)
> >   * @wait_killable_recv: Put notifying process in killable state
> > once the
> >   *                     notification is received by the userspace
> > listener.
> >   * @first_listener: true if this is the first seccomp listener
> > installed in the tree.
> > + * @allow_nested_listeners: Allow nested seccomp listeners.
> >   * @prev: points to a previously installed, or inherited, filter
> >   * @prog: the BPF program to evaluate
> >   * @notif: the struct that holds all notification related
> > information
> > @@ -228,6 +229,7 @@ struct seccomp_filter {
> >         bool log : 1;
> >         bool wait_killable_recv : 1;
> >         bool first_listener : 1;
> > +       bool allow_nested_listeners : 1;
> >         struct action_cache cache;
> >         struct seccomp_filter *prev;
> >         struct bpf_prog *prog;
> > @@ -956,6 +958,10 @@ static long seccomp_attach_filter(unsigned int
> > flags,
> >         if (flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV)
> >                 filter->wait_killable_recv = true;
> > 
> > +       /* Set nested listeners allow flag, if present. */
> > +       if (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS)
> > +               filter->allow_nested_listeners = true;
> > +
> >         /*
> >          * If there is an existing filter, make it the prev and
> > don't drop its
> >          * task reference.
> > @@ -1997,7 +2003,8 @@ static struct file *init_listener(struct
> > seccomp_filter *filter)
> >  }
> > 
> >  /*
> > - * Does @new_child have a listener while an ancestor also has a
> > listener?
> > + * Does @new_child have a listener while an ancestor also has a
> > listener
> > + * and hasn't allowed nesting?
> >   * If so, we'll want to reject this filter.
> >   * This only has to be tested for the current process, even in the
> > TSYNC case,
> >   * because TSYNC installs @child with the same parent on all
> > threads.
> > @@ -2015,7 +2022,12 @@ static bool check_duplicate_listener(struct
> > seccomp_filter *new_child)
> >                 return false;
> >         for (cur = current->seccomp.filter; cur; cur = cur->prev) {
> >                 if (!IS_ERR_OR_NULL(cur->notif))
> > -                       return true;
> > +                       /*
> > +                        * We don't need to go up further, because
> > if there is a
> > +                        * listener with nesting allowed, then all
> > the listeners
> > +                        * up the tree have allowed nesting as
> > well.
> > +                        */
> > +                       return !cur->allow_nested_listeners;
> >         }
> > 
> >         /* Mark first listener in the tree. */
> > @@ -2062,10 +2074,12 @@ static long
> > seccomp_set_mode_filter(unsigned int flags,
> >                 return -EINVAL;
> > 
> >         /*
> > -        * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_SENT flag doesn't
> > make sense
> > +        * The SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV and
> > +        * SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS flags don't
> > make sense
> >          * without the SECCOMP_FILTER_FLAG_NEW_LISTENER flag.
> >          */
> > -       if ((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) &&
> > +       if (((flags & SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV) ||
> > +            (flags & SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS))
> > &&
> >             ((flags & SECCOMP_FILTER_FLAG_NEW_LISTENER) == 0))
> >                 return -EINVAL;
> > 
> > diff --git a/tools/include/uapi/linux/seccomp.h
> > b/tools/include/uapi/linux/seccomp.h
> > index dbfc9b37fcae..de78d8e7a70b 100644
> > --- a/tools/include/uapi/linux/seccomp.h
> > +++ b/tools/include/uapi/linux/seccomp.h
> > @@ -18,13 +18,14 @@
> >  #define SECCOMP_GET_NOTIF_SIZES                3
> > 
> >  /* Valid flags for SECCOMP_SET_MODE_FILTER */
> > -#define SECCOMP_FILTER_FLAG_TSYNC              (1UL << 0)
> > -#define SECCOMP_FILTER_FLAG_LOG                        (1UL << 1)
> > -#define SECCOMP_FILTER_FLAG_SPEC_ALLOW         (1UL << 2)
> > -#define SECCOMP_FILTER_FLAG_NEW_LISTENER       (1UL << 3)
> > -#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                (1UL << 4)
> > +#define SECCOMP_FILTER_FLAG_TSYNC                      (1UL << 0)
> > +#define SECCOMP_FILTER_FLAG_LOG                               
> > (1UL << 1)
> > +#define SECCOMP_FILTER_FLAG_SPEC_ALLOW                 (1UL << 2)
> > +#define SECCOMP_FILTER_FLAG_NEW_LISTENER               (1UL << 3)
> > +#define SECCOMP_FILTER_FLAG_TSYNC_ESRCH                       
> > (1UL << 4)
> >  /* Received notifications wait in killable state (only respond to
> > fatal signals) */
> > -#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV (1UL << 5)
> > +#define SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV         (1UL << 5)
> > +#define SECCOMP_FILTER_FLAG_ALLOW_NESTED_LISTENERS     (1UL << 6)
> > 
> >  /*
> >   * All BPF programs must return a 32-bit value.
> > --
> > 2.43.0
> > 
> 
> 
> --
> Andy Lutomirski
> AMA Capital Management, LLC