To get an operable version of an O_PATH file descriptor, it is possible
to use openat(fd, ".", O_DIRECTORY) for directories, but other files
currently require going through open("/proc/<pid>/fd/<nr>"), which
depends on a functioning procfs.
This patch adds the OPENAT2_EMPTY_PATH flag to openat2(2). If passed,
LOOKUP_EMPTY is set at path resolve time.
Note: This implies that you can no longer rely on keeping procfs
unmounted (e.g. inside a container without procfs mounted and with
CAP_SYS_ADMIN dropped) to prevent O_PATH fds from being re-opened
read-write.
Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
fs/fcntl.c | 4 ++--
fs/open.c | 11 +++++------
include/linux/fcntl.h | 5 ++++-
include/uapi/linux/openat2.h | 4 ++++
4 files changed, 15 insertions(+), 9 deletions(-)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index beab8080badf..d9ae3c71edfe 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -1169,8 +1169,8 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
- BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
- HWEIGHT32(
+ BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
+ HWEIGHT64(
(VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
__FMODE_EXEC));
diff --git a/fs/open.c b/fs/open.c
index 91f1139591ab..e019ddecc73c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1160,12 +1160,12 @@ struct file *kernel_file_open(const struct path *path, int flags,
EXPORT_SYMBOL_GPL(kernel_file_open);
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
-#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
+#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC | OPENAT2_EMPTY_PATH)
inline struct open_how build_open_how(int flags, umode_t mode)
{
struct open_how how = {
- .flags = flags & VALID_OPEN_FLAGS,
+ .flags = ((unsigned int) flags) & VALID_OPEN_FLAGS,
.mode = mode & S_IALLUGO,
};
@@ -1185,9 +1185,6 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
int lookup_flags = 0;
int acc_mode = ACC_MODE(flags);
- BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
- "struct open_flags doesn't yet handle flags > 32 bits");
-
/*
* Strip flags that aren't relevant in determining struct open_flags.
*/
@@ -1281,6 +1278,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
lookup_flags |= LOOKUP_DIRECTORY;
if (!(flags & O_NOFOLLOW))
lookup_flags |= LOOKUP_FOLLOW;
+ if (flags & OPENAT2_EMPTY_PATH)
+ lookup_flags |= LOOKUP_EMPTY;
if (how->resolve & RESOLVE_NO_XDEV)
lookup_flags |= LOOKUP_NO_XDEV;
@@ -1362,7 +1361,7 @@ static int do_sys_openat2(int dfd, const char __user *filename,
if (unlikely(err))
return err;
- CLASS(filename, name)(filename);
+ CLASS(filename_flags, name)(filename, op.lookup_flags);
return FD_ADD(how->flags, do_file_open(dfd, name, &op));
}
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..d1bb87ff70e3 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -7,10 +7,13 @@
/* List of all valid flags for the open/openat flags argument: */
#define VALID_OPEN_FLAGS \
+ /* lower 32-bit flags */ \
(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
- O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
+ O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | \
+ /* upper 32-bit flags (openat2(2) only) */ \
+ OPENAT2_EMPTY_PATH)
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
index a5feb7604948..c34f32e6fa96 100644
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -40,4 +40,8 @@ struct open_how {
return -EAGAIN if that's not
possible. */
+/* openat2(2) exclusive flags are defined in the upper 32 bits of
+ open_how->flags */
+#define OPENAT2_EMPTY_PATH 0x100000000 /* (1ULL << 32) */
+
#endif /* _UAPI_LINUX_OPENAT2_H */
--
2.53.0
On Thu, 2026-03-26 at 19:20 +0100, Jori Koolstra wrote:
> To get an operable version of an O_PATH file descriptor, it is possible
> to use openat(fd, ".", O_DIRECTORY) for directories, but other files
> currently require going through open("/proc/<pid>/fd/<nr>"), which
> depends on a functioning procfs.
>
> This patch adds the OPENAT2_EMPTY_PATH flag to openat2(2). If passed,
> LOOKUP_EMPTY is set at path resolve time.
>
> Note: This implies that you cannot rely anymore on disabling procfs from
> being mounted (e.g. inside a container without procfs mounted and with
> CAP_SYS_ADMIN dropped) to prevent O_PATH fds from being re-opened
> read-write.
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/fcntl.c | 4 ++--
> fs/open.c | 11 +++++------
> include/linux/fcntl.h | 5 ++++-
> include/uapi/linux/openat2.h | 4 ++++
> 4 files changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index beab8080badf..d9ae3c71edfe 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -1169,8 +1169,8 @@ static int __init fcntl_init(void)
> * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> * is defined as O_NONBLOCK on some platforms and not on others.
> */
> - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> - HWEIGHT32(
> + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> + HWEIGHT64(
> (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> __FMODE_EXEC));
>
> diff --git a/fs/open.c b/fs/open.c
> index 91f1139591ab..e019ddecc73c 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1160,12 +1160,12 @@ struct file *kernel_file_open(const struct path *path, int flags,
> EXPORT_SYMBOL_GPL(kernel_file_open);
>
> #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
> -#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
> +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC | OPENAT2_EMPTY_PATH)
>
> inline struct open_how build_open_how(int flags, umode_t mode)
> {
> struct open_how how = {
> - .flags = flags & VALID_OPEN_FLAGS,
> + .flags = ((unsigned int) flags) & VALID_OPEN_FLAGS,
> .mode = mode & S_IALLUGO,
> };
>
> @@ -1185,9 +1185,6 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> int lookup_flags = 0;
> int acc_mode = ACC_MODE(flags);
>
> - BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
> - "struct open_flags doesn't yet handle flags > 32 bits");
> -
> /*
> * Strip flags that aren't relevant in determining struct open_flags.
> */
> @@ -1281,6 +1278,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> lookup_flags |= LOOKUP_DIRECTORY;
> if (!(flags & O_NOFOLLOW))
> lookup_flags |= LOOKUP_FOLLOW;
> + if (flags & OPENAT2_EMPTY_PATH)
> + lookup_flags |= LOOKUP_EMPTY;
>
> if (how->resolve & RESOLVE_NO_XDEV)
> lookup_flags |= LOOKUP_NO_XDEV;
> @@ -1362,7 +1361,7 @@ static int do_sys_openat2(int dfd, const char __user *filename,
> if (unlikely(err))
> return err;
>
> - CLASS(filename, name)(filename);
> + CLASS(filename_flags, name)(filename, op.lookup_flags);
> return FD_ADD(how->flags, do_file_open(dfd, name, &op));
> }
>
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..d1bb87ff70e3 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -7,10 +7,13 @@
>
> /* List of all valid flags for the open/openat flags argument: */
> #define VALID_OPEN_FLAGS \
> + /* lower 32-bit flags */ \
> (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
> O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
> FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> - O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
> + O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | \
> + /* upper 32-bit flags (openat2(2) only) */ \
> + OPENAT2_EMPTY_PATH)
>
> /* List of all valid flags for the how->resolve argument: */
> #define VALID_RESOLVE_FLAGS \
> diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
> index a5feb7604948..c34f32e6fa96 100644
> --- a/include/uapi/linux/openat2.h
> +++ b/include/uapi/linux/openat2.h
> @@ -40,4 +40,8 @@ struct open_how {
> return -EAGAIN if that's not
> possible. */
>
> +/* openat2(2) exclusive flags are defined in the upper 32 bits of
> + open_how->flags */
> +#define OPENAT2_EMPTY_PATH 0x100000000 /* (1ULL << 32) */
> +
> #endif /* _UAPI_LINUX_OPENAT2_H */
Looks sane to me. Can this be merged apart from the rest of the series?
It doesn't seem like the transitive stuff is dependent on this.
Reviewed-by: Jeff Layton <jlayton@kernel.org>
On 2026-03-30, Jeff Layton <jlayton@kernel.org> wrote:
> Looks sane to me. Can this be merged apart from the rest of the series?
> It doesn't seem like the transitive stuff is dependent on this.
No it isn't. I mentioned this in another mail but my thinking from a
long time ago was that we should harden these re-open paths if we want
to fully support O_EMPTYPATH -- but we already allow all of this stuff
today so we might as well merge this thing and make everyone's lives
easier.
The transitive stuff also needs a lot more work IMHO, and I suspect the
full version is going to be a hard sell.
My only comment is that I would like this to be usable in open() as it
is one of the very few cases of an O_* flag that is actually backwards
compatible with all of the brokenness of open() and the systemd folks
would probably want to use it (they can't use openat2() yet).
--
Aleksa Sarai
https://www.cyphar.com/
> Op 01-04-2026 14:23 CEST schreef Aleksa Sarai <cyphar@cyphar.com>:
>
>
> On 2026-03-30, Jeff Layton <jlayton@kernel.org> wrote:
> > Looks sane to me. Can this be merged apart from the rest of the series?
> > It doesn't seem like the transitive stuff is dependent on this.
>
> My only comment is that I would like this to be usable in open() as it
> is one of the very few cases of an O_* flag that is actually backwards
> compatible with all of the brokenness of open() and the systemd folks
> would probably want to use it (they can't use openat2() yet).
OK. I'll just wait a bit on what Christian has to say about adding it to
openat(), since he initially rejected that, before I write up the new patch.
>
> --
> Aleksa Sarai
> https://www.cyphar.com/
Thanks,
Jori.
> Op 30-03-2026 14:12 CEST schreef Jeff Layton <jlayton@kernel.org>:
>
>
> On Thu, 2026-03-26 at 19:20 +0100, Jori Koolstra wrote:
> > To get an operable version of an O_PATH file descriptor, it is possible
> > to use openat(fd, ".", O_DIRECTORY) for directories, but other files
> > currently require going through open("/proc/<pid>/fd/<nr>"), which
> > depends on a functioning procfs.
> >
> > This patch adds the OPENAT2_EMPTY_PATH flag to openat2(2). If passed,
> > LOOKUP_EMPTY is set at path resolve time.
> >
> > Note: This implies that you cannot rely anymore on disabling procfs from
> > being mounted (e.g. inside a container without procfs mounted and with
> > CAP_SYS_ADMIN dropped) to prevent O_PATH fds from being re-opened
> > read-write.
> >
> > Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>
> Looks sane to me. Can this be merged apart from the rest of the series?
> It doesn't seem like the transitive stuff is dependent on this.
>
> Reviewed-by: Jeff Layton <jlayton@kernel.org>
Yes, there is no dependence. However, Aleksa suggested that we DO add it
to openat() as well because then systemd folks can use it too. I don't
know if there are any objections to that right now.
Thanks,
Jori.
On 2026-03-26, Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> To get an operable version of an O_PATH file descriptor, it is possible
> to use openat(fd, ".", O_DIRECTORY) for directories, but other files
> currently require going through open("/proc/<pid>/fd/<nr>"), which
> depends on a functioning procfs.
>
> This patch adds the OPENAT2_EMPTY_PATH flag to openat2(2). If passed,
> LOOKUP_EMPTY is set at path resolve time.
>
> Note: This implies that you cannot rely anymore on disabling procfs from
> being mounted (e.g. inside a container without procfs mounted and with
> CAP_SYS_ADMIN dropped) to prevent O_PATH fds from being re-opened
> read-write.
Actually, this flag doesn't need to be openat2(2) only -- all existing
kernels will reject a pathname of "" with ENOENT. This means that
O_EMPTYPATH being set acting as a no-op is fine for older kernels (no
program will get an unexpected result from O_EMPTYPATH).
In my view, adding it to openat(2) is preferable because it means that
systemd et al. can use it (they currently block openat2(2) with
seccomp). This is what I did in the original openat2(2) patchset[1].
[1]: https://lore.kernel.org/lkml/20190930183316.10190-4-cyphar@cyphar.com/
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
> fs/fcntl.c | 4 ++--
> fs/open.c | 11 +++++------
> include/linux/fcntl.h | 5 ++++-
> include/uapi/linux/openat2.h | 4 ++++
> 4 files changed, 15 insertions(+), 9 deletions(-)
>
> diff --git a/fs/fcntl.c b/fs/fcntl.c
> index beab8080badf..d9ae3c71edfe 100644
> --- a/fs/fcntl.c
> +++ b/fs/fcntl.c
> @@ -1169,8 +1169,8 @@ static int __init fcntl_init(void)
> * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
> * is defined as O_NONBLOCK on some platforms and not on others.
> */
> - BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ !=
> - HWEIGHT32(
> + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ !=
> + HWEIGHT64(
> (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) |
> __FMODE_EXEC));
>
> diff --git a/fs/open.c b/fs/open.c
> index 91f1139591ab..e019ddecc73c 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1160,12 +1160,12 @@ struct file *kernel_file_open(const struct path *path, int flags,
> EXPORT_SYMBOL_GPL(kernel_file_open);
>
> #define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
> -#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
> +#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC | OPENAT2_EMPTY_PATH)
>
> inline struct open_how build_open_how(int flags, umode_t mode)
> {
> struct open_how how = {
> - .flags = flags & VALID_OPEN_FLAGS,
> + .flags = ((unsigned int) flags) & VALID_OPEN_FLAGS,
> .mode = mode & S_IALLUGO,
> };
>
> @@ -1185,9 +1185,6 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> int lookup_flags = 0;
> int acc_mode = ACC_MODE(flags);
>
> - BUILD_BUG_ON_MSG(upper_32_bits(VALID_OPEN_FLAGS),
> - "struct open_flags doesn't yet handle flags > 32 bits");
> -
> /*
> * Strip flags that aren't relevant in determining struct open_flags.
> */
> @@ -1281,6 +1278,8 @@ inline int build_open_flags(const struct open_how *how, struct open_flags *op)
> lookup_flags |= LOOKUP_DIRECTORY;
> if (!(flags & O_NOFOLLOW))
> lookup_flags |= LOOKUP_FOLLOW;
> + if (flags & OPENAT2_EMPTY_PATH)
> + lookup_flags |= LOOKUP_EMPTY;
>
> if (how->resolve & RESOLVE_NO_XDEV)
> lookup_flags |= LOOKUP_NO_XDEV;
> @@ -1362,7 +1361,7 @@ static int do_sys_openat2(int dfd, const char __user *filename,
> if (unlikely(err))
> return err;
>
> - CLASS(filename, name)(filename);
> + CLASS(filename_flags, name)(filename, op.lookup_flags);
> return FD_ADD(how->flags, do_file_open(dfd, name, &op));
> }
>
> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..d1bb87ff70e3 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -7,10 +7,13 @@
>
> /* List of all valid flags for the open/openat flags argument: */
> #define VALID_OPEN_FLAGS \
> + /* lower 32-bit flags */ \
> (O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
> O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
> FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
> - O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)
> + O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE | \
> + /* upper 32-bit flags (openat2(2) only) */ \
> + OPENAT2_EMPTY_PATH)
>
> /* List of all valid flags for the how->resolve argument: */
> #define VALID_RESOLVE_FLAGS \
> diff --git a/include/uapi/linux/openat2.h b/include/uapi/linux/openat2.h
> index a5feb7604948..c34f32e6fa96 100644
> --- a/include/uapi/linux/openat2.h
> +++ b/include/uapi/linux/openat2.h
> @@ -40,4 +40,8 @@ struct open_how {
> return -EAGAIN if that's not
> possible. */
>
> +/* openat2(2) exclusive flags are defined in the upper 32 bits of
> + open_how->flags */
> +#define OPENAT2_EMPTY_PATH 0x100000000 /* (1ULL << 32) */
> +
> #endif /* _UAPI_LINUX_OPENAT2_H */
> --
> 2.53.0
>
--
Aleksa Sarai
https://www.cyphar.com/
> Op 27-03-2026 07:26 CET schreef Aleksa Sarai <cyphar@cyphar.com>:
>
>
> On 2026-03-26, Jori Koolstra <jkoolstra@xs4all.nl> wrote:
> > To get an operable version of an O_PATH file descriptor, it is possible
> > to use openat(fd, ".", O_DIRECTORY) for directories, but other files
> > currently require going through open("/proc/<pid>/fd/<nr>"), which
> > depends on a functioning procfs.
> >
> > This patch adds the OPENAT2_EMPTY_PATH flag to openat2(2). If passed,
> > LOOKUP_EMPTY is set at path resolve time.
> >
> > Note: This implies that you cannot rely anymore on disabling procfs from
> > being mounted (e.g. inside a container without procfs mounted and with
> > CAP_SYS_ADMIN dropped) to prevent O_PATH fds from being re-opened
> > read-write.
>
> Actually, this flag doesn't need to be openat2(2) only -- all existing
> kernels will reject a pathname of "" with ENOENT. This means that
> O_EMPTYPATH being set acting as a no-op is fine for older kernels (no
> program will get an unexpected result from O_EMPTYPATH).
>
> In my view, adding it to openat(2) is preferable because it means that
> systemd et al. can use it (they currently block openat2(2) with
> seccomp). This is what I did in the original openat2(2) patchset[1].
I changed this in response to feedback from Christian [1]. He did mention
that if someone really wants to add it to openat(), we should wait for their
reasons :)
But if systemd could use it, I think it is worth considering. I am not sure
why Christian was against it in the first place. Maybe to save flag space for
things that really really need to be also in openat().
[1]: https://lore.kernel.org/linux-fsdevel/20260224-karotten-wegnimmt-79410ef99aeb@brauner/
Thanks,
Jori.
© 2016 - 2026 Red Hat, Inc.