[RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()

Jori Koolstra posted 2 patches 14 hours ago
[RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
Posted by Jori Koolstra 14 hours ago
Currently there is no race-free way to create and open a directory.
For regular files we have open(O_CREAT) for creating a new file inode,
and returning a pinning fd to it. The lack of such functionality for
directories means that when populating a directory tree there's always
a race involved: the inodes first need to be created, and then opened
to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
but in the time window between the creation and the opening they might
be replaced by something else.

Addressing this race without proper APIs is possible (by immediately
fstat()ing what was opened, to verify that it has the right inode type),
but difficult to get right. Hence a mkdirat_fd() syscall that creates a
directory and returns an O_DIRECTORY fd to it is useful.

This feature idea (and description) is taken from the UAPI group:
https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes

Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
---
 arch/x86/entry/syscalls/syscall_64.tbl |  1 +
 fs/internal.h                          |  1 +
 fs/namei.c                             | 26 ++++++++++++++++++++++++--
 include/linux/fcntl.h                  |  2 ++
 include/linux/syscalls.h               |  2 ++
 include/uapi/asm-generic/fcntl.h       |  3 +++
 include/uapi/asm-generic/unistd.h      |  5 ++++-
 scripts/syscall.tbl                    |  1 +
 8 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 524155d655da..dda920c26941 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -396,6 +396,7 @@
 469	common	file_setattr		sys_file_setattr
 470	common	listns			sys_listns
 471	common	rseq_slice_yield	sys_rseq_slice_yield
+472	common	mkdirat_fd		sys_mkdirat_fd
 
 #
 # Due to a historical design error, certain syscalls are numbered differently
diff --git a/fs/internal.h b/fs/internal.h
index cbc384a1aa09..2885a3e4ebdd 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -58,6 +58,7 @@ int filename_unlinkat(int dfd, struct filename *name);
 int may_linkat(struct mnt_idmap *idmap, const struct path *link);
 int filename_renameat2(int olddfd, struct filename *oldname, int newdfd,
 		 struct filename *newname, unsigned int flags);
+int filename_mkdirat_fd(int dfd, struct filename *name, umode_t mode, unsigned int flags);
 int filename_mkdirat(int dfd, struct filename *name, umode_t mode);
 int filename_mknodat(int dfd, struct filename *name, umode_t mode, unsigned int dev);
 int filename_symlinkat(struct filename *from, int newdfd, struct filename *to);
diff --git a/fs/namei.c b/fs/namei.c
index 1eb9db055292..93252937983e 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -5256,6 +5256,11 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 EXPORT_SYMBOL(vfs_mkdir);
 
 int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
+{
+	return filename_mkdirat_fd(dfd, name, mode, 0);
+}
+
+int filename_mkdirat_fd(int dfd, struct filename *name, umode_t mode, unsigned int flags)
 {
 	struct dentry *dentry;
 	struct path path;
@@ -5263,7 +5268,7 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
 	unsigned int lookup_flags = LOOKUP_DIRECTORY;
 	struct delegated_inode delegated_inode = { };
 
-retry:
+start:
 	dentry = filename_create(dfd, name, &path, lookup_flags);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
@@ -5276,7 +5281,6 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
 		if (IS_ERR(dentry))
 			error = PTR_ERR(dentry);
 	}
-	end_creating_path(&path, dentry);
 	if (is_delegated(&delegated_inode)) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
@@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
 		lookup_flags |= LOOKUP_REVAL;
 		goto retry;
 	}
+
+	if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
+		struct path new_path = { .mnt = path.mnt, .dentry = dentry };
+		error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
+	}
+	end_creating_path(&path, dentry);
 	return error;
+retry:
+	end_creating_path(&path, dentry);
+	goto start;
+}
+
+SYSCALL_DEFINE4(mkdirat_fd, int, dfd, const char __user *, pathname, umode_t, mode,
+		unsigned int, flags)
+{
+	CLASS(filename, name)(pathname);
+	if (flags & ~VALID_MKDIRAT_FD_FLAGS)
+		return -EINVAL;
+	return filename_mkdirat_fd(dfd, name, mode, flags | MKDIRAT_FD_NEED_FD);
 }
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index a332e79b3207..d2f0fdb82847 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -25,6 +25,8 @@
 #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
 #endif
 
+#define VALID_MKDIRAT_FD_FLAGS	(MKDIRAT_FD_NEED_FD)
+
 #if BITS_PER_LONG == 32
 #define IS_GETLK32(cmd)		((cmd) == F_GETLK)
 #define IS_SETLK32(cmd)		((cmd) == F_SETLK)
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 02bd6ddb6278..52e7f09d5525 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -999,6 +999,8 @@ asmlinkage long sys_lsm_get_self_attr(unsigned int attr, struct lsm_ctx __user *
 asmlinkage long sys_lsm_set_self_attr(unsigned int attr, struct lsm_ctx __user *ctx,
 				      u32 size, u32 flags);
 asmlinkage long sys_lsm_list_modules(u64 __user *ids, u32 __user *size, u32 flags);
+asmlinkage long sys_mkdirat_fd(int dfd, const char __user *pathname, umode_t mode,
+				     unsigned int flags)
 
 /*
  * Architecture-specific system calls
diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
index 613475285643..621458bf1fbf 100644
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -95,6 +95,9 @@
 #define O_NDELAY	O_NONBLOCK
 #endif
 
+/* Flags for mkdirat_fd */
+#define MKDIRAT_FD_NEED_FD	0x01
+
 #define F_DUPFD		0	/* dup */
 #define F_GETFD		1	/* get close_on_exec */
 #define F_SETFD		2	/* set/clear close_on_exec */
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index a627acc8fb5f..5bae1029f5d9 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -863,8 +863,11 @@ __SYSCALL(__NR_listns, sys_listns)
 #define __NR_rseq_slice_yield 471
 __SYSCALL(__NR_rseq_slice_yield, sys_rseq_slice_yield)
 
+#define __NR_mkdirat_fd 472
+__SYSCALL(__NR_mkdirat_fd, sys_mkdirat_fd)
+
 #undef __NR_syscalls
-#define __NR_syscalls 472
+#define __NR_syscalls 473
 
 /*
  * 32 bit systems traditionally used different
diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl
index 7a42b32b6577..db3bd97d4a1a 100644
--- a/scripts/syscall.tbl
+++ b/scripts/syscall.tbl
@@ -412,3 +412,4 @@
 469	common	file_setattr			sys_file_setattr
 470	common	listns				sys_listns
 471	common	rseq_slice_yield		sys_rseq_slice_yield
+472	common	mkdirat_fd			sys_mkdirat_fd
-- 
2.53.0
Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
Posted by Mateusz Guzik 3 hours ago
On Tue, Mar 31, 2026 at 07:19:58PM +0200, Jori Koolstra wrote:
> @@ -5286,7 +5290,25 @@ int filename_mkdirat(int dfd, struct filename *name, umode_t mode)
>  		lookup_flags |= LOOKUP_REVAL;
>  		goto retry;
>  	}
> +
> +	if (!error && (flags & MKDIRAT_FD_NEED_FD)) {
> +		struct path new_path = { .mnt = path.mnt, .dentry = dentry };
> +		error = FD_ADD(0, dentry_open(&new_path, O_DIRECTORY, current_cred()));
> +	}
> +	end_creating_path(&path, dentry);
>  	return error;


You can't do it like this. Should it turn out that no fd can be
allocated, the entire thing is going to error out while leaving the
newly created directory behind. You need to allocate the fd first, then
do the hard work, and only then either fd_install or free the fd. The
FD_ADD machinery can probably still be used, provided the real new
mkdir is properly wrapped.

It should be perfectly feasible to de facto wrap the existing mkdir
functionality with this syscall.

On top of that, similarly to what other people mentioned, the new syscall
will definitely want to support O_CLOEXEC and probably other flags down
the line.

Trying to handle this in open() is a no-go. openat2 is rather
problematic.

I tend to agree mkdirat_fd is not a good name for the syscall either,
but I don't have a suggestion I'm happy with. I think the least bad name
would follow the existing stuff and be mkdirat2 or similar.

The routine would have to start with validating the passed O_ flags, for
now only allowing O_CLOEXEC and EINVAL-ing otherwise.
Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
Posted by Yann Droneaud 11 hours ago
Hi,

Le 31/03/2026 à 19:19, Jori Koolstra a écrit :
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat_fd() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
> ---
>   arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>   fs/internal.h                          |  1 +
>   fs/namei.c                             | 26 ++++++++++++++++++++++++--
>   include/linux/fcntl.h                  |  2 ++
>   include/linux/syscalls.h               |  2 ++
>   include/uapi/asm-generic/fcntl.h       |  3 +++
>   include/uapi/asm-generic/unistd.h      |  5 ++++-
>   scripts/syscall.tbl                    |  1 +
>   8 files changed, 38 insertions(+), 3 deletions(-)

> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
> index a332e79b3207..d2f0fdb82847 100644
> --- a/include/linux/fcntl.h
> +++ b/include/linux/fcntl.h
> @@ -25,6 +25,8 @@
>   #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
>   #endif
>   
> +#define VALID_MKDIRAT_FD_FLAGS	(MKDIRAT_FD_NEED_FD)
> +

I don't see support for an O_CLOEXEC-ish flag; is the file descriptor in
close-on-exec mode by default? If so, that should be mentioned.


> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
> index 613475285643..621458bf1fbf 100644
> --- a/include/uapi/asm-generic/fcntl.h
> +++ b/include/uapi/asm-generic/fcntl.h
> @@ -95,6 +95,9 @@
>   #define O_NDELAY	O_NONBLOCK
>   #endif
>   
> +/* Flags for mkdirat_fd */
> +#define MKDIRAT_FD_NEED_FD	0x01
> +


Regards.


Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
Posted by H. Peter Anvin 11 hours ago
On March 31, 2026 1:25:03 PM PDT, Yann Droneaud <yann@droneaud.fr> wrote:
>Hi,
>
>Le 31/03/2026 à 19:19, Jori Koolstra a écrit :
>> Currently there is no way to race-freely create and open a directory.
>> For regular files we have open(O_CREAT) for creating a new file inode,
>> and returning a pinning fd to it. The lack of such functionality for
>> directories means that when populating a directory tree there's always
>> a race involved: the inodes first need to be created, and then opened
>> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
>> but in the time window between the creation and the opening they might
>> be replaced by something else.
>> 
>> Addressing this race without proper APIs is possible (by immediately
>> fstat()ing what was opened, to verify that it has the right inode type),
>> but difficult to get right. Hence, mkdirat_fd() that creates a directory
>> and returns an O_DIRECTORY fd is useful.
>> 
>> This feature idea (and description) is taken from the UAPI group:
>> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>> 
>> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>
>> ---
>>   arch/x86/entry/syscalls/syscall_64.tbl |  1 +
>>   fs/internal.h                          |  1 +
>>   fs/namei.c                             | 26 ++++++++++++++++++++++++--
>>   include/linux/fcntl.h                  |  2 ++
>>   include/linux/syscalls.h               |  2 ++
>>   include/uapi/asm-generic/fcntl.h       |  3 +++
>>   include/uapi/asm-generic/unistd.h      |  5 ++++-
>>   scripts/syscall.tbl                    |  1 +
>>   8 files changed, 38 insertions(+), 3 deletions(-)
>
>> diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
>> index a332e79b3207..d2f0fdb82847 100644
>> --- a/include/linux/fcntl.h
>> +++ b/include/linux/fcntl.h
>> @@ -25,6 +25,8 @@
>>   #define force_o_largefile() (!IS_ENABLED(CONFIG_ARCH_32BIT_OFF_T))
>>   #endif
>>   +#define VALID_MKDIRAT_FD_FLAGS	(MKDIRAT_FD_NEED_FD)
>> +
>
>I don't see support for O_CLOEXEC-ish flag, is the file descriptor in close-on-exec mode by default ? If yes, it should be mentioned.
>
>
>> diff --git a/include/uapi/asm-generic/fcntl.h b/include/uapi/asm-generic/fcntl.h
>> index 613475285643..621458bf1fbf 100644
>> --- a/include/uapi/asm-generic/fcntl.h
>> +++ b/include/uapi/asm-generic/fcntl.h
>> @@ -95,6 +95,9 @@
>>   #define O_NDELAY	O_NONBLOCK
>>   #endif
>>   +/* Flags for mkdirat_fd */
>> +#define MKDIRAT_FD_NEED_FD	0x01
>> +
>
>
>Regards.
>
>

And even if it is, POSIX already has O_CLOFORK and we should expect that that will be needed, too.
Re: [RFC PATCH 1/2] vfs: syscalls: add mkdirat_fd()
Posted by Arnd Bergmann 13 hours ago
On Tue, Mar 31, 2026, at 19:19, Jori Koolstra wrote:
> Currently there is no way to race-freely create and open a directory.
> For regular files we have open(O_CREAT) for creating a new file inode,
> and returning a pinning fd to it. The lack of such functionality for
> directories means that when populating a directory tree there's always
> a race involved: the inodes first need to be created, and then opened
> to adjust their permissions/ownership/labels/timestamps/acls/xattrs/...,
> but in the time window between the creation and the opening they might
> be replaced by something else.
>
> Addressing this race without proper APIs is possible (by immediately
> fstat()ing what was opened, to verify that it has the right inode type),
> but difficult to get right. Hence, mkdirat_fd() that creates a directory
> and returns an O_DIRECTORY fd is useful.
>
> This feature idea (and description) is taken from the UAPI group:
> https://github.com/uapi-group/kernel-features?tab=readme-ov-file#race-free-creation-and-opening-of-non-file-inodes
>
> Signed-off-by: Jori Koolstra <jkoolstra@xs4all.nl>

I checked that the calling conventions are fine, i.e. this will work
as expected across all architectures. I assume you are also aware
that the non-RFC patch will need to add the syscall number to all
.tbl files.

The hardest problem here does seem to be the naming of the
new syscall, and I'm sorry to not be able to offer any solution
either, just two observations:

- mkdirat/mkdirat_fd sounds similar to the existing
  quotactl/quotactl_fd pair, but quotactl_fd() takes a file
  descriptor argument rather than returning it, which makes
  this addition quite confusing.

- the nicest interface IMO would have been a variation of
  openat(dfd, filename, O_CREAT | O_DIRECTORY, mode)
  but that is a minefield of incompatible implementations[1],
  so we can't do that without changing the behavior for
  existing callers that currently run into an error.

       Arnd

[1] https://lwn.net/Articles/926782/