include/linux/fs_struct.h | 11 ++++++++ kernel/audit.h | 7 +++++ kernel/auditsc.c | 56 +++++++++++++++++++++++++++++++++++---- 3 files changed, 69 insertions(+), 5 deletions(-)
It is found that on large SMP systems with a large number of CPUs and
auditing enabled, workloads that generate a massive amount of syscalls
(like open/close) in parallel on the same working directory can cause
significant spinlock contention of the lockref.lock of the working
directory's dentry.
One possible way to reduce such spinlock contention scenario is to
keep the pwd references in audit_free_names() and reuse it in the next
audit_alloc_name() call if there is no change in working directory.
A new pwd_reset field is added to audit_context to indicate, if set,
that the pwd has been reset but still holds mount and dentry references.
A new fs_pwd_equal() helper is added to fs_struct.h to check if the
fs->pwd has been changed. When audit_alloc_name() is called with the
context->pwd still valid and the fs->pwd hasn't been changed, it can
return without calling get_fs_pwd().
By adding test code to count the number of effective audit_free_names()
and audit_alloc_name() calls and the number of relevant path_put() and
path_get() calls after rebooting a patched kernel with auditing enabled
on a 2-socket 96-CPU system, there were about 30k path_get()/path_put()
out of a total of about 1 million audit_free_names()/audit_alloc_name()
calls. It is about 3% of those before the patch for this particular case.
After resetting the counters and running a parallel kernel build,
the new figures were about 202k path_get()/path_put() out of about 56M
audit_free_names()/audit_alloc_name(). That is about 0.4%.
As auditing is increasingly used in production systems due to various
legal and commercial compliance requirements, it is important that we
should try to minimize performance overhead when auditing is enabled.
Signed-off-by: Waiman Long <longman@redhat.com>
---
[v2] Merge 2 patches together & defer the path_put() call in
audit_alloc_name() to workqueue to prevent might_sleep() in atomic
context problem.
include/linux/fs_struct.h | 11 ++++++++
kernel/audit.h | 7 +++++
kernel/auditsc.c | 56 +++++++++++++++++++++++++++++++++++----
3 files changed, 69 insertions(+), 5 deletions(-)
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..3a3dda8adc11 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -40,6 +40,17 @@ static inline void get_fs_pwd(struct fs_struct *fs, struct path *pwd)
read_sequnlock_excl(&fs->seq);
}
+/* Return true if fs->pwd matches the given pwd */
+static inline bool fs_pwd_equal(struct fs_struct *fs, struct path *pwd)
+{
+ bool match;
+
+ read_seqlock_excl(&fs->seq);
+ match = (fs->pwd.dentry == pwd->dentry) && (fs->pwd.mnt == pwd->mnt);
+ read_sequnlock_excl(&fs->seq);
+ return match;
+}
+
extern bool current_chrooted(void);
static inline int current_umask(void)
diff --git a/kernel/audit.h b/kernel/audit.h
index 7c401729e21b..03f3539b10e7 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -133,6 +133,13 @@ struct audit_context {
int name_count; /* total records in names_list */
struct list_head names_list; /* struct audit_names->list anchor */
char *filterkey; /* key for rule that triggered record */
+ /*
+ * pwd_reset is set if audit_free_names() has been called from
+ * audit_reset_context() to reset pwd, but pwd is still holding dentry
+ * and mount references to be used in later audit action without
+ * the need to reacquire the references again.
+ */
+ int pwd_reset;
struct path pwd;
struct audit_aux_data *aux;
struct audit_aux_data *aux_pids;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index dd0563a8e0be..e2e8bdb06185 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -123,6 +123,11 @@ struct audit_nfcfgop_tab {
const char *s;
};
+struct audit_pwd_put {
+ struct work_struct work;
+ struct path pwd;
+};
+
static const struct audit_nfcfgop_tab audit_nfcfgs[] = {
{ AUDIT_XT_OP_REGISTER, "xt_register" },
{ AUDIT_XT_OP_REPLACE, "xt_replace" },
@@ -931,6 +936,9 @@ static inline void audit_free_names(struct audit_context *context)
{
struct audit_names *n, *next;
+ if (list_empty(&context->names_list))
+ return; /* audit_alloc_name() has not been called */
+
list_for_each_entry_safe(n, next, &context->names_list, list) {
list_del(&n->list);
if (n->name)
@@ -939,9 +947,7 @@ static inline void audit_free_names(struct audit_context *context)
kfree(n);
}
context->name_count = 0;
- path_put(&context->pwd);
- context->pwd.dentry = NULL;
- context->pwd.mnt = NULL;
+ context->pwd_reset = true;
}
static inline void audit_free_aux(struct audit_context *context)
@@ -1088,6 +1094,8 @@ static inline void audit_free_context(struct audit_context *context)
audit_reset_context(context);
audit_proctitle_free(context);
free_tree_refs(context);
+ if (context->pwd_reset)
+ path_put(&context->pwd);
kfree(context->filterkey);
kfree(context);
}
@@ -1519,7 +1527,8 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n,
/* name was specified as a relative path and the
* directory component is the cwd
*/
- if (context->pwd.dentry && context->pwd.mnt)
+ if (context->pwd.dentry && context->pwd.mnt &&
+ !context->pwd_reset)
audit_log_d_path(ab, " name=", &context->pwd);
else
audit_log_format(ab, " name=(null)");
@@ -1767,7 +1776,7 @@ static void audit_log_exit(void)
context->target_comm))
call_panic = 1;
- if (context->pwd.dentry && context->pwd.mnt) {
+ if (context->pwd.dentry && context->pwd.mnt && !context->pwd_reset) {
ab = audit_log_start(context, GFP_KERNEL, AUDIT_CWD);
if (ab) {
audit_log_d_path(ab, "cwd=", &context->pwd);
@@ -2144,6 +2153,19 @@ static void handle_path(const struct dentry *dentry)
rcu_read_unlock();
}
+static void audit_pwd_put_workfn(struct work_struct *work)
+{
+ struct audit_pwd_put *pp = container_of(work, struct audit_pwd_put, work);
+
+ path_put(&pp->pwd);
+ /*
+ * The audit_pwd_put structure can be safely freed here without UAF
+ * as all the workqueue related data have been copied out and processed
+ * before this function is called.
+ */
+ kfree(pp);
+}
+
static struct audit_names *audit_alloc_name(struct audit_context *context,
unsigned char type)
{
@@ -2164,6 +2186,30 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
list_add_tail(&aname->list, &context->names_list);
context->name_count++;
+ if (context->pwd_reset) {
+ struct audit_pwd_put *pp;
+
+ context->pwd_reset = false;
+ if (likely(fs_pwd_equal(current->fs, &context->pwd)))
+ return aname;
+
+ /*
+ * Defer the path_put() call of the old pwd to workqueue as
+ * we may be in an atomic context that cannot call path_put()
+ * directly because of might_sleep().
+ */
+ pp = kmalloc(sizeof(*pp), GFP_NOFS);
+ if (!pp) {
+ if (aname->should_free)
+ kfree(aname);
+ return NULL;
+ }
+ INIT_WORK(&pp->work, audit_pwd_put_workfn);
+ pp->pwd = context->pwd;
+ schedule_work(&pp->work);
+ context->pwd.dentry = NULL;
+ context->pwd.mnt = NULL;
+ }
if (!context->pwd.dentry)
get_fs_pwd(current->fs, &context->pwd);
return aname;
--
2.52.0
On Tue, Feb 03, 2026 at 02:44:33PM -0500, Waiman Long wrote:
> diff --git a/kernel/audit.h b/kernel/audit.h
> index 7c401729e21b..03f3539b10e7 100644
> --- a/kernel/audit.h
> +++ b/kernel/audit.h
> @@ -133,6 +133,13 @@ struct audit_context {
> int name_count; /* total records in names_list */
> struct list_head names_list; /* struct audit_names->list anchor */
> char *filterkey; /* key for rule that triggered record */
> + /*
> + * pwd_reset is set if audit_free_names() has been called from
> + * audit_reset_context() to reset pwd, but pwd is still holding dentry
> + * and mount references to be used in later audit action without
> + * the need to reacquire the references again.
That's a delicate way to say "we have mounts stuck busy inexplicably for userland"...
Generally a chdir(2) away from something immediately followed by umount(2)
is _not_ expected to leave you with -EBUSY when nobody else has been doing
anything with the mount in question.
On 2/3/26 3:05 PM, Al Viro wrote:
> On Tue, Feb 03, 2026 at 02:44:33PM -0500, Waiman Long wrote:
>
>> diff --git a/kernel/audit.h b/kernel/audit.h
>> index 7c401729e21b..03f3539b10e7 100644
>> --- a/kernel/audit.h
>> +++ b/kernel/audit.h
>> @@ -133,6 +133,13 @@ struct audit_context {
>> int name_count; /* total records in names_list */
>> struct list_head names_list; /* struct audit_names->list anchor */
>> char *filterkey; /* key for rule that triggered record */
>> + /*
>> + * pwd_reset is set if audit_free_names() has been called from
>> + * audit_reset_context() to reset pwd, but pwd is still holding dentry
>> + * and mount references to be used in later audit action without
>> + * the need to reacquire the references again.
> That's a delicate way to say "we have mounts stuck busy inexplicably for userland"...
>
> Generally a chdir(2) away from something immediately followed by umount(2)
> is _not_ expected to leave you with -EBUSY when nobody else has been doing
> anything with the mount in question.
That is actually a concern that I have at the back of my mind. I can
modify the patch to cache only the dentry and do get/put the mount every
time which is much cheaper as it is a percpu counter. In that way, a
chdir(2) followed by a umount(2) shouldn't cause a -EBUSY. Right?
Cheers,
Longman
On Tue, Feb 03, 2026 at 03:32:04PM -0500, Waiman Long wrote: > That is actually a concern that I have at the back of my mind. I can modify > the patch to cache only the dentry and do get/put the mount every time which > is much cheaper as it is a percpu counter. In that way, a chdir(2) followed > by a umount(2) shouldn't cause a -EBUSY. Right? Quite - it will just retain a reference to dentry, with filesystem shutdown being very unhappy about somebody retaining references to objects on the filesystem about to be taken out...
On Tue, Feb 03, 2026 at 09:50:02PM +0000, Al Viro wrote: > On Tue, Feb 03, 2026 at 03:32:04PM -0500, Waiman Long wrote: > > > That is actually a concern that I have at the back of my mind. I can modify > > the patch to cache only the dentry and do get/put the mount every time which > > is much cheaper as it is a percpu counter. In that way, a chdir(2) followed > > by a umount(2) shouldn't cause a -EBUSY. Right? > > Quite - it will just retain a reference to dentry, with filesystem shutdown > being very unhappy about somebody retaining references to objects on the > filesystem about to be taken out... Sarcasm aside, I wonder if we could do the following trick: * a new primitive for "grab or borrow pwd", similar to what fdget() does for struct file. If current->fs is shared, do what we do now and return true; otherwise just copy the contents of current->fs->pwd return false. * paired primitive that would take a boolean + struct path * and do path_put() if boolean is true. * syscalls that might alter ->fs, ->fs->pwd or add extra references to ->fs would start with grabbing an extra ref on entry and drop it in the end; that would make that primitive safe to use there. * audit using that thing and storing the result along with the copy of pwd; on the way out it would use the "put unless borrowed" primitive. Might or might not be useful - hard to tell without knowing the job mix of those audit-afflicted production systems. I'll try to put something along those lines together...
On 2/3/26 6:26 PM, Al Viro wrote: > On Tue, Feb 03, 2026 at 09:50:02PM +0000, Al Viro wrote: >> On Tue, Feb 03, 2026 at 03:32:04PM -0500, Waiman Long wrote: >> >>> That is actually a concern that I have at the back of my mind. I can modify >>> the patch to cache only the dentry and do get/put the mount every time which >>> is much cheaper as it is a percpu counter. In that way, a chdir(2) followed >>> by a umount(2) shouldn't cause a -EBUSY. Right? >> Quite - it will just retain a reference to dentry, with filesystem shutdown >> being very unhappy about somebody retaining references to objects on the >> filesystem about to be taken out... > Sarcasm aside, I wonder if we could do the following trick: > * a new primitive for "grab or borrow pwd", similar to what fdget() > does for struct file. If current->fs is shared, do what we do now and return > true; otherwise just copy the contents of current->fs->pwd return false. > * paired primitive that would take a boolean + struct path * and > do path_put() if boolean is true. > * syscalls that might alter ->fs, ->fs->pwd or add extra references to > ->fs would start with grabbing an extra ref on entry and drop it in the end; > that would make that primitive safe to use there. > * audit using that thing and storing the result along with the copy > of pwd; on the way out it would use the "put unless borrowed" primitive. > > Might or might not be useful - hard to tell without knowing the job mix of those > audit-afflicted production systems. > > I'll try to put something along those lines together... Interesting. So are you thinking about a reference on the pwd inside the fs_struct? I am thinking about instead of getting a reference to pwd.dentry and pwd.mnt, we can just get a reference to the pwd itself. We have to grab the fs lock in get_fs_pwd() anyway. 
This spinlock doesn't show up as being contended in the customer bug report as each process can have its own fs_struct instead of many processes sharing the same working directory. In the case of audit, fs->pwd is copied out at the beginning of the open* system call and then put back at the end of that system call. So it should last a pretty short time, i.e. the reference will be released shortly. However, that will make the set_fs_pwd() call a bit tricky to implement, but I think it is still doable. Cheers, Longman
On Tue, Feb 03, 2026 at 11:21:23PM -0500, Waiman Long wrote: > Interesting. So are you thinking about a reference on the pwd inside the > fs_struct? No. fs_struct has pwd pinned - both dentry and mount. Each thread has a reference to fs_struct instance in its task_struct (task->fs). It is possible for several threads to have their task->fs pointing to the same instance. A thread may modify its fs_struct pointer, making it point to a different fs_struct instance; that happens on unshare(2). In that case new instance is an identical copy of the original one; reference counts of mounts and dentries (both for pwd and root) are incremented to account for additional references in the copy. That assignment to current->fs happens under task_lock(current). A child gets either a reference to identical copy of parent's fs_struct or an extra reference to parent's fs_struct. In either case child->fs is set before the child gets to run; as the matter of fact, that happens before anyone besides the parent might observe the task_struct of child. That's it - no other stores to task_struct.fs are ever allowed; no other thread can change your reference under you. Other than initializing a new copy, *all* stores to fs_struct.pwd are done to current->fs.pwd; access to any other fs_struct instances is read-only. The only way another thread might change your pwd is if that thread shares fs_struct with you. They can observe its contents, as long as they take care to hold task_lock(your_thread), but no more than that. What's more, if current->fs is the sole reference to fs_struct instance, it will remain such until you spawn a child and set it to share your instance (CLONE_FS). No other thread can gain extra references to it, etc. 
What it means is that if you are holding the sole reference to current->fs, current->fs->pwd contents will remain unchanged (and pinning the same mount and dentry) through the entire syscall with very few exceptions: 1) you spawn a child with CLONE_FS - that has to be either clone(2) or io_uring_setup(2) (the latter - via worker threads). 2) you explicitly modify your pwd - in chdir(2), fchdir(2), pivot_root(2), setns(2) (with CLONE_NEWNS; it switches you to the root of namespace you've joined) As long as these exceptions are accounted for, audit could check current->fs->users *and* skip incrementing refcounts if it's equal to 1. It would need to remember whether these reference had been taken - rechecking condition won't work, since other threads might by gone by the time we leave the syscall turning "shared" to "not shared". That way we can avoid grabbing any references in a fairly common case. > I am thinking about instead of getting a reference to pwd.dentry > and pwd.mnt, we can just get a reference to the pwd itself. We have to grab > the fs lock in get_fs_pwd() anyway. HUH? pwd is a path_struct; it has no refcount of its own. And an extra reference to fs_struct will not prevent modifications by other threads that happens to share that fs_struct. > This spinlock doesn't show up as being > contended in the customer bug report as each process can has its own > fs_struct instead of many processing sharing the same working directory. <wry> I sincerely hope that you don't mean to hold current->fs->lock through the entire syscall. </wry> > In the case of audit, fs->pwd is copied out at the beginning of the open* > system call and then put back at the end of that system call. So it should > last a pretty short time, i.e. the reference will be released > shortly. However, that will make the set_fs_pwd() call a bit tricky to > implement, but I think it is still doable. Huh? 
You can't make fs_struct CoW, simply because if you clone a child with CLONE_FS and do chdir() in it, it will change *your* current directory as well (and vice versa).
On 2/4/26 1:26 AM, Al Viro wrote:
> On Tue, Feb 03, 2026 at 11:21:23PM -0500, Waiman Long wrote:
>
>> Interesting. So are you thinking about a reference on the pwd inside the
>> fs_struct?
> No. fs_struct has pwd pinned - both dentry and mount.
>
> Each thread has a reference to fs_struct instance in its task_struct (task->fs).
>
> It is possible for several threads to have their task->fs pointing to the
> same instance.
>
> A thread may modify its fs_struct pointer, making it point to a different
> fs_struct instance; that happens on unshare(2). In that case new instance
> is an identical copy of the original one; reference counts of mounts and dentries
> (both for pwd and root) are incremented to account for additional references
> in the copy. That assignment to current->fs happens under task_lock(current).
>
> A child gets either a reference to identical copy of parent's fs_struct or
> an extra reference to parent's fs_struct. In either case child->fs is
> set before the child gets to run; as the matter of fact, that happens before
> anyone besides the parent might observe the task_struct of child.
>
> That's it - no other stores to task_struct.fs are ever allowed; no other
> thread can change your reference under you.
>
>
> Other than initializing a new copy, *all* stores to fs_struct.pwd are done
> to current->fs.pwd; access to any other fs_struct instances is read-only.
> The only way another thread might change your pwd is if that thread shares
> fs_struct with you. They can observe its contents, as long as they take
> care to hold task_lock(your_thread), but no more than that.
>
> What's more, if current->fs is the sole reference to fs_struct instance,
> it will remain such until you spawn a child and set it to share your
> instance (CLONE_FS). No other thread can gain extra references to it,
> etc.
>
>
> What it means is that if you are holding the sole reference to current->fs,
> current->fs->pwd contents will remain unchanged (and pinning the same mount
> and dentry) through the entire syscall with very few exceptions:
> 1) you spawn a child with CLONE_FS - that has to be either
> clone(2) or io_uring_setup(2) (the latter - via worker threads).
> 2) you explicitly modify your pwd - in chdir(2), fchdir(2),
> pivot_root(2), setns(2) (with CLONE_NEWNS; it switches you to the root of
> namespace you've joined)
>
> As long as these exceptions are accounted for, audit could check
> current->fs->users *and* skip incrementing refcounts if it's equal to 1.
> It would need to remember whether these references had been taken -
> rechecking condition won't work, since other threads might be gone
> by the time we leave the syscall turning "shared" to "not shared".
>
> That way we can avoid grabbing any references in a fairly common
> case.
Thanks for the detailed explanation. I am thinking about something like
the code diff below. Of course, there are other corner cases like unshare(2)
that still need to be handled. Do you think something like this is viable?
Thanks,
Longman
=========================[ Cut here ]===================================
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..09c97059776d 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -25,6 +25,23 @@ void set_fs_root(struct fs_struct *fs, const struct
path *pa>
path_put(&old_root);
}
+static void unshare_fs_pwd_locked(struct fs_struct *fs)
+{
+ get_task_struct(current);
+ fs->pwd_waiter = current;
+retry:
+ spin_unlock(&fs->seq.lock);
+ /* Sleep until pwd_refs reaches 0 */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule();
+ spin_lock(&fs->seq.lock);
+ if (fs->pwd_refs)
+ goto retry;
+ __set_current_state(TASK_RUNNING);
+ fs->pwd_waiter = NULL;
+ put_task_struct(current);
+}
+
/*
* Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
* It can block.
@@ -34,7 +51,15 @@ void set_fs_pwd(struct fs_struct *fs, const struct
path *pat>
struct path old_pwd;
path_get(path);
- write_seqlock(&fs->seq);
+ /*
+ * As the pwd may be shared with other tasks, we need to break down
+ * the write_seqlock() call to its component spin_lock() and
+ * do_write_seqcount_begin() calls.
+ */
+ spin_lock(&fs->seq.lock);
+ if (unlikely(fs->pwd_refs))
+ unshare_fs_pwd_locked(fs);
+ do_write_seqcount_begin(&fs->seq.seqcount.seqcount);
old_pwd = fs->pwd;
fs->pwd = *path;
write_sequnlock(&fs->seq);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..3848189893c7 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -13,6 +13,8 @@ struct fs_struct {
int umask;
int in_exec;
struct path root, pwd;
+ int pwd_refs;
+ struct task_struct *pwd_waiter; /* set_fs_pwd() waiter */
} __randomize_layout;
extern struct kmem_cache *fs_cachep;
@@ -40,6 +42,43 @@ static inline void get_fs_pwd(struct fs_struct *fs,
struct p>
read_sequnlock_excl(&fs->seq);
}
+/*
+ * The get_fs_pwd_share/put_fs_pwd_share APIs should only be used by
callers
+ * that need to keep the pwd references for a short time to avoid blocking
+ * set_fs_pwd() caller for long time.
+ */
+static inline bool get_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ bool share = true;
+
+ read_seqlock_excl(&fs->seq);
+ *pwd = fs->pwd;
+ share = !fs->pwd_waiter;
+ if (likely(share))
+ fs->pwd_refs++;
+ else
+ path_get(pwd);
+ read_sequnlock_excl(&fs->seq);
+ return share;
+}
+
+static inline void put_fs_pwd_share(struct fs_struct *fs, struct path
*pwd, bo>
+{
+ struct task_struct *wakeup_task = NULL;
+
+ if (!share) {
+ path_put(pwd);
+ return;
+ }
+ read_seqlock_excl(&fs->seq);
+ fs->pwd_refs--;
+ if (fs->pwd_waiter && !fs->pwd_refs)
+ wakeup_task = fs->pwd_waiter;
+ read_sequnlock_excl(&fs->seq);
+ if (wakeup_task)
+ wake_up_process(wakeup_task);
+}
+
extern bool current_chrooted(void);
static inline int current_umask(void)
On Wed, Feb 04, 2026 at 01:16:15PM -0500, Waiman Long wrote:
> Thanks for the detailed explanation. I am thinking about something like
> the code diff below. Of course, there are other corner cases like unshare(2)
> that still need to be handled. Do you think something like this is viable?
Deadlocks aside, the immediate problem here is that consensus number is too
low. Take three threads sharing the same fs_struct instance. The first one
calls your get_fs_pwd_share(); then the remaining two threads call set_fs_pwd()
(e.g. by calling chdir(2) in userland code). The reference stored into
fs->pwd_waiter by the first of those two gets overwritten by that stored
by the second. When the caller of get_fs_pwd_share() gets to put_fs_pwd_share(),
only one of the sleepers gets woken up...
And it's very easy to end up with something as simple as chdir("foo") deadlocking -
we start with resolving the relative pathname we'd been given, audit wants to
record the current directory, on the theory that relative pathname is none too
useful in logs without knowing what had it been relative to. Then, in the
same thread, you call set_fs_pwd() - after all, that's the main effect of chdir(2).
Deadlock...
IOW, it's not just unshare(2) that needs to be taken care of - chdir(2) would need
to be treated differently.
On 2/4/26 3:18 PM, Al Viro wrote:
> On Wed, Feb 04, 2026 at 01:16:15PM -0500, Waiman Long wrote:
>
>
>> Thanks for the detailed explanation. I am thinking about something like
>> the code diff below. Of course, there are other corner cases like unshare(2)
>> that still need to be handled. Do you think something like this is viable?
> Deadlocks aside, the immediate problem here is that consensus number is too
> low. Take three threads sharing the same fs_struct instance. The first one
> calls your get_fs_pwd_share(); then the remaining two threads call set_fs_pwd()
> (e.g. by calling chdir(2) in userland code). The reference stored into
> fs->pwd_waiter by the first of those two gets overwritten by that stored
> by the second. When the caller of get_fs_pwd_share() gets to put_fs_pwd_share(),
> only one of the sleepers gets woken up...
>
> And it's very easy to end up with something as simple as chdir("foo") deadlocking -
> we start with resolving the relative pathname we'd been given, audit wants to
> record the current directory, on the theory that relative pathname is none too
> useful in logs without knowing what had it been relative to. Then, in the
> same thread, you call set_fs_pwd() - after all, that's the main effect of chdir(2).
> Deadlock...
>
> IOW, it's not just unshare(2) that needs to be taken care of - chdir(2) would need
> to be treated differently.
Now I realize that there is indeed a deadlock problem. Scrap that. Now I
have a simpler idea that shouldn't have this type of deadlock problem.
So what do you think about the sample code below?
Thanks,
Longman
=======================[ Cut here ]================================
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..daeeb80cf088 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -32,15 +32,19 @@ void set_fs_root(struct fs_struct *fs, const struct
path *p>
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
struct path old_pwd;
+ int xrefs;
path_get(path);
write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
+ xrefs = fs->pwd_xrefs + 1;
+ fs->pwd_xrefs = 0;
write_sequnlock(&fs->seq);
if (old_pwd.dentry)
- path_put(&old_pwd);
+ while (xrefs--)
+ path_put(&old_pwd);
}
static inline int replace_path(struct path *p, const struct path *old,
const s>
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..0d79d51de240 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,10 +8,11 @@
#include <linux/seqlock.h>
struct fs_struct {
- int users;
seqlock_t seq;
+ int users;
int umask;
int in_exec;
+ int pwd_xrefs; /* Extra references of pwd */
struct path root, pwd;
} __randomize_layout;
@@ -40,6 +41,31 @@ static inline void get_fs_pwd(struct fs_struct *fs,
struct p>
read_sequnlock_excl(&fs->seq);
}
+static inline void get_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ read_seqlock_excl(&fs->seq);
+ *pwd = fs->pwd;
+ if (fs->pwd_xrefs)
+ fs->pwd_xrefs--;
+ else
+ path_get(pwd);
+ read_sequnlock_excl(&fs->seq);
+}
+
+static inline void put_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ bool put = false;
+
+ read_seqlock_excl(&fs->seq);
+ if ((fs->pwd.dentry == pwd->dentry) && (fs->pwd.mnt == pwd->mnt))
+ fs->pwd_xrefs++;
+ else
+ put = true;
+ read_sequnlock_excl(&fs->seq);
+ if (put)
+ path_put(pwd);
+}
+
extern bool current_chrooted(void);
static inline int current_umask(void)
On Wed, Feb 04, 2026 at 10:03:33PM -0500, Waiman Long wrote: > Now I realize that there is indeed a deadlock problem. Scrap that. Now I > have a simpler idea that shouldn't have this type of deadlock problem. So > what do you think about the sample code below? That it's rather bizarre, TBH. Basically, you are allowing to park a number of (identical) references in there instead of dropping them, with your 'xrefs' being the count of skipped drops. get_share either clones a reference or uses up one of those skipped drops; put_share parks the reference if possible. And set discards everything not used up... It could be made to work, but... ouch. It looks like a special-cased variant of something fairly generic, with really confusing calling conventions. Let me poke around and see if we have any other candidates for something similar; if nothing else, current->fs->root is interesting and not just for audit pathologies... Note, BTW, that there's chroot_fs_refs() to deal with, along with free_fs_struct() (at least). This stuff is encapsulated in fs/fs_struct.c and include/linux/fs_struct.h... Oh, hell. "fs: inline current_umask() and move it to fs_struct.h" had been a bad idea; I'd missed it when it had been posted, but... exposing the damn thing all over the place *and* bringing sched.h into it? Microoptimizations are fine, and this one might have a measurable effect, but it grows the subset of the kernel we need to audit when we deal with changing the damn object by at least a couple of orders of magnitude ;-/ Sigh... Mateusz: *is* there a measurable effect? Because if there isn't, I'm very tempted to simply revert that thing. "Churn of adding fs_struct.h as needed" is not the problem - try "exposing the object internals to far larger subset of the kernel". We had interesting bugs with weird shit deciding to poke in there, locking and refcounting be damned. Encapsulation is really called for in some cases... 
And yes, I ought to have caught that one back in November and asked _then_; mea culpa.
On Thu, Feb 5, 2026 at 6:20 AM Al Viro <viro@zeniv.linux.org.uk> wrote: > Mateusz: *is* there a measurable effect? Because if there isn't, I'm > very tempted to simply revert that thing. "Churn of adding fs_struct.h > as needed" is not the problem - try "exposing the object internals to > far larger subset of the kernel". We had interesting bugs with weird > shit deciding to poke in there, locking and refcounting be damned. This one I did not bother benchmarking as it is an obvious change -- read a var directly instead of going through a func call. If this is causing trouble then it definitely needs to be reverted, but any woes only speak to the mess in the header files.
On 2/5/26 12:22 AM, Al Viro wrote: > On Wed, Feb 04, 2026 at 10:03:33PM -0500, Waiman Long wrote: > >> Now I realize that there is indeed a deadlock problem. Scrap that. Now I >> have a simpler idea that shouldn't have this type of deadlock problem. So >> what do you think about the sample code below? > That it's rather bizarre, TBH. Basically, you are allowing to park > a number of (identical) references in there instead of dropping them, > with your 'xrefs' being the count of skipped drops. get_share either > clones a reference or uses up one of those skipped drops; put_share parks > the reference if possible. And set discards everything not used up... The basic idea is to have a pool of extra pwd references inside fs_struct. When a user needs a reference, it can borrow one, if available, with the get call and then return it back later with a put call. I envision that the pool can grow to the maximum number of outstanding get's that have ever happen. When it is time to let them go, we could implement some low level put_many functions to get rid of them in one go instead releasing them one-by-one which could take a while if the pool grow big. I am not good in naming, so please let me know if you have suggestion of what naming convention should be used. > > It could be made to work, but... ouch. It looks like a special-cased > variant of something fairly generic, with really confusing calling > conventions. Let me poke around and see if we have any other candidates > for something similar; if nothing else, current->fs->root is interesting > and not just for audit pathologies... > > Note, BTW, that there's chroot_fs_refs() to deal with, along with > free_fs_struct() (at least). This stuff is encapsulated in > fs/fs_struct.c and include/linux/fs_struct.h... Oh, hell. I have sent a follow up patch with changes made to other part of fs_struct.c AFAICS. Of course, I will go over it again when I am making an official patch. 
However, I haven't looked elsewhere outside of fs_struct.[ch]. I believe the change should be pretty self-contained. Please let me know if there are other places where I should look. Cheers, Longman
On 2/4/26 10:03 PM, Waiman Long wrote:
> On 2/4/26 3:18 PM, Al Viro wrote:
>> On Wed, Feb 04, 2026 at 01:16:15PM -0500, Waiman Long wrote:
>>
>>
>>> Thanks for the detailed explanation. I am thinking about something like
>>> the code diff below. Of course, there are other corner cases like
>>> unshare(2)
>>> that still need to be handled. Do you think something like this is
>>> viable?
>> Deadlocks aside, the immediate problem here is that consensus number
>> is too
>> low. Take three threads sharing the same fs_struct instance. The
>> first one
>> calls your get_fs_pwd_share(); then the remaining two threads call
>> set_fs_pwd()
>> (e.g. by calling chdir(2) in userland code). The reference stored into
>> fs->pwd_waiter by the first of those two gets overwritten by that stored
>> by the second. When the caller of get_fs_pwd_share() gets to
>> put_fs_pwd_share(),
>> only one of the sleepers gets woken up...
>>
>> And it's very easy to end up with something as simple as chdir("foo")
>> deadlocking -
>> we start with resolving the relative pathname we'd been given, audit
>> wants to
>> record the current directory, on the theory that relative pathname is
>> none too
>> useful in logs without knowing what had it been relative to. Then, in
>> the
>> same thread, you call set_fs_pwd() - after all, that's the main
>> effect of chdir(2).
>> Deadlock...
>>
>> IOW, it's not just unshare(2) that needs to be taken care of -
>> chdir(2) would need
>> to be treated differently.
>
> Now I realize that there is indeed a deadlock problem. Scrap that. Now
> I have a simpler idea that shouldn't have this type of deadlock
> problem. So what do you think about the sample code below?
Well, a complete change log is as follows.
Cheers,
Longman
=======================[ Cut here ]================================
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index b8c46c5a38a0..67e08d8db058 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -32,15 +32,19 @@ void set_fs_root(struct fs_struct *fs, const struct
path *p>
void set_fs_pwd(struct fs_struct *fs, const struct path *path)
{
struct path old_pwd;
+ int count;
path_get(path);
write_seqlock(&fs->seq);
old_pwd = fs->pwd;
fs->pwd = *path;
+ count = fs->pwd_xrefs + 1;
+ fs->pwd_xrefs = 0;
write_sequnlock(&fs->seq);
if (old_pwd.dentry)
- path_put(&old_pwd);
+ while (count--)
+ path_put(&old_pwd);
}
static inline int replace_path(struct path *p, const struct path *old,
const s>
@@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root, const
struct>
count++;
path_get(new_root);
}
+ count += fs->pwd_xrefs;
+ fs->pwd_xrefs = 0;
write_sequnlock(&fs->seq);
}
task_unlock(p);
@@ -81,8 +87,11 @@ void chroot_fs_refs(const struct path *old_root,
const struc>
void free_fs_struct(struct fs_struct *fs)
{
+ int count = fs->pwd_xrefs + 1;
+
path_put(&fs->root);
- path_put(&fs->pwd);
+ while (count--)
+ path_put(&fs->pwd);
kmem_cache_free(fs_cachep, fs);
}
@@ -110,6 +119,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
if (fs) {
fs->users = 1;
fs->in_exec = 0;
+ fs->pwd_xrefs = 0;
seqlock_init(&fs->seq);
fs->umask = old->umask;
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 0070764b790a..0d79d51de240 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -8,10 +8,11 @@
#include <linux/seqlock.h>
struct fs_struct {
- int users;
seqlock_t seq;
+ int users;
int umask;
int in_exec;
+ int pwd_xrefs; /* Extra references of pwd */
struct path root, pwd;
} __randomize_layout;
@@ -40,6 +41,31 @@ static inline void get_fs_pwd(struct fs_struct *fs,
struct p>
read_sequnlock_excl(&fs->seq);
}
+static inline void get_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ read_seqlock_excl(&fs->seq);
+ *pwd = fs->pwd;
+ if (fs->pwd_xrefs)
+ fs->pwd_xrefs--;
+ else
+ path_get(pwd);
+ read_sequnlock_excl(&fs->seq);
+}
+
+static inline void put_fs_pwd_share(struct fs_struct *fs, struct path *pwd)
+{
+ bool put = false;
+
+ read_seqlock_excl(&fs->seq);
+ if ((fs->pwd.dentry == pwd->dentry) && (fs->pwd.mnt == pwd->mnt))
+ fs->pwd_xrefs++;
+ else
+ put = true;
+ read_sequnlock_excl(&fs->seq);
+ if (put)
+ path_put(pwd);
+}
+
extern bool current_chrooted(void);
static inline int current_umask(void)
On Wed, Feb 04, 2026 at 11:45:17PM -0500, Waiman Long wrote: > @@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root, const > struct> > count++; > path_get(new_root); > } > + count += fs->pwd_xrefs; > + fs->pwd_xrefs = 0; > write_sequnlock(&fs->seq); Nope - you only need that for threads that have ->pwd equal to old_root. Incidentally, I'd forgotten about that sucker - it kills the idea of fdget-like tricks dead, more's the pity. Third-party modification of task->fs->pwd (under task->lock and task->fs->seq), possible even with task->fs->users == 1. FWIW, I'm going through the fs_struct uses at the moment; will post whatever I get when I go down (or earlier, in the unlikely case it doesn't spill to tomorrow morning), will keep posting incremental followups until the documentation is done. I'm sick and tired of half-finished docs - let's see if I can push through that one this way ;-/
On Thu, Feb 05, 2026 at 11:53:51PM +0000, Al Viro wrote: > On Wed, Feb 04, 2026 at 11:45:17PM -0500, Waiman Long wrote: > > > @@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root, const > > struct> > > count++; > > path_get(new_root); > > } > > + count += fs->pwd_xrefs; > > + fs->pwd_xrefs = 0; > > write_sequnlock(&fs->seq); > > Nope - you only need that for threads that have ->pwd equal to old_root. > Incidentally, I'd forgotten about that sucker - it kills the idea of > fdget-like tricks dead, more's the pity. Third-party modification of > task->fs->pwd (under task->lock and task->fs->seq), possible even with > task->fs->users == 1. > > FWIW, I'm going through the fs_struct uses at the moment; will post > whatever I get when I go down (or earlier, in the unlikely case it doesn't > spill to tomorrow morning), will keep posting incremental followups > until the documentation is done. I'm sick and tired of half-finished > docs - let's see if I can push through that one this way ;-/ > All right, I'm going down - it's 3am here; the current notes are at http://ftp.linux.org.uk/people/viro/fs_struct, will update when I get up tomorrow^Wtoday.
On 2/5/26 6:53 PM, Al Viro wrote: > On Wed, Feb 04, 2026 at 11:45:17PM -0500, Waiman Long wrote: > >> @@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root, const >> struct> >> count++; >> path_get(new_root); >> } >> + count += fs->pwd_xrefs; >> + fs->pwd_xrefs = 0; >> write_sequnlock(&fs->seq); > Nope - you only need that for threads that have ->pwd equal to old_root. > Incidentally, I'd forgotten about that sucker - it kills the idea of > fdget-like tricks dead, more's the pity. Third-party modification of > task->fs->pwd (under task->lock and task->fs->seq), possible even with > task->fs->users == 1. Yes, I am aware of that when I took a further look at the patch that I sent out yesterday. I am testing the updated patch now and is trying to figure out why I get a warning from mntput_no_expire_slowpath() with a count of -1 when doing an umount. It is off by 1 somewhere. I will post the patch once I resolve this bug. Thanks, Longman
On 2/5/26 8:20 PM, Waiman Long wrote:
> On 2/5/26 6:53 PM, Al Viro wrote:
>> On Wed, Feb 04, 2026 at 11:45:17PM -0500, Waiman Long wrote:
>>
>>> @@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root,
>>> const
>>> struct>
>>> count++;
>>> path_get(new_root);
>>> }
>>> + count += fs->pwd_xrefs;
>>> + fs->pwd_xrefs = 0;
>>> write_sequnlock(&fs->seq);
>> Nope - you only need that for threads that have ->pwd equal to old_root.
>> Incidentally, I'd forgotten about that sucker - it kills the idea of
>> fdget-like tricks dead, more's the pity. Third-party modification of
>> task->fs->pwd (under task->lock and task->fs->seq), possible even with
>> task->fs->users == 1.
>
> Yes, I am aware of that when I took a further look at the patch that I
> sent out yesterday. I am testing the updated patch now and am trying
> to figure out why I get a warning from mntput_no_expire_slowpath()
> with a count of -1 when doing an umount. It is off by 1 somewhere. I
> will post the patch once I resolve this bug.
I now know why there are warnings. The problem is in the copy_mnt_ns()
function in fs/namespace.c:
__latent_entropy
struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
struct user_namespace *user_ns, struct fs_struct *new_fs)
{
:
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
rootmnt = &p->mnt;
}
if (&p->mnt == new_fs->pwd.mnt) {
new_fs->pwd.mnt = mntget(&q->mnt);
pwdmnt = &p->mnt;
}
}
It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1. I
can make this work with the new fs_struct field. I do have one question
though. Do we need to acquire write_seqlock(&new_fs->seq) if we are
changing root or pwd here or if the new_fs is in such a state that it
will never change when this copying operation is in progress?
Thanks in advance for your advice.
Cheers,
Longman
On Thu, Feb 05, 2026 at 11:11:51PM -0500, Waiman Long wrote:
> __latent_entropy
> struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
> struct user_namespace *user_ns, struct fs_struct *new_fs)
> {
> :
> if (new_fs) {
> if (&p->mnt == new_fs->root.mnt) {
> new_fs->root.mnt = mntget(&q->mnt);
> rootmnt = &p->mnt;
> }
> if (&p->mnt == new_fs->pwd.mnt) {
> new_fs->pwd.mnt = mntget(&q->mnt);
> pwdmnt = &p->mnt;
> }
> }
>
> It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1. I can
> make this work with the new fs_struct field. I do have one question though.
> Do we need to acquire write_seqlock(&new_fs->seq) if we are changing root or
> pwd here or if the new_fs is in such a state that it will never change when
> this copying operation is in progress?
In all cases when we get to that point, new_fs is always a freshly
created private copy of current->fs, not reachable from anywhere
other than stack frames of the callers, but the proof is not pretty.
copy_mnt_ns() is called only by create_new_namespaces() and it gets to
copying anything if and only if CLONE_NEWNS is in the flags. So far,
so good. The call in create_new_namespaces() is
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
and both flags and new_fs come straight from create_new_namespaces()
callers. There are 3 of those:
1) int copy_namespaces(u64 flags, struct task_struct *tsk):
new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
That gets called by copy_process(), with tsk being the child we are
creating there. tsk->fs is set a bit earlier, by copy_fs() call - either
to extra ref to current->fs (when CLONE_FS is present in flags) or to
a private copy thereof. Since in the very beginning of copy_process()
we have
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
and we are interested in case when CLONE_NEWNS is set, tsk->fs is going
to be a private copy.
2) int unshare_nsproxy_namespaces(unsigned long unshare_flags,
struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs):
*new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
new_fs ? new_fs : current->fs);
That gets called from ksys_unshare(). Earlier in ksys_unshare() we have
/*
* If unsharing namespace, must also unshare filesystem information.
*/
if (unshare_flags & CLONE_NEWNS)
unshare_flags |= CLONE_FS;
so in our case CLONE_FS is going to be set. new_fs is initially set
to NULL and, since CLONE_FS is there, the call of unshare_fs() has
replaced it with a reference to private copy of current->fs. Again,
we get a private copy.
3) int exec_task_namespaces(void):
new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
No CLONE_NEWNS in flags, so we don't get there at all.
Incidentally, one bogosity I've spotted in unshare_fs() today is
if (!(unshare_flags & CLONE_FS) || !fs)
^^^^^^ this
return 0;
The history is amusing - it had been faithfully preserved since
cf2e340f4249 ("[PATCH] unshare system call -v5: system call handler
function") back in 2006, when unshare(2) had been added; initially it
had been a stub:
+static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+{
+ struct fs_struct *fs = current->fs;
+
+ if ((unshare_flags & CLONE_FS) &&
+ (fs && atomic_read(&fs->count) > 1))
+ return -EINVAL;
+
+ return 0;
+}
The thing is, task->fs does not become NULL until exit_fs() clears it, at
which point we'd better *not* run into any syscalls, unshare(2) included
;-) The same had been true back in 2006; as the matter of fact, I don't
know if it _ever_ had not been true. I suspect that the logics had been
copied from exit_fs(), which also has a pointless check that seems to have
been added there in 1.3.26, when task->fs went kmalloc'ed. The thing
is, in 1.3.26 copy_fs() a failed allocation aborted do_fork() (today's
copy_process()) with exit_fs() never called for the child-not-to-be...
Anyway, with exit_fs() there might have been some value in making it
idempotent, but for unshare_fs() that never made sense. Nobody noticed
and nobody who'd massaged that function afterwards (myself included)
had ever asked WTF would that exit be about and what would happen if we
ever ended up going there (answer: an oops galore)...
On 2/6/26 12:22 AM, Al Viro wrote:
> On Thu, Feb 05, 2026 at 11:11:51PM -0500, Waiman Long wrote:
>
>> __latent_entropy
>> struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
>> struct user_namespace *user_ns, struct fs_struct *new_fs)
>> {
>> :
>> if (new_fs) {
>> if (&p->mnt == new_fs->root.mnt) {
>> new_fs->root.mnt = mntget(&q->mnt);
>> rootmnt = &p->mnt;
>> }
>> if (&p->mnt == new_fs->pwd.mnt) {
>> new_fs->pwd.mnt = mntget(&q->mnt);
>> pwdmnt = &p->mnt;
>> }
>> }
>>
>> It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1. I can
>> make this work with the new fs_struct field. I do have one question though.
>> Do we need to acquire write_seqlock(&new_fs->seq) if we are changing root or
>> pwd here or if the new_fs is in such a state that it will never change when
>> this copying operation is in progress?
> In all cases when we get to that point, new_fs is always a freshly
> created private copy of current->fs, not reachable from anywhere
> other than stack frames of the callers, but the proof is not pretty.
> copy_mnt_ns() is called only by create_new_namespaces() and it gets to
> copying anything if and only if CLONE_NEWNS is in the flags. So far,
> so good. The call in create_new_namespaces() is
> new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
Thanks for the detailed explanation. After further investigation as to
why the pwd_refs is set, I found out the code path leading to this
situation is the unshare syscall.
__x64_sys_unshare()
=> ksys_unshare()
=> unshare_fs(unshare_flags, &new_fs)
=> unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_cred, new_fs);
=> create_new_namespaces(unshare_flags, current, user_ns,
new_fs ? new_fs : current->fs);
Here, CLONE_FS isn't set in unshare_flags. So new_fs is NULL and
current->fs is passed down to create_new_namespaces(). That is why
pwd_refs can be set in this case. So it looks like the comment in
copy_mnt_ns() saying that the fs_struct is private is no longer true,
at least in this case. So changing fs_struct without taking the lock
can lead to unexpected results.
Should we add locking to make it safe?
Cheers,
Longman
On Fri, Feb 06, 2026 at 02:16:13PM -0500, Waiman Long wrote:
> > In all cases when we get to that point, new_fs is always a freshly
> > created private copy of current->fs, not reachable from anywhere
> > other than stack frames of the callers, but the proof is not pretty.
> > copy_mnt_ns() is called only by create_new_namespaces() and it gets to
> > copying anything if and only if CLONE_NEWNS is in the flags. So far,
> > so good. The call in create_new_namespaces() is
> > new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
>
> Thanks for the detailed explanation. After further investigation as to why
> the pwd_refs is set, I found out the code path leading to this situation is
> the unshare syscall.
>
> __x64_sys_unshare()
> => ksys_unshare()
> => unshare_fs(unshare_flags, &new_fs)
> => unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
> new_cred, new_fs);
> => create_new_namespaces(unshare_flags, current, user_ns,
> new_fs ? new_fs : current->fs);
>
> Here, CLONE_FS isn't set in unshare_flags. So new_fs is NULL and
> current->fs is passed down to create_new_namespaces(). That is why
> pwd_refs can be set in this case. So it looks like the comment in
> copy_mnt_ns() saying that the fs_struct is private is no longer true,
> at least in this case. So changing fs_struct without taking the lock
> can lead to unexpected result.
CLONE_FS is the red herring here (it will be set earlier in ksys_unshare()
if CLONE_NEWNS is there). I really hate that how convoluted that is,
though.
Look: the case where we might get passed current->fs down there is real.
It can happen in one and only one situation - CLONE_NEWNS in unshare(2)
arguments *and* current->fs->users being 1.
It wouldn't suffice, since there's chroot_fs_refs() that doesn't give
a rat's arse for task->fs being ours - it goes and replaces every
->fs->pwd or ->fs->root that happens to point to old_root.
It's still not a real race, though - both chroot_fs_refs() and that area
in copy_mnt_ns() are serialized on namespace_sem.
And yes, it's obscenely byzantine. It gets even worse when you consider
the fact that pivot_root(2) does not break only because the refcount
drops in chroot_fs_refs() are guaranteed not to reach 0 - the caller is
holding its own references to old_root.{mnt,dentry} and *that* does not
get dropped until we drop namespace_sem.
IOW, that shit is actually safe, but man, has its correctness grown fucking
convoluted...
Grabbing fs->seq in copy_mnt_ns() wouldn't make the things better, though -
it seriously relies upon the same exclusion with chroot_fs_refs() for
correctness; unless you are willing to hold it over the entire walk through
the mount tree, the proof of correctness doesn't get any simpler.
On Fri, Feb 06, 2026 at 08:29:33PM +0000, Al Viro wrote:
> Look: the case where we might get passed current->fs down there is real.
> It can happen in one and only one situation - CLONE_NEWNS in unshare(2)
> arguments *and* current->fs->users being 1.
>
> It wouldn't suffice, since there's chroot_fs_refs() that doesn't give
> a rat's arse for task->fs being ours - it goes and replaces every
> ->fs->pwd or ->fs->root that happens to point to old_root.
>
> It's still not a real race, though - both chroot_fs_refs() and that area
> in copy_mnt_ns() are serialized on namespace_sem.
>
> And yes, it's obscenely byzantine. It gets even worse when you consider
> the fact that pivot_root(2) does not break only because the refcount
> drops in chroot_fs_refs() are guaranteed not to reach 0 - the caller is
> holding its own references to old_root.{mnt,dentry} and *that* does not
> get dropped until we drop namespace_sem.
>
> IOW, that shit is actually safe, but man, has its correctness grown fucking
> convoluted...
>
> Grabbing fs->seq in copy_mnt_ns() wouldn't make the things better, though -
> it seriously relies upon the same exclusion with chroot_fs_refs() for
> correctness; unless you are willing to hold it over the entire walk through
> the mount tree, the proof of correctness doesn't get any simpler.
Speaking of the race that _is_ there: pidfd setns() vs. pivot_root().
pivot_root() (well, chroot_fs_refs()) goes over all threads and flips their
->fs->{root,pwd} for the ones that used to be at old_root. The trouble is,
in case where we have setns() with more than just CLONE_NEWNS in flags, we
end up creating a temporary fs_struct, passing that to mntns_install() and
then copying its pwd and root back to the caller's if everything goes well.
That temporary is _not_ going to be found by chroot_fs_refs(), though, so
it misses the update by pivot_root().
On Fri, Feb 06, 2026 at 08:58:04PM +0000, Al Viro wrote:
> On Fri, Feb 06, 2026 at 08:29:33PM +0000, Al Viro wrote:
>
> > Look: the case where we might get passed current->fs down there is real.
> > It can happen in one and only one situation - CLONE_NEWNS in unshare(2)
> > arguments *and* current->fs->users being 1.
> >
> > It wouldn't suffice, since there's chroot_fs_refs() that doesn't give
> > a rat's arse for task->fs being ours - it goes and replaces every
> > ->fs->pwd or ->fs->root that happens to point to old_root.
> >
> > It's still not a real race, though - both chroot_fs_refs() and that area
> > in copy_mnt_ns() are serialized on namespace_sem.
> >
> > And yes, it's obscenely byzantine. It gets even worse when you consider
> > the fact that pivot_root(2) does not break only because the refcount
> > drops in chroot_fs_refs() are guaranteed not to reach 0 - the caller is
> > holding its own references to old_root.{mnt,dentry} and *that* does not
> > get dropped until we drop namespace_sem.
> >
> > IOW, that shit is actually safe, but man, has its correctness grown fucking
> > convoluted...
> >
> > Grabbing fs->seq in copy_mnt_ns() wouldn't make the things better, though -
> > it seriously relies upon the same exclusion with chroot_fs_refs() for
> > correctness; unless you are willing to hold it over the entire walk through
> > the mount tree, the proof of correctness doesn't get any simpler.
>
> Speaking of the race that _is_ there: pidfd setns() vs. pivot_root().
> pivot_root() (well, chroot_fs_refs()) goes over all threads and flips their
> ->fs->{root,pwd} for the ones that used to be at old_root. The trouble is,
> in case where we have setns() with more than just CLONE_NEWNS in flags, we
> end up creating a temporary fs_struct, passing that to mntns_install() and
> then copying its pwd and root back to the caller's if everything goes well.
>
> That temporary is _not_ going to be found by chroot_fs_refs(), though, so
> it misses the update by pivot_root().
BTW, in the same case of setns(2) (e.g. CLONE_NEWNS | CLONE_NEWUTS in flags)
we end up defeating the check for fs->users == 1 in mntns_install() - for
the temporary fs_struct it will always be true. Unless I'm missing something
elsewhere... Christian? Looks like that went in with 303cc571d107 ("nsproxy:
attach to namespaces via pidfds")...
On 2/6/26 2:16 PM, Waiman Long wrote:
> On 2/6/26 12:22 AM, Al Viro wrote:
>> On Thu, Feb 05, 2026 at 11:11:51PM -0500, Waiman Long wrote:
>>
>>> __latent_entropy
>>> struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
>>> struct user_namespace *user_ns, struct fs_struct
>>> *new_fs)
>>> {
>>> :
>>> if (new_fs) {
>>> if (&p->mnt == new_fs->root.mnt) {
>>> new_fs->root.mnt = mntget(&q->mnt);
>>> rootmnt = &p->mnt;
>>> }
>>> if (&p->mnt == new_fs->pwd.mnt) {
>>> new_fs->pwd.mnt = mntget(&q->mnt);
>>> pwdmnt = &p->mnt;
>>> }
>>> }
>>>
>>> It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1.
>>> I can
>>> make this work with the new fs_struct field. I do have one question
>>> though.
>>> Do we need to acquire write_seqlock(&new_fs->seq) if we are changing
>>> root or
>>> pwd here or if the new_fs is in such a state that it will never
>>> change when
>>> this copying operation is in progress?
>> In all cases when we get to that point, new_fs is always a freshly
>> created private copy of current->fs, not reachable from anywhere
>> other than stack frames of the callers, but the proof is not pretty.
>> copy_mnt_ns() is called only by create_new_namespaces() and it gets to
>> copying anything if and only if CLONE_NEWNS is in the flags. So far,
>> so good. The call in create_new_namespaces() is
>> new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns,
>> user_ns, new_fs);
>
> Thanks for the detailed explanation. After further investigation as to
> why the pwd_refs is set, I found out the code path leading to this
> situation is the unshare syscall.
>
> __x64_sys_unshare()
> => ksys_unshare()
> => unshare_fs(unshare_flags, &new_fs)
> => unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
> new_cred, new_fs);
> => create_new_namespaces(unshare_flags, current, user_ns,
> new_fs ? new_fs : current->fs);
>
> Here, CLONE_FS isn't set in unshare_flags. So new_fs is NULL and
> current->fs is passed down to create_new_namespaces(). That is why
> pwd_refs can be set in this case. So it looks like the comment in
> copy_mnt_ns() saying that the fs_struct is private is no longer true,
> at least in this case. So changing fs_struct without taking the lock
> can lead to unexpected result.
>
> Should we add locking to make it safe?
I guess if private means fs->users == 1, the condition could still be true.
Cheers,
Longman
On Fri, Feb 06, 2026 at 03:04:53PM -0500, Waiman Long wrote: > I guess if private means fs->users == 1, the condition could still be true. Yes. And then there's an oh-so-lovely wart wrt. chroot_fs_refs() and exclusion for it. See another reply re unspeakable[*] elegance of the proof of correctness for that stuff ;-/ Annoyed much more than usual, Al [*] there are some limits to what I'm willing to say on a family-friendly maillist...
On Fri, Feb 06, 2026 at 03:04:53PM -0500, Waiman Long wrote:
[summary of subthread: there's an unpleasant corner case in unshare(2),
when we have a CLONE_NEWNS in flags and current->fs hadn't been shared
at all; in that case copy_mnt_ns() gets passed current->fs instead of
a private copy, which causes interesting warts in proof of correctness]
> I guess if private means fs->users == 1, the condition could still be true.
Unfortunately, it's worse than just a convoluted proof of correctness.
Consider the case when we have CLONE_NEWCGROUP in addition to CLONE_NEWNS
(and current->fs->users == 1).
We pass current->fs to copy_mnt_ns(), all right. Suppose it succeeds and
flips current->fs->{pwd,root} to corresponding locations in the new namespace.
Now we proceed to copy_cgroup_ns(), which fails (e.g. with -ENOMEM).
We call put_mnt_ns() on the namespace created by copy_mnt_ns(), it's
destroyed and its mount tree is dissolved, but... current->fs->root and
current->fs->pwd are both left pointing to now detached mounts.
They are pinning those, so it's not a UAF, but it leaves the calling
process with unshare(2) failing with -ENOMEM _and_ leaving it with
pwd and root on detached isolated mounts. The last part is clearly a bug.
There is other fun related to that mess (races with pivot_root(), including
the one between pivot_root() and fork(), of all things), but this one
is easy to isolate and fix - treat CLONE_NEWNS as "allocate a new
fs_struct even if it hadn't been shared in the first place". Sure, we could
go for something like "if both CLONE_NEWNS *and* one of the things that might
end up failing after copy_mnt_ns() call in create_new_namespaces() are set,
force allocation of new fs_struct", but let's keep it simple - the cost
of copy_fs_struct() is trivial.
Another benefit is that copy_mnt_ns() with CLONE_NEWNS *always* gets
a freshly allocated fs_struct, yet to be attached to anything. That
seriously simplifies the analysis...
FWIW, that bug had been there since the introduction of unshare(2) ;-/
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..68ccbaea7398 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -3082,7 +3082,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
/* don't need lock here; in the worst case we'll do useless copy */
- if (fs->users == 1)
+ if (!(unshare_flags & CLONE_NEWNS) && fs->users == 1)
return 0;
*new_fsp = copy_fs_struct(fs);
On Fri, Feb 06, 2026 at 05:22:18AM +0000, Al Viro wrote:
> callers. There are 3 of those:
4, actually - prepare_nsset() has
nsset->nsproxy = create_new_namespaces(0, me, current_user_ns(), me->fs);
again, without CLONE_NEWNS, so won't get to that loop in copy_mnt_ns()
(cue the obvious references to Cardinal Ximenez...)
On Fri, Feb 06, 2026 at 05:22:18AM +0000, Al Viro wrote:
> On Thu, Feb 05, 2026 at 11:11:51PM -0500, Waiman Long wrote:
>
> > __latent_entropy
> > struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
> > struct user_namespace *user_ns, struct fs_struct *new_fs)
> > {
> > :
> > if (new_fs) {
> > if (&p->mnt == new_fs->root.mnt) {
> > new_fs->root.mnt = mntget(&q->mnt);
> > rootmnt = &p->mnt;
> > }
> > if (&p->mnt == new_fs->pwd.mnt) {
> > new_fs->pwd.mnt = mntget(&q->mnt);
> > pwdmnt = &p->mnt;
> > }
> > }
> >
> > It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1. I can
> > make this work with the new fs_struct field. I do have one question though.
> > Do we need to acquire write_seqlock(&new_fs->seq) if we are changing root or
> > pwd here or if the new_fs are in such a state that it will never change when
> > this copying operation is in progress?
>
> In all cases when we get to that point, new_fs is always a freshly
> created private copy of current->fs, not reachable from anywhere
> other than stack frames of the callers, but the proof is not pretty.
> copy_mnt_ns() is called only by create_new_namespaces() and it gets to
> copying anything if and only if CLONE_NEWNS is in the flags. So far,
> so good. The call in create_new_namespaces() is
> new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, user_ns, new_fs);
> and both flags and new_fs come straight from create_new_namespaces()
> callers. There are 3 of those:
> 1) int copy_namespaces(u64 flags, struct task_struct *tsk):
> new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs);
> That gets called by copy_process(), with tsk being the child we are
> creating there. tsk->fs is set a bit earlier, by copy_fs() call - either
> to extra ref to current->fs (when CLONE_FS is present in flags) or to
> a private copy thereof. Since in the very beginning of copy_process()
> we have
> if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
> return ERR_PTR(-EINVAL);
> and we are interested in case when CLONE_NEWNS is set, tsk->fs is going
> to be a private copy.
>
> 2) int unshare_nsproxy_namespaces(unsigned long unshare_flags,
> struct nsproxy **new_nsp, struct cred *new_cred, struct fs_struct *new_fs):
> *new_nsp = create_new_namespaces(unshare_flags, current, user_ns,
> new_fs ? new_fs : current->fs);
> That gets called from ksys_unshare(). Earlier in ksys_unshare() we have
> /*
> * If unsharing namespace, must also unshare filesystem information.
> */
> if (unshare_flags & CLONE_NEWNS)
> unshare_flags |= CLONE_FS;
> so in our case CLONE_FS is going to be set. new_fs is initially set
> to NULL and, since CLONE_FS is there, the call of unshare_fs() has
> replaced it with a reference to private copy of current->fs. Again,
> we get a private copy.
>
> 3) int exec_task_namespaces(void):
> new = create_new_namespaces(0, tsk, current_user_ns(), tsk->fs);
> No CLONE_NEWNS in flags, so we don't get there at all.
>
> Incidentally, one bogosity I've spotted in unshare_fs() today is
> if (!(unshare_flags & CLONE_FS) || !fs)
> ^^^^^^ this
> return 0;
>
> The history is amusing - it had been faithfully preserved since
> cf2e340f4249 ("[PATCH] unshare system call -v5: system call handler
> function") back in 2006, when unshare(2) had been added; initially it
> had been a stub:
> +static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
> +{
> + struct fs_struct *fs = current->fs;
> +
> + if ((unshare_flags & CLONE_FS) &&
> + (fs && atomic_read(&fs->count) > 1))
> + return -EINVAL;
> +
> + return 0;
> +}
>
> The thing is, task->fs does not become NULL until exit_fs() clears it, at
> which point we'd better *not* run into any syscalls, unshare(2) included
> ;-) The same had been true back in 2006; as a matter of fact, I don't
> know if it _ever_ had not been true. I suspect that the logics had been
> copied from exit_fs(), which also has a pointless check that seems to have
> been added there in 1.3.26, when task->fs went kmalloc'ed. The thing
> is, in 1.3.26 copy_fs(), a failed allocation aborted do_fork() (today's
> copy_process()) with exit_fs() never called for the child-not-to-be...
Actually, now I remember - until 2.5.<early> we used to have kernel threads
that did exit_fs() and normally followed that by manual switch to extra
reference to init_task.fs. Mostly - by calling daemonize(), some - manually;
there had been some (mtdblock definitely among those, might be more) that
didn't bother with the second part.
The upshot had been that we could get transient task->fs switching to
NULL and then back to &init_fs (that could happen only to kernel threads)
and some of those didn't bother to switch back to non-NULL.
None of that had ever been relevant for unshare(2); the last remnants
of that bogosity went away in 2009 when we got task_lock(current) over
all changes of current->fs, with NULL never being stored there for
as long as the thread remains alive.
Mea culpa - mnt_copy_ns() (all the way back to 2001, when it had been
added as copy_namespace()) is in the same boat as unshare(2); none
of those kernel threads would ever spawn a new namespace, let alone
doing that right in the middle between exit_fs() and setting ->fs
to &init_task.fs. Which is to say, that
if (new_fs) {
in there had always been just as bogus - condition is never false ;-/
On Fri, Feb 06, 2026 at 06:31:24AM +0000, Al Viro wrote:
> Mea culpa - mnt_copy_ns()

copy_mnt_ns(), that is...
On 2/5/26 11:11 PM, Waiman Long wrote:
> On 2/5/26 8:20 PM, Waiman Long wrote:
>> On 2/5/26 6:53 PM, Al Viro wrote:
>>> On Wed, Feb 04, 2026 at 11:45:17PM -0500, Waiman Long wrote:
>>>
>>>> @@ -70,6 +74,8 @@ void chroot_fs_refs(const struct path *old_root,
>>>> const
>>>> struct>
>>>> count++;
>>>> path_get(new_root);
>>>> }
>>>> + count += fs->pwd_xrefs;
>>>> + fs->pwd_xrefs = 0;
>>>> write_sequnlock(&fs->seq);
>>> Nope - you only need that for threads that have ->pwd equal to
>>> old_root.
>>> Incidentally, I'd forgotten about that sucker - it kills the idea of
>>> fdget-like tricks dead, more's the pity. Third-party modification of
>>> task->fs->pwd (under task->lock and task->fs->seq), possible even with
>>> task->fs->users == 1.
>>
>> Yes, I am aware of that when I took a further look at the patch that
>> I sent out yesterday. I am testing the updated patch now and am
>> trying to figure out why I get a warning from
>> mntput_no_expire_slowpath() with a count of -1 when doing an umount.
>> It is off by 1 somewhere. I will post the patch once I resolve this bug.
>
> I now know why there are warnings. The problem is in the copy_mnt_ns()
> function in fs/namespace.c:
>
> __latent_entropy
> struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns,
> struct user_namespace *user_ns, struct fs_struct *new_fs)
> {
> :
> if (new_fs) {
> if (&p->mnt == new_fs->root.mnt) {
> new_fs->root.mnt = mntget(&q->mnt);
> rootmnt = &p->mnt;
> }
> if (&p->mnt == new_fs->pwd.mnt) {
> new_fs->pwd.mnt = mntget(&q->mnt);
> pwdmnt = &p->mnt;
> }
> }
>
> It is replacing the fs->pwd.mnt with a new one while pwd_refs is 1. I
> can make this work with the new fs_struct field. I do have one
> question though. Do we need to acquire write_seqlock(&new_fs->seq) if
> we are changing root or pwd here or if the new_fs are in such a state
> that it will never change when this copying operation is in progress?
Ah, there is a comment above saying

	/*
	 * Second pass: switch the tsk->fs->* elements and mark new vfsmounts
	 * as belonging to new namespace. We have already acquired a private
	 * fs_struct, so tsk->fs->lock is not needed.
	 */

So I guess it is safe to make the change without taking the lock.
Cheers,
Longman
On Tue, Feb 03, 2026 at 02:44:33PM -0500, Waiman Long wrote:

> +	/*
> +	 * Defer the path_put() call of the old pwd to workqueue as
> +	 * we may be in an atomic context that cannot call path_put()
> +	 * directly because of might_sleep().
> +	 */
> +	pp = kmalloc(sizeof(*pp), GFP_NOFS);

If that really gets called in atomic context, this kmalloc will
in itself be enough to fuck you over, won't it?
On 2/3/26 2:59 PM, Al Viro wrote:
> On Tue, Feb 03, 2026 at 02:44:33PM -0500, Waiman Long wrote:
>
>> +	/*
>> +	 * Defer the path_put() call of the old pwd to workqueue as
>> +	 * we may be in an atomic context that cannot call path_put()
>> +	 * directly because of might_sleep().
>> +	 */
>> +	pp = kmalloc(sizeof(*pp), GFP_NOFS);
> If that really gets called in atomic context, this kmalloc will
> in itself be enough to fuck you over, won't it?

Looking at the bug report again, I realized that the problem is because
of the get_cond_fs_pwd() helper in my v1 patch calling path_put() in a
spinlock critical section. So we don't need to use workqueue here, just
doing a path_put() should be good enough.

Thanks,
Longman
© 2016 - 2026 Red Hat, Inc.