fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 137 insertions(+), 7 deletions(-)
It is being read surprisingly often (e.g., by mkdir, ls and even sed!).
This is lock-protected pointer chasing over a linked list to pay for
sprintf for every fs (32 on my boxen).
Instead cache the result.
open+read+close cycle single-threaded (ops/s):
before: 442732
after: 1063462 (+140%)
Here the main bottleneck is memcg.
Scalability-wise problems are avoidable lockref trip on open and ref
management for the file on procfs side.
The file looks like a sterotypical C from the 90s, right down to an
open-coded and slightly obfuscated linked list. I intentionally did not
clean up any of it -- I think the file will be best served by a Rust
rewrite when the time comes.
Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
v2:
- drop the procfs bits
- touch up some comments
I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/
but that ran into some procfs issues. the thing can be sped up
regardless of the procfs problem.
fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 137 insertions(+), 7 deletions(-)
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 0c7d2b7ac26c..704fc6d49f80 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -34,6 +34,23 @@
static struct file_system_type *file_systems;
static DEFINE_RWLOCK(file_systems_lock);
+#ifdef CONFIG_PROC_FS
+static unsigned long file_systems_gen;
+
+struct file_systems_string {
+ struct rcu_head rcufree;
+ unsigned long gen;
+ size_t len;
+ char string[];
+};
+static struct file_systems_string *file_systems_string;
+static void invalidate_filesystems_string(void);
+#else
+static void invalidate_filesystems_string(void)
+{
+}
+#endif
+
/* WARNING: This can be used only if we _already_ own a reference */
struct file_system_type *get_filesystem(struct file_system_type *fs)
{
@@ -83,10 +100,12 @@ int register_filesystem(struct file_system_type * fs)
return -EBUSY;
write_lock(&file_systems_lock);
p = find_filesystem(fs->name, strlen(fs->name));
- if (*p)
+ if (*p) {
res = -EBUSY;
- else
+ } else {
*p = fs;
+ invalidate_filesystems_string();
+ }
write_unlock(&file_systems_lock);
return res;
}
@@ -115,6 +134,7 @@ int unregister_filesystem(struct file_system_type * fs)
if (fs == *tmp) {
*tmp = fs->next;
fs->next = NULL;
+ invalidate_filesystems_string();
write_unlock(&file_systems_lock);
synchronize_rcu();
return 0;
@@ -235,22 +255,132 @@ int __init list_bdev_fs_names(char *buf, size_t size)
}
#ifdef CONFIG_PROC_FS
-static int filesystems_proc_show(struct seq_file *m, void *v)
+/*
+ * The fs list gets queried a lot by userspace because of libselinux, including
+ * rather surprising programs (would you guess *sed* is on the list?). In order
+ * to reduce the overhead we cache the resulting string, which normally hangs
+ * around below 512 bytes in size.
+ *
+ * As the list almost never changes, its creation is not particularly optimized
+ * for simplicity.
+ *
+ * We sort it out on read in order to not introduce a failure point for fs
+ * registration (in principle we may be unable to alloc memory for the list).
+ */
+static void invalidate_filesystems_string(void)
{
- struct file_system_type * tmp;
+ struct file_systems_string *fss;
- read_lock(&file_systems_lock);
+ lockdep_assert_held_write(&file_systems_lock);
+ file_systems_gen++;
+ fss = file_systems_string;
+ WRITE_ONCE(file_systems_string, NULL);
+ kfree_rcu(fss, rcufree);
+}
+
+static noinline int regen_filesystems_string(void)
+{
+ struct file_system_type *tmp;
+ struct file_systems_string *old, *new;
+ size_t newlen, usedlen;
+ unsigned long gen;
+
+retry:
+ lockdep_assert_not_held(&file_systems_lock);
+
+ newlen = 0;
+ write_lock(&file_systems_lock);
+ gen = file_systems_gen;
+ tmp = file_systems;
+
+ /* pre-calc space for "%s\t%s\n" for each fs */
+ while (tmp) {
+ if (!(tmp->fs_flags & FS_REQUIRES_DEV))
+ newlen += strlen("nodev");
+ newlen += strlen("\t");
+ newlen += strlen(tmp->name);
+ newlen += strlen("\n");
+ tmp = tmp->next;
+ }
+ write_unlock(&file_systems_lock);
+
+ new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
+ GFP_KERNEL);
+ if (!new)
+ return -ENOMEM;
+
+ new->gen = gen;
+ new->len = newlen;
+ new->string[newlen] = '\0';
+ write_lock(&file_systems_lock);
+ old = file_systems_string;
+
+ /*
+ * Did someone beat us to it?
+ */
+ if (old && old->gen == file_systems_gen) {
+ write_unlock(&file_systems_lock);
+ kfree(new);
+ return 0;
+ }
+
+ /*
+ * Did the list change in the meantime?
+ */
+ if (gen != file_systems_gen) {
+ write_unlock(&file_systems_lock);
+ kfree(new);
+ goto retry;
+ }
+
+ /*
+ * Populate the string.
+ *
+ * We know we have just enough space because we calculated the right
+ * size the previous time we had the lock and confirmed the list has
+ * not changed after reacquiring it.
+ */
+ usedlen = 0;
tmp = file_systems;
while (tmp) {
- seq_printf(m, "%s\t%s\n",
+ usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
tmp->name);
tmp = tmp->next;
}
- read_unlock(&file_systems_lock);
+ BUG_ON(new->len != strlen(new->string));
+
+ /*
+ * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
+ */
+ smp_store_release(&file_systems_string, new);
+ write_unlock(&file_systems_lock);
+ kfree_rcu(old, rcufree);
return 0;
}
+static int filesystems_proc_show(struct seq_file *m, void *v)
+{
+ struct file_systems_string *fss;
+
+ for (;;) {
+ scoped_guard(rcu) {
+ /*
+ * Paired with smp_store_release() in regen_filesystems_string()
+ */
+ fss = READ_ONCE(file_systems_string);
+ if (likely(fss)) {
+ seq_write(m, fss->string, fss->len);
+ return 0;
+ }
+ }
+
+ int err = regen_filesystems_string();
+ if (unlikely(err))
+ return err;
+ }
+}
+
static int __init proc_filesystems_init(void)
{
proc_create_single("filesystems", 0, NULL, filesystems_proc_show);
--
2.48.1
On Wed, Apr 22, 2026 at 08:17:11PM +0200, Mateusz Guzik wrote: > It is being read surprisingly often (e.g., by mkdir, ls and even sed!). > > This is lock-protected pointer chasing over a linked list to pay for > sprintf for every fs (32 on my boxen). > > Instead cache the result. > > open+read+close cycle single-threaded (ops/s): > before: 442732 > after: 1063462 (+140%) > > Here the main bottleneck is memcg. > > Scalability-wise problems are avoidable lockref trip on open and ref > management for the file on procfs side. > > The file looks like a sterotypical C from the 90s, right down to an > open-coded and slightly obfuscated linked list. I intentionally did not > clean up any of it -- I think the file will be best served by a Rust > rewrite when the time comes. > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> > --- Ugh, can't we at least also get rid of the rwlock stuff? Something like the appended two (completely untested) patches.
On Thu, Apr 23, 2026 at 3:36 PM Christian Brauner <brauner@kernel.org> wrote: > > On Wed, Apr 22, 2026 at 08:17:11PM +0200, Mateusz Guzik wrote: > > It is being read surprisingly often (e.g., by mkdir, ls and even sed!). > > > > This is lock-protected pointer chasing over a linked list to pay for > > sprintf for every fs (32 on my boxen). > > > > Instead cache the result. > > > > open+read+close cycle single-threaded (ops/s): > > before: 442732 > > after: 1063462 (+140%) > > > > Here the main bottleneck is memcg. > > > > Scalability-wise problems are avoidable lockref trip on open and ref > > management for the file on procfs side. > > > > The file looks like a sterotypical C from the 90s, right down to an > > open-coded and slightly obfuscated linked list. I intentionally did not > > clean up any of it -- I think the file will be best served by a Rust > > rewrite when the time comes. > > > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> > > --- > > Ugh, can't we at least also get rid of the rwlock stuff? > Something like the appended two (completely untested) patches. The procfs thing got protest last time, see https://lore.kernel.org/all/CACVxJT_eXRNjp-2sEfuxYmzTMPvu7U1R2bsKYjadVfR-W691og@mail.gmail.com/ Alexey wrote his own variant and put my name on it: https://lore.kernel.org/linux-fsdevel/c58291cd-0775-4c90-8443-ba71897b5cbb@p183/ . I'm not going to protest the patch, but I'm not signing off on the thing. Perhaps you can pull it while removing my credit? The author cc'ed. The wtf list handling cleanup + rwlock removal should certainly be their own commit. applying all of this on top of my patch should be a matter of few shell commands, alternatively I can massage this the other way around. I did not go for the other changes because a big chunk of the file is the suspicious sysv syscall marked for removal and which I have 0 intentions of testing. Finallly, a review did not like the BUG_ON (which you replaced with WARN_ON_ONCE) and the new failure mode of ENOMEM. Both bogus calculation and ENOMEM can be handled with a fallback to iterating the list like it is done now. I can do the massaging if it sounds like a plan.
On Thu, Apr 23, 2026 at 04:38:05PM +0200, Mateusz Guzik wrote: > On Thu, Apr 23, 2026 at 3:36 PM Christian Brauner <brauner@kernel.org> wrote: > > > > On Wed, Apr 22, 2026 at 08:17:11PM +0200, Mateusz Guzik wrote: > > > It is being read surprisingly often (e.g., by mkdir, ls and even sed!). > > > > > > This is lock-protected pointer chasing over a linked list to pay for > > > sprintf for every fs (32 on my boxen). > > > > > > Instead cache the result. > > > > > > open+read+close cycle single-threaded (ops/s): > > > before: 442732 > > > after: 1063462 (+140%) > > > > > > Here the main bottleneck is memcg. > > > > > > Scalability-wise problems are avoidable lockref trip on open and ref > > > management for the file on procfs side. > > > > > > The file looks like a sterotypical C from the 90s, right down to an > > > open-coded and slightly obfuscated linked list. I intentionally did not > > > clean up any of it -- I think the file will be best served by a Rust > > > rewrite when the time comes. > > > > > > Signed-off-by: Mateusz Guzik <mjguzik@gmail.com> > > > --- > > > > Ugh, can't we at least also get rid of the rwlock stuff? > > Something like the appended two (completely untested) patches. > > The procfs thing got protest last time, see > https://lore.kernel.org/all/CACVxJT_eXRNjp-2sEfuxYmzTMPvu7U1R2bsKYjadVfR-W691og@mail.gmail.com/ > > Alexey wrote his own variant and put my name on it: > https://lore.kernel.org/linux-fsdevel/c58291cd-0775-4c90-8443-ba71897b5cbb@p183/ > . I'm not going to protest the patch, but I'm not signing off on the > thing. Perhaps you can pull it while removing my credit? The author > cc'ed. > > The wtf list handling cleanup + rwlock removal should certainly be > their own commit. applying all of this on top of my patch should be a > matter of few shell commands, alternatively I can massage this the > other way around. > > I did not go for the other changes because a big chunk of the file is > the suspicious sysv syscall marked for removal and which I have 0 > intentions of testing. > > Finallly, a review did not like the BUG_ON (which you replaced with > WARN_ON_ONCE) and the new failure mode of ENOMEM. Both bogus > calculation and ENOMEM can be handled with a fallback to iterating the > list like it is done now. > > I can do the massaging if it sounds like a plan. Yeah, sounds good to me.
On Apr 22, 2026, at 12:17, Mateusz Guzik <mjguzik@gmail.com> wrote:
>
> It is being read surprisingly often (e.g., by mkdir, ls and even sed!).
Yeah, it is sad that bloat continually grows to continually make fast
systems slow.
> This is lock-protected pointer chasing over a linked list to pay for
> sprintf for every fs (32 on my boxen).
>
> Instead cache the result.
>
> open+read+close cycle single-threaded (ops/s):
> before: 442732
> after: 1063462 (+140%)
>
> Here the main bottleneck is memcg.
>
> Scalability-wise problems are avoidable lockref trip on open and ref
> management for the file on procfs side.
>
> The file looks like a sterotypical C from the 90s, right down to an
> open-coded and slightly obfuscated linked list. I intentionally did not
> clean up any of it -- I think the file will be best served by a Rust
> rewrite when the time comes.
>
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
> ---
>
> v2:
> - drop the procfs bits
> - touch up some comments
>
> I posted v1 last year https://lore.kernel.org/linux-fsdevel/20250329192821.822253-1-mjguzik@gmail.com/
>
> but that ran into some procfs issues. the thing can be sped up
> regardless of the procfs problem.
>
> fs/filesystems.c | 144 ++++++++++++++++++++++++++++++++++++++++++++---
> 1 file changed, 137 insertions(+), 7 deletions(-)
>
> diff --git a/fs/filesystems.c b/fs/filesystems.c
>
> +static noinline int regen_filesystems_string(void)
> +{
> + struct file_system_type *tmp;
> + struct file_systems_string *old, *new;
> + size_t newlen, usedlen;
> + unsigned long gen;
> +
> +retry:
> + lockdep_assert_not_held(&file_systems_lock);
> +
> + newlen = 0;
> + write_lock(&file_systems_lock);
> + gen = file_systems_gen;
> + tmp = file_systems;
> +
> + /* pre-calc space for "%s\t%s\n" for each fs */
> + while (tmp) {
> + if (!(tmp->fs_flags & FS_REQUIRES_DEV))
> + newlen += strlen("nodev");
> + newlen += strlen("\t");
> + newlen += strlen(tmp->name);
> + newlen += strlen("\n");
> + tmp = tmp->next;
> + }
> + write_unlock(&file_systems_lock);
> +
> + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
> + GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
This is introducing an error case where there was none before, in a
critical system file. It would also be possible to fall back to
seq_printf() instead of sprinft() if no buffer is available.
> + new->gen = gen;
> + new->len = newlen;
> + new->string[newlen] = '\0';
> + write_lock(&file_systems_lock);
> + old = file_systems_string;
> +
> + /*
> + * Did someone beat us to it?
> + */
> + if (old && old->gen == file_systems_gen) {
> + write_unlock(&file_systems_lock);
> + kfree(new);
> + return 0;
> + }
> +
> + /*
> + * Did the list change in the meantime?
> + */
> + if (gen != file_systems_gen) {
> + write_unlock(&file_systems_lock);
> + kfree(new);
> + goto retry;
> + }
> +
> + /*
> + * Populate the string.
> + *
> + * We know we have just enough space because we calculated the right
> + * size the previous time we had the lock and confirmed the list has
> + * not changed after reacquiring it.
> + */
It doesn't seem prudent to use raw sprintf() in a loop and justify this in
a comment, when the actual string generation is not performance critical.
This could use scnprintf() and be "safer" even if it "should just work" (tm).
> + usedlen = 0;
> tmp = file_systems;
> while (tmp) {
> - seq_printf(m, "%s\t%s\n",
> + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
> (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
> tmp->name);
> tmp = tmp->next;
> }
> - read_unlock(&file_systems_lock);
> + BUG_ON(new->len != strlen(new->string));
Adding a BUG_ON() here seems gratuitous, when there are lots of other
options to handle this case, especially if sprintf() didn't just corrupt
memory for some reason. It could allocate a larger buffer and retry,
or in the worst case return an error instead of crashing the system.
> +
> + /*
> + * Paired with consume fence in READ_ONCE() in filesystems_proc_show()
> + */
> + smp_store_release(&file_systems_string, new);
> + write_unlock(&file_systems_lock);
> + kfree_rcu(old, rcufree);
> return 0;
> }
>
Cheers, Andreas
On Thu, Apr 23, 2026 at 12:07 AM Andreas Dilger <adilger@dilger.ca> wrote:
>
> On Apr 22, 2026, at 12:17, Mateusz Guzik <mjguzik@gmail.com> wrote:
> > + new = kmalloc(offsetof(struct file_systems_string, string) + newlen + 1,
> > + GFP_KERNEL);
> > + if (!new)
> > + return -ENOMEM;
>
> This is introducing an error case where there was none before, in a
> critical system file. It would also be possible to fall back to
> seq_printf() instead of sprinft() if no buffer is available.
>
Fair, should be easy to do.
> > + /*
> > + * Populate the string.
> > + *
> > + * We know we have just enough space because we calculated the right
> > + * size the previous time we had the lock and confirmed the list has
> > + * not changed after reacquiring it.
> > + */
>
> It doesn't seem prudent to use raw sprintf() in a loop and justify this in
> a comment, when the actual string generation is not performance critical.
> This could use scnprintf() and be "safer" even if it "should just work" (tm).
>
I don't get how that's supposed help anything. I still have to call
the routine in a loop, except this time I have to keep recalculating
remaining size by hand on top of it.
What this really wants is a string library which handles easy things
like this one, FreeBSD has something like this in
https://man.freebsd.org/cgi/man.cgi?sbuf(9) , but I certainly can't be
bothered to work on an equivalent or porting it over.
If one is to "overflow-proof this" with total disregard for
performance, a series of strlcats would do it.
> > + usedlen = 0;
> > tmp = file_systems;
> > while (tmp) {
> > - seq_printf(m, "%s\t%s\n",
> > + usedlen += sprintf(&new->string[usedlen], "%s\t%s\n",
> > (tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
> > tmp->name);
> > tmp = tmp->next;
> > }
> > - read_unlock(&file_systems_lock);
> > + BUG_ON(new->len != strlen(new->string));
>
> Adding a BUG_ON() here seems gratuitous, when there are lots of other
> options to handle this case, especially if sprintf() didn't just corrupt
> memory for some reason. It could allocate a larger buffer and retry,
> or in the worst case return an error instead of crashing the system.
>
Well nobody is supposed to get the splat, except maybe someone messing
with how the string is generated, so I don't really see your point --
it's purely a bug-catching measure.
However, it can still just print a warn and fallback to the current
pointer chasing.
On Wed, Apr 22, 2026 at 04:07:15PM -0600, Andreas Dilger wrote: > On Apr 22, 2026, at 12:17, Mateusz Guzik <mjguzik@gmail.com> wrote: > > > > It is being read surprisingly often (e.g., by mkdir, ls and even sed!). > > Yeah, it is sad that bloat continually grows to continually make fast > systems slow. libselinux looking for selinuxfs, at a guess, not that I had any idea why sed might possibly care about that... <RTFS> -i support, apparently. No printable comment...
© 2016 - 2026 Red Hat, Inc.