Add way to query the children of a particular mount. This is a more
flexible way to iterate the mount tree than having to parse the complete
/proc/self/mountinfo.
Allow listing either
- immediate child mounts only, or
- recursively all descendant mounts (depth first).
Lookup the mount by the new 64bit mount ID. If a mount needs to be queried
based on path, then statx(2) can be used to first query the mount ID
belonging to the path.
Return an array of new (64bit) mount ID's. Without privileges only mounts
are listed which are reachable from the task's root.
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
---
fs/namespace.c | 93 ++++++++++++++++++++++++++++++++++++++
include/linux/syscalls.h | 3 ++
include/uapi/linux/mount.h | 9 ++++
3 files changed, 105 insertions(+)
diff --git a/fs/namespace.c b/fs/namespace.c
index a980c250a3a6..0afe2344bba6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -4958,6 +4958,99 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
return ret;
}
+static struct mount *listmnt_first(struct mount *root)
+{
+ return list_first_entry_or_null(&root->mnt_mounts, struct mount, mnt_child);
+}
+
+static struct mount *listmnt_next(struct mount *curr, struct mount *root, bool recurse)
+{
+ if (recurse)
+ return next_mnt(curr, root);
+ if (!list_is_head(curr->mnt_child.next, &root->mnt_mounts))
+ return list_next_entry(curr, mnt_child);
+ return NULL;
+}
+
+static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
+ const struct path *root, unsigned int flags)
+{
+ struct mount *r, *m = real_mount(mnt);
+ struct path rootmnt = {
+ .mnt = root->mnt,
+ .dentry = root->mnt->mnt_root
+ };
+ long ctr = 0;
+ bool reachable_only = true;
+ bool recurse = flags & LISTMOUNT_RECURSIVE;
+ int err;
+
+ err = security_sb_statfs(mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (flags & LISTMOUNT_UNREACHABLE) {
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ reachable_only = false;
+ }
+
+ if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
+ return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
+
+ for (r = listmnt_first(m); r; r = listmnt_next(r, m, recurse)) {
+ if (reachable_only &&
+ !is_path_reachable(r, r->mnt.mnt_root, root))
+ continue;
+
+ if (ctr >= bufsize)
+ return -EOVERFLOW;
+ if (put_user(r->mnt_id_unique, buf + ctr))
+ return -EFAULT;
+ ctr++;
+ if (ctr < 0)
+ return -ERANGE;
+ }
+ return ctr;
+}
+
+SYSCALL_DEFINE4(listmount, const struct __mount_arg __user *, req,
+ u64 __user *, buf, size_t, bufsize, unsigned int, flags)
+{
+ struct __mount_arg kreq;
+ struct vfsmount *mnt;
+ struct path root;
+ u64 mnt_id;
+ long err;
+
+ if (flags & ~(LISTMOUNT_UNREACHABLE | LISTMOUNT_RECURSIVE))
+ return -EINVAL;
+
+ if (copy_from_user(&kreq, req, sizeof(kreq)))
+ return -EFAULT;
+ mnt_id = kreq.mnt_id;
+
+ down_read(&namespace_sem);
+ if (mnt_id == LSMT_ROOT)
+ mnt = ¤t->nsproxy->mnt_ns->root->mnt;
+ else
+ mnt = lookup_mnt_in_ns(mnt_id, current->nsproxy->mnt_ns);
+
+ err = -ENOENT;
+ if (mnt) {
+ get_fs_root(current->fs, &root);
+ /* Skip unreachable for LSMT_ROOT */
+ if (mnt_id == LSMT_ROOT && !(flags & LISTMOUNT_UNREACHABLE))
+ mnt = root.mnt;
+ err = do_listmount(mnt, buf, bufsize, &root, flags);
+ path_put(&root);
+ }
+ up_read(&namespace_sem);
+
+ return err;
+}
+
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index ba371024d902..38f3da7e04d1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -413,6 +413,9 @@ asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
asmlinkage long sys_statmount(const struct __mount_arg __user *req,
struct statmnt __user *buf, size_t bufsize,
unsigned int flags);
+asmlinkage long sys_listmount(const struct __mount_arg __user *req,
+ u64 __user *buf, size_t bufsize,
+ unsigned int flags);
asmlinkage long sys_truncate(const char __user *path, long length);
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
#if BITS_PER_LONG == 32
diff --git a/include/uapi/linux/mount.h b/include/uapi/linux/mount.h
index d2c988ab526b..704c408cc662 100644
--- a/include/uapi/linux/mount.h
+++ b/include/uapi/linux/mount.h
@@ -194,4 +194,13 @@ struct __mount_arg {
#define STMT_MNT_POINT 0x00000010U /* Want/got mnt_point */
#define STMT_FS_TYPE 0x00000020U /* Want/got fs_type */
+/* listmount(2) flags */
+#define LISTMOUNT_UNREACHABLE 0x01 /* List unreachable mounts too */
+#define LISTMOUNT_RECURSIVE 0x02 /* List a mount tree */
+
+/*
+ * Special @mnt_id values that can be passed to listmount
+ */
+#define LSMT_ROOT 0xffffffffffffffff /* root mount */
+
#endif /* _UAPI_LINUX_MOUNT_H */
--
2.41.0
Hi, On Wed, Oct 25, 2023 at 04:02:03PM +0200, Miklos Szeredi wrote: > Add way to query the children of a particular mount. This is a more > flexible way to iterate the mount tree than having to parse the complete > /proc/self/mountinfo. > > Allow listing either > > - immediate child mounts only, or > > - recursively all descendant mounts (depth first). > > Lookup the mount by the new 64bit mount ID. If a mount needs to be queried > based on path, then statx(2) can be used to first query the mount ID > belonging to the path. > > Return an array of new (64bit) mount ID's. Without privileges only mounts > are listed which are reachable from the task's root. > > Signed-off-by: Miklos Szeredi <mszeredi@redhat.com> with this patch in the tree, all sh4 builds fail with ICE. during RTL pass: final In file included from fs/namespace.c:11: fs/namespace.c: In function '__se_sys_listmount': include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275 I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version does not make a difference. Has anyone else seen the same problem ? If so, any idea what to do about it ? Thanks, Guenter
Hi Guenter, > with this patch in the tree, all sh4 builds fail with ICE. > > during RTL pass: final > In file included from fs/namespace.c:11: > fs/namespace.c: In function '__se_sys_listmount': > include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275 > > I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version > does not make a difference. Has anyone else seen the same problem ? > If so, any idea what to do about it ? I'm not seeing any problems building the SH kernel except some -Werror=missing-prototypes warnings. I'm using gcc 11.1 from here [1]. Adrian PS: Please always CC linux-sh and the SH maintainers when reporting issues. > [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/11.1.0/ -- .''`. John Paul Adrian Glaubitz : :' : Debian Developer `. `' Physicist `- GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
On 1/23/24 06:14, John Paul Adrian Glaubitz wrote:
> Hi Guenter,
>
>> with this patch in the tree, all sh4 builds fail with ICE.
>>
>> during RTL pass: final
>> In file included from fs/namespace.c:11:
>> fs/namespace.c: In function '__se_sys_listmount':
>> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275
>>
>> I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version
>> does not make a difference. Has anyone else seen the same problem ?
>> If so, any idea what to do about it ?
>
> I'm not seeing any problems building the SH kernel except some -Werror=missing-prototypes warnings.
>
The problem has been fixed thanks to some guidance from Linus. I did disable
CONFIG_WERROR for sh (and a few other architectures) because it now always
results in pointless build failures on test builds due to commit 0fcb70851fbf
("Makefile.extrawarn: turn on missing-prototypes globally").
> I'm using gcc 11.1 from here [1].
>
> Adrian
>
> PS: Please always CC linux-sh and the SH maintainers when reporting issues.
>
When reporting anything in the linux kernel, it is always a tight rope between
copying too few or too many people or mailing lists. I have been scolded for both.
Replying to the original patch historically results in the fewest complaints,
so I think I'll stick to that.
Guenter
Hi Guenter, > with this patch in the tree, all sh4 builds fail with ICE. > > during RTL pass: final > In file included from fs/namespace.c:11: > fs/namespace.c: In function '__se_sys_listmount': > include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275 > > I tested with gcc 8.2, 11.3, 11.4, and 12.3. The compiler version > does not make a difference. Has anyone else seen the same problem ? > If so, any idea what to do about it ? I'm not seeing any problems building the SH kernel except some -Werror=missing-prototypes warnings. I'm using gcc 11.1 from here [1]. Adrian PS: Please always CC linux-sh and the SH maintainers when reporting issues. > [1] https://mirrors.edge.kernel.org/pub/tools/crosstool/files/bin/x86_64/11.1.0/ -- .''`. John Paul Adrian Glaubitz : :' : Debian Developer `. `' Physicist `- GPG: 62FF 8A75 84E0 2956 9546 0006 7426 3B37 F5B5 F913
On Wed, 10 Jan 2024 at 14:23, Guenter Roeck <linux@roeck-us.net> wrote:
>
> with this patch in the tree, all sh4 builds fail with ICE.
>
> during RTL pass: final
> In file included from fs/namespace.c:11:
> fs/namespace.c: In function '__se_sys_listmount':
> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275
We do have those very ugly SYSCALL_DEFINEx() macros, but I'm not
seeing _anything_ that would be odd about the listmount case.
And the "__se_sys" thing in particular is just a fairly trivial wrapper.
It does use that asmlinkage_protect() thing, and it is unquestionably
horrendously ugly (staring too long at <linux/syscalls.h> has been
known to cause madness and despair), but we do that for *every* single
system call and I don't see why the new listmount entry would be
different.
Linus
On 1/10/24 16:32, Linus Torvalds wrote:
> On Wed, 10 Jan 2024 at 14:23, Guenter Roeck <linux@roeck-us.net> wrote:
>>
>> with this patch in the tree, all sh4 builds fail with ICE.
>>
>> during RTL pass: final
>> In file included from fs/namespace.c:11:
>> fs/namespace.c: In function '__se_sys_listmount':
>> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275
>
> We do have those very ugly SYSCALL_DEFINEx() macros, but I'm not
> seeing _anything_ that would be odd about the listmount case.
>
> And the "__se_sys" thing in particular is just a fairly trivial wrapper.
>
> It does use that asmlinkage_protect() thing, and it is unquestionably
> horrendously ugly (staring too long at <linux/syscalls.h> has been
> known to cause madness and despair), but we do that for *every* single
> system call and I don't see why the new listmount entry would be
> different.
>
It isn't the syscall. The following hack avoids the problem.
diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..28fe2a55bd94 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5070,8 +5070,10 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
ctr = array_index_nospec(ctr, bufsize);
if (put_user(r->mnt_id_unique, buf + ctr))
return -EFAULT;
+#if 0
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;
+#endif
But it isn't check_add_overflow() either. This "helps" as well.
diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..b53cb2f13530 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5068,8 +5068,10 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
ctr = array_index_nospec(ctr, bufsize);
+#if 0
if (put_user(r->mnt_id_unique, buf + ctr))
return -EFAULT;
+#endif
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;
Any variance of put_user() with &buf[ctr] or buf + ctr fails
if ctr is a variable and permitted to be != 0. For example,
commenting out the call to check_add_overflow() and starting
the loop with ctr = 1 also triggers the problem, as does replacing
the call to check_add_overflow() with ctr++;. Using a temporary
variable such as in
u64 __user *pbuf;
...
pbuf = buf + ctr;
if (put_user(r->mnt_id_unique, pbuf))
return -EFAULT;
doesn't help either. But this does:
- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, (u32 *)(buf + ctr)))
and "buf + 17" as well as "&buf[17]" work as well. Essentially,
every combination of "buf + ctr" or "&buf[ctr]" fails if buf
is u64* and ctr is a variable.
The following works. Would this be acceptable ?
diff --git a/fs/namespace.c b/fs/namespace.c
index ef1fd6829814..dc0f844205d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5068,10 +5068,11 @@ static ssize_t do_listmount(struct mount *first, struct path *orig, u64 mnt_id,
if (!is_path_reachable(r, r->mnt.mnt_root, orig))
continue;
ctr = array_index_nospec(ctr, bufsize);
- if (put_user(r->mnt_id_unique, buf + ctr))
+ if (put_user(r->mnt_id_unique, buf))
return -EFAULT;
if (check_add_overflow(ctr, 1, &ctr))
return -ERANGE;
+ buf++;
}
return ctr;
}
Guenter
On Thu, 11 Jan 2024 at 10:57, Guenter Roeck <linux@roeck-us.net> wrote:
>
> Any variance of put_user() with &buf[ctr] or buf + ctr fails
> if ctr is a variable and permitted to be != 0.
Crazy. But the 64-bit put_user() is a bit special and tends to require
more registers (the 64-bit value is passed in two registers), so that
probably then results in the ICE.
Side note: looking at the SH version of __put_user_u64(), I think it's
buggy and is missing the exception handler for the second 32-bit move.
I dunno, I don't read sh asm, but it looks suspicious.
> The following works. Would this be acceptable ?
It might be very easy to trigger this once again if somebody goes "that's silly"
That said, I also absolutely detest the "error handling" in that
function. It's horrible.
Noticing the user access error in the middle is just sad, and if that
was just handled better and at least the range was checked first, the
overflow error couldn't happen and checking for it is thus pointless.
And looking at it all, it really looks like the whole interface is
broken. The "bufsize" argument isn't the size of the buffer at all.
It's the number of entries.
Extra confusingly, in the *other* system call, bufsize is in fact the
size of the buffer.
And the 'ctr' overflow checking is doubly garbage, because the only
reason *that* can happen is that we didn't check the incoming
arguments properly.
Same goes for the whole array_index_nospec() - it's pointless, because
the user controls what that code checks against anyway, so there's no
point to trying to manage some range checking.
The only range checking there that matters would be the one that
put_user() has to do against the address space size, but that's done
by put_user().
End result: that thing needs a rewrite.
The SH put_user64() needs to be looked at too, but in the meantime,
maybe something like this fixes the problems with listmount?
NOTE! ENTIRELY untested, but that naming and lack of argument sanity
checking really is horrendous. We should have caught this earlier.
Linus
> NOTE! ENTIRELY untested, but that naming and lack of argument sanity Thanks for catching that. I've slightly tweaked the attached patch and put it into vfs.fixes at [1]. There's a fsnotify performance regression fix for io_uring in there as well. I plan to get both to you Saturday CET. Let us know if there's any additional issues. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git/log/?h=vfs.fixes
On 1/11/24 12:14, Linus Torvalds wrote:
> On Thu, 11 Jan 2024 at 10:57, Guenter Roeck <linux@roeck-us.net> wrote:
>>
>> Any variance of put_user() with &buf[ctr] or buf + ctr fails
>> if ctr is a variable and permitted to be != 0.
>
> Crazy. But the 64-bit put_user() is a bit special and tends to require
> more registers (the 64-bit value is passed in two registers), so that
> probably then results in the ICE.
>
> Side note: looking at the SH version of __put_user_u64(), I think it's
> buggy and is missing the exception handler for the second 32-bit move.
> I dunno, I don't read sh asm, but it looks suspicious.
>
I wonder if something may be wrong with the definition and use of __m
for u64 accesses. The code below also fixes the build problem.
But then I really don't know what
struct __large_struct { unsigned long buf[100]; };
#define __m(x) (*(struct __large_struct __user *)(x))
is supposed to be doing in the first place, and I still don't understand
why the problem only shows up with CONFIG_MMU=n.
Guenter
---
diff --git a/arch/sh/include/asm/uaccess_32.h b/arch/sh/include/asm/uaccess_32.h
index 5d7ddc092afd..f0451a37b6ff 100644
--- a/arch/sh/include/asm/uaccess_32.h
+++ b/arch/sh/include/asm/uaccess_32.h
@@ -196,7 +196,7 @@ __asm__ __volatile__( \
".long 1b, 3b\n\t" \
".previous" \
: "=r" (retval) \
- : "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
+ : "r" (val), "m" (*(u64 *)(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#else
#define __put_user_u64(val,addr,retval) \
@@ -218,7 +218,7 @@ __asm__ __volatile__( \
".long 1b, 3b\n\t" \
".previous" \
: "=r" (retval) \
- : "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
+ : "r" (val), "m" (*(u64 *)(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#endif
On Thu, 11 Jan 2024 at 15:57, Guenter Roeck <linux@roeck-us.net> wrote:
>
> I wonder if something may be wrong with the definition and use of __m
> for u64 accesses. The code below also fixes the build problem.
Ok, that looks like the right workaround.
> But then I really don't know what
>
> struct __large_struct { unsigned long buf[100]; };
> #define __m(x) (*(struct __large_struct __user *)(x))
This is a historical pattern we've used because the gcc docs weren't
100% clear on what "m" does, and whether it might for example end up
loading the value from memory into a register, spilling it to the
stack, and then using the stack address...
Using the whole "tell the compiler it accesses a big structure" turns
the memory access into "BLKmode" (in gcc terms) and makes sure that
never happens.
NOTE! I'm not sure it ever did happen with gcc, but we have literally
seen clang do that "load from memory, spill to stack, and then use the
stack address for the asm". Crazy, I know. See
https://lore.kernel.org/all/CAHk-=wgobnShg4c2yyMbk2p=U-wmnOmX_0=b3ZY_479Jjey2xw@mail.gmail.com/
where I point to clang doing basically exactly that with the "rm"
constraint for another case entirely. I consider it a clang bug, but
happily I've never seen the "load only to spill" in a case where the
"stupid code generation" turned into "actively buggy code generation".
If it ever does, we may need to turn the "m" into a "p" and a memory
clobber, which will generate horrendous code. Or we may just need to
tell clang developers that enough is enough, and that they actually
need to take the asm constraints more seriously.
Linus
On Thu, Jan 11, 2024 at 07:40:32PM -0800, Linus Torvalds wrote:
> On Thu, 11 Jan 2024 at 15:57, Guenter Roeck <linux@roeck-us.net> wrote:
> >
> > I wonder if something may be wrong with the definition and use of __m
> > for u64 accesses. The code below also fixes the build problem.
>
> Ok, that looks like the right workaround.
>
I think I'll go with the code below. It doesn't touch the MMU code
(which for some reason doesn't result in a build failure), and
it generates
! 5071 "fs/namespace.c" 1
mov.l r6,@r1
mov.l r7,@(4,r1)
which seems to be correct. It also matches the pattern used
for __put_user_asm(). Does that make sense ?
Thanks,
Guenter
---
diff --git a/arch/sh/include/asm/uaccess_32.h b/arch/sh/include/asm/uaccess_32.h
index 5d7ddc092afd..5ce46b2a95b6 100644
--- a/arch/sh/include/asm/uaccess_32.h
+++ b/arch/sh/include/asm/uaccess_32.h
@@ -176,6 +176,7 @@ do { \
} while (0)
#endif /* CONFIG_MMU */
+#ifdef CONFIG_MMU
#if defined(CONFIG_CPU_LITTLE_ENDIAN)
#define __put_user_u64(val,addr,retval) \
({ \
@@ -221,6 +222,26 @@ __asm__ __volatile__( \
: "r" (val), "m" (__m(addr)), "i" (-EFAULT), "0" (retval) \
: "memory"); })
#endif
+#else
+#if defined(CONFIG_CPU_LITTLE_ENDIAN)
+#define __put_user_u64(val,addr,retval) \
+({ \
+__asm__ __volatile__( \
+ "mov.l %R0,%1\n\t" \
+ "mov.l %S0,%T1\n\t" \
+ : /* no outputs */ \
+ : "r" (val), "m" (*(u64 *)(addr)) \
+ : "memory"); })
+#else
+({ \
+__asm__ __volatile__( \
+ "mov.l %S0,%1\n\t" \
+ "mov.l %R0,%T1\n\t" \
+ : /* no outputs */ \
+ : "r" (val), "m" (*(u64 *)(addr)) \
+ : "memory"); })
+#endif
+#endif /* CONFIG_MMU */
extern void __put_user_unknown(void);
On 1/10/24 16:32, Linus Torvalds wrote: > On Wed, 10 Jan 2024 at 14:23, Guenter Roeck <linux@roeck-us.net> wrote: >> >> with this patch in the tree, all sh4 builds fail with ICE. >> >> during RTL pass: final >> In file included from fs/namespace.c:11: >> fs/namespace.c: In function '__se_sys_listmount': >> include/linux/syscalls.h:258:9: internal compiler error: in change_address_1, at emit-rtl.c:2275 > > We do have those very ugly SYSCALL_DEFINEx() macros, but I'm not > seeing _anything_ that would be odd about the listmount case. > > And the "__se_sys" thing in particular is just a fairly trivial wrapper. > > It does use that asmlinkage_protect() thing, and it is unquestionably > horrendously ugly (staring too long at <linux/syscalls.h> has been > known to cause madness and despair), but we do that for *every* single > system call and I don't see why the new listmount entry would be > different. > I don't have much of a clue either, but here is a hint: The problem is only seen if CONFIG_MMU=n. I tested with all configurations in arch/sh/configs. Guenter
On Oct 25, 2023 Miklos Szeredi <mszeredi@redhat.com> wrote:
>
> Add way to query the children of a particular mount. This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Allow listing either
>
> - immediate child mounts only, or
>
> - recursively all descendant mounts (depth first).
>
> Lookup the mount by the new 64bit mount ID. If a mount needs to be queried
> based on path, then statx(2) can be used to first query the mount ID
> belonging to the path.
>
> Return an array of new (64bit) mount ID's. Without privileges only mounts
> are listed which are reachable from the task's root.
>
> Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
> Reviewed-by: Ian Kent <raven@themaw.net>
> ---
> fs/namespace.c | 93 ++++++++++++++++++++++++++++++++++++++
> include/linux/syscalls.h | 3 ++
> include/uapi/linux/mount.h | 9 ++++
> 3 files changed, 105 insertions(+)
>
> diff --git a/fs/namespace.c b/fs/namespace.c
> index a980c250a3a6..0afe2344bba6 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -4958,6 +4958,99 @@ SYSCALL_DEFINE4(statmount, const struct __mount_arg __user *, req,
> return ret;
> }
>
> +static struct mount *listmnt_first(struct mount *root)
> +{
> + return list_first_entry_or_null(&root->mnt_mounts, struct mount, mnt_child);
> +}
> +
> +static struct mount *listmnt_next(struct mount *curr, struct mount *root, bool recurse)
> +{
> + if (recurse)
> + return next_mnt(curr, root);
> + if (!list_is_head(curr->mnt_child.next, &root->mnt_mounts))
> + return list_next_entry(curr, mnt_child);
> + return NULL;
> +}
> +
> +static long do_listmount(struct vfsmount *mnt, u64 __user *buf, size_t bufsize,
> + const struct path *root, unsigned int flags)
> +{
> + struct mount *r, *m = real_mount(mnt);
> + struct path rootmnt = {
> + .mnt = root->mnt,
> + .dentry = root->mnt->mnt_root
> + };
> + long ctr = 0;
> + bool reachable_only = true;
> + bool recurse = flags & LISTMOUNT_RECURSIVE;
> + int err;
> +
> + err = security_sb_statfs(mnt->mnt_root);
> + if (err)
> + return err;
> +
> + if (flags & LISTMOUNT_UNREACHABLE) {
> + if (!capable(CAP_SYS_ADMIN))
> + return -EPERM;
> + reachable_only = false;
> + }
Similar to my comment in patch 4/6, please move the LSM call after the
capability check.
> + if (reachable_only && !is_path_reachable(m, mnt->mnt_root, &rootmnt))
> + return capable(CAP_SYS_ADMIN) ? 0 : -EPERM;
> +
> + for (r = listmnt_first(m); r; r = listmnt_next(r, m, recurse)) {
> + if (reachable_only &&
> + !is_path_reachable(r, r->mnt.mnt_root, root))
> + continue;
> +
> + if (ctr >= bufsize)
> + return -EOVERFLOW;
> + if (put_user(r->mnt_id_unique, buf + ctr))
> + return -EFAULT;
> + ctr++;
> + if (ctr < 0)
> + return -ERANGE;
> + }
> + return ctr;
> +}
--
paul-moore.com
Miklos Szeredi <mszeredi@redhat.com> writes:
> Add way to query the children of a particular mount. This is a more
> flexible way to iterate the mount tree than having to parse the complete
> /proc/self/mountinfo.
>
> Allow listing either
>
> - immediate child mounts only, or
>
> - recursively all descendant mounts (depth first).
So I have one probably silly question:
> +SYSCALL_DEFINE4(listmount, const struct __mount_arg __user *, req,
> + u64 __user *, buf, size_t, bufsize, unsigned int, flags)
> +{
Why use struct __mount_arg (or struct mnt_id_req :) here rather than
just passing in the mount ID directly? You don't use the request_mask
field anywhere.
Thanks,
jon
> Why use struct __mount_arg (or struct mnt_id_req :) here rather than > just passing in the mount ID directly? You don't use the request_mask Please see Arnd's detailed summary here: https://lore.kernel.org/lkml/44631c05-6b8a-42dc-b37e-df6776baa5d4@app.fastmail.com
Christian Brauner <brauner@kernel.org> writes: >> Why use struct __mount_arg (or struct mnt_id_req :) here rather than >> just passing in the mount ID directly? You don't use the request_mask > > Please see Arnd's detailed summary here: > https://lore.kernel.org/lkml/44631c05-6b8a-42dc-b37e-df6776baa5d4@app.fastmail.com Ah, makes sense, I'd missed that. Given this, though, it seems like maybe sys_listmount() should enforce that req->request_mask==0 ? Thanks, jon
On Wed, Nov 08, 2023 at 09:20:28AM -0700, Jonathan Corbet wrote:
> Christian Brauner <brauner@kernel.org> writes:
>
> >> Why use struct __mount_arg (or struct mnt_id_req :) here rather than
> >> just passing in the mount ID directly? You don't use the request_mask
> >
> > Please see Arnd's detailed summary here:
> > https://lore.kernel.org/lkml/44631c05-6b8a-42dc-b37e-df6776baa5d4@app.fastmail.com
>
> Ah, makes sense, I'd missed that.
>
> Given this, though, it seems like maybe sys_listmount() should enforce
> that req->request_mask==0 ?
Good catch, it does now:
diff --git a/fs/namespace.c b/fs/namespace.c
index ae09321b0f72..21edddd75532 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -5030,6 +5030,8 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (copy_from_user(&kreq, req, sizeof(kreq)))
return -EFAULT;
+ if (kreq.request_mask != 0)
+ return -EINVAL;
mnt_id = kreq.mnt_id;
down_read(&namespace_sem);
© 2016 - 2025 Red Hat, Inc.