arch/x86/include/asm/compat.h | 6 ++----
fs/stat.c | 19 ++++++++++---------
2 files changed, 12 insertions(+), 13 deletions(-)
On Mon, 11 Apr 2022, Linus Torvalds wrote:
> On Mon, Apr 11, 2022 at 7:13 AM Mikulas Patocka <mpatocka@redhat.com> wrote:
> >
> > Should we perhaps hash the number, take 16 bits of the hash and hope
> > that the collision won't happen?
>
> That would "work", but I think it would be incredibly annoying to
> users with basically random results.
>
> I think the solution is to just put the bits in the high bits. Yes,
> they might be masked off if people use 'MAJOR()' to pick them out, but
> the common "compare st_dev and st_ino" model at least works. That's
> the one that wants unique numbers.
>
> > For me, the failure happens in cp_compat_stat (I have a 64-bit kernel). In
> > struct compat_stat in arch/x86/include/asm/compat.h, st_dev and st_rdev
> > are compat_dev_t which is 16-bit. But they are followed by 16-bit
> > paddings, so they could be extended.
>
> Ok, that actually looks like a bug.
>
> The compat structure should match the native structure. Those "u16
> __padX" fields seem to be just a symptom of the bug.
>
> The only user of that compat_stat structure is the kernel, so that
> should just be fixed.
>
> Of course, who knows what the libraries have done, so user space could
> still have screwed up.
Here I'm sending a patch that makes struct compat_stat match struct stat.
stat: fix inconsistency between struct stat and struct compat_stat
struct stat (defined in arch/x86/include/uapi/asm/stat.h) has 32-bit
st_dev and st_rdev; struct compat_stat (defined in
arch/x86/include/asm/compat.h) has 16-bit st_dev and st_rdev followed by a
16-bit padding. This patch fixes struct compat_stat to match struct stat.
Note that we can't change compat_dev_t because it is used by
compat_loop_info.
Also, if the st_dev and st_rdev values are 32-bit, we don't have to use
old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW
on filesystems that are on NVMe because NVMe uses the major number 259.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
arch/x86/include/asm/compat.h | 6 ++----
fs/stat.c | 19 ++++++++++---------
2 files changed, 12 insertions(+), 13 deletions(-)
Index: linux-5.17.2/arch/x86/include/asm/compat.h
===================================================================
--- linux-5.17.2.orig/arch/x86/include/asm/compat.h 2022-01-21 10:29:12.000000000 +0100
+++ linux-5.17.2/arch/x86/include/asm/compat.h 2022-04-12 11:27:14.000000000 +0200
@@ -28,15 +28,13 @@ typedef u16 compat_ipc_pid_t;
typedef __kernel_fsid_t compat_fsid_t;
struct compat_stat {
- compat_dev_t st_dev;
- u16 __pad1;
+ u32 st_dev;
compat_ino_t st_ino;
compat_mode_t st_mode;
compat_nlink_t st_nlink;
__compat_uid_t st_uid;
__compat_gid_t st_gid;
- compat_dev_t st_rdev;
- u16 __pad2;
+ u32 st_rdev;
u32 st_size;
u32 st_blksize;
u32 st_blocks;
Index: linux-5.17.2/fs/stat.c
===================================================================
--- linux-5.17.2.orig/fs/stat.c 2022-04-12 10:39:46.000000000 +0200
+++ linux-5.17.2/fs/stat.c 2022-04-12 10:58:28.000000000 +0200
@@ -334,9 +334,6 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd,
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -345,7 +342,9 @@ static int cp_new_stat(struct kstat *sta
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -353,7 +352,7 @@ static int cp_new_stat(struct kstat *sta
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -363,7 +362,7 @@ static int cp_new_stat(struct kstat *sta
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -644,11 +643,13 @@ static int cp_compat_stat(struct kstat *
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -658,7 +659,7 @@ static int cp_compat_stat(struct kstat *
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
On Mon, Apr 11, 2022 at 11:41 PM Mikulas Patocka <mpatocka@redhat.com> wrote:
>
> Also, if the st_dev and st_rdev values are 32-bit, we don't have to use
> old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW
> on filesystems that are on NVMe because NVMe uses the major number 259.
The problem with this part of the patch is that this:
> @@ -353,7 +352,7 @@ static int cp_new_stat(struct kstat *sta
> #endif
>
> INIT_STRUCT_STAT_PADDING(tmp);
> - tmp.st_dev = encode_dev(stat->dev);
> + tmp.st_dev = new_encode_dev(stat->dev);
completely changes the format of that st_dev field.
For completely insane historical reasons, we have had the rule that
- 32-bit architectures encode the device into a 16 bit value
- 64-bit architectures encode the device number into a 32 bit value
and that has been true *despite* the fact that the actual "st_dev"
field has been 32-bit and 64-bit respectively since 2003!
And it doesn't help that to confuse things even more, the _naming_ of
those "encode_dev" functions is "old and new", so that logically you'd
think that "cp_new_stat()" would use "new_encode_dev()". Nope.
So on 32-bit architectures, cp_new_stat() uses "old_encode_dev()",
which historically put the minor number in bits 0..7, and the major
number in bits 8..15.
End result: on a 32-bit system (or in the compat syscall mode),
changing to new_encode_dev() would confuse anybody (like just "ls -l
/dev") that uses that old stat call and tries to print out major/minor
numbers.
Now, the good news is that
(a) nobody should use that old stat call, since the new world order
is called "stat64" and has been for a loooong time (also since at
least 2003)
(b) we could just hide the bits in upper bits instead.
So what I suggest we do is to make old_encode_dev() put the minor bits
in bits 0..7 _and_ 16..23, and the major bits in 8..15 _and_ 24..31.
And then the -EOVERFLOW should be something like
unsigned int st_dev = encode_dev(stat->dev);
tmp.st_dev = st_dev;
if (st_dev != tmp.st_dev)
return -EOVERFLOW;
for the case that tmp.st_dev is actually 16-bit (i.e. the compat case
for some architecture where the padding wasn't there?)
NOTE: That will still screw up 'ls -l' output, but only for the
devices that previously would have returned -EOVERFLOW.
And it will make anybody who does that "stat1->st_dev ==
stat2->st_dev && ino == ino2" thing for testing "same inode" work just
fine.
Linus
On Tue, 12 Apr 2022, Linus Torvalds wrote:
> On Mon, Apr 11, 2022 at 11:41 PM Mikulas Patocka <mpatocka@redhat.com> wrote:
> >
> > Also, if the st_dev and st_rdev values are 32-bit, we don't have to use
> > old_valid_dev to test if the value fits into them. This fixes -EOVERFLOW
> > on filesystems that are on NVMe because NVMe uses the major number 259.
>
> The problem with this part of the patch is that this:
>
> > @@ -353,7 +352,7 @@ static int cp_new_stat(struct kstat *sta
> > #endif
> >
> > INIT_STRUCT_STAT_PADDING(tmp);
> > - tmp.st_dev = encode_dev(stat->dev);
> > + tmp.st_dev = new_encode_dev(stat->dev);
>
> completely changes the format of that st_dev field.
we have these definitions:
/*
 * Pack a dev_t into the legacy 16-bit layout: major number shifted up by
 * 8 bits, minor number in the low bits.
 *
 * Neither field is masked, so a minor above 255 would spill into the major
 * bits and anything above bit 15 is lost in the u16 return value.
 */
static __always_inline u16 old_encode_dev(dev_t dev)
{
return (MAJOR(dev) << 8) | MINOR(dev);
}
/*
 * Pack a dev_t into a 32-bit value: the low 8 bits of the minor stay in
 * bits 0..7, the major is placed starting at bit 8, and the remaining
 * (upper) minor bits are moved up to bit 20 and above.
 *
 * When both major and minor are below 256 the upper-minor term is zero,
 * leaving (major << 8) | minor.
 */
static __always_inline u32 new_encode_dev(dev_t dev)
{
unsigned major = MAJOR(dev);
unsigned minor = MINOR(dev);
/* minor bit 8 lands on bit 20 after the shift by 12 */
return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
}
As long as both major and minor numbers are less than 256, these functions
return equivalent results. So, I think it's safe to replace old_encode_dev
with new_encode_dev.
old_encode_dev shouldn't be called with minor >= 256, because it blends
the upper minor bits into the major field - the kernel doesn't do this and
checks the value with old_valid_dev before calling old_encode_dev. But
when old_valid_dev returns true, it doesn't matter if you use
old_encode_dev or new_encode_dev - both give equivalent results.
When I tested it, both gcc and openwatcom return st_dev 0x10301, which is
the expected value (the NVMe device has major 259 and minor 1).
> (b) we could just hide the bits in upper bits instead.
>
> So what I suggest we do is to make old_encode_dev() put the minor bits
> in bits 0..7 _and_ 16..23, and the major bits in 8..15 _and_ 24..31.
new_encode_dev puts the minor value into bits 0..7, 20..31 and the major
value into bits 8..19
So, we can use this instead of inventing a new format.
Mikulas
On Tue, Apr 12, 2022 at 7:42 AM Mikulas Patocka <mpatocka@redhat.com> wrote:
>
> As long as both major and minor numbers are less than 256, these functions
> return equivalent results. So, I think it's safe to replace old_encode_dev
> with new_encode_dev.
You are of course 100% right, and I should have looked more closely at
the code rather than going by my (broken) assumptions based on old
memory of what we did when we did that "new" stat expansion.
I take back all my objections that were completely bogus.
Linus
© 2016 - 2026 Red Hat, Inc.