The following sequence makes ext4 pick up stale buffer heads left over from
a previously failed mount during a new mount operation:
mount_bdev
ext4_fill_super
| ext4_load_and_init_journal
| ext4_load_journal
| jbd2_journal_load
| load_superblock
| journal_get_superblock
| set_buffer_verified(bh) // buffer head is verified
| jbd2_journal_recover // failed caused by EIO
| goto failed_mount3a // skip 'sb->s_root' initialization
deactivate_locked_super
kill_block_super
generic_shutdown_super
if (sb->s_root)
// false, skip ext4_put_super->invalidate_bdev->
// invalidate_mapping_pages->mapping_evict_folio->
// filemap_release_folio->try_to_free_buffers, which
// cannot drop buffer head.
blkdev_put
blkdev_put_whole
if (atomic_dec_and_test(&bdev->bd_openers))
// false, systemd-udev happens to open the device. Then
// blkdev_flush_mapping->kill_bdev->truncate_inode_pages->
// truncate_inode_folio->truncate_cleanup_folio->
// folio_invalidate->block_invalidate_folio->
// filemap_release_folio->try_to_free_buffers will be skipped,
// dropping buffer head is missed again.
Second mount:
ext4_fill_super
ext4_load_and_init_journal
ext4_load_journal
ext4_get_journal
jbd2_journal_init_inode
journal_init_common
bh = getblk_unmovable
bh = __find_get_block // Found stale bh in last failed mounting
journal->j_sb_buffer = bh
jbd2_journal_load
load_superblock
journal_get_superblock
if (buffer_verified(bh))
// true, skip journal->j_format_version = 2, value is 0
jbd2_journal_recover
do_one_pass
next_log_block += count_tags(journal, bh)
// According to journal_tag_bytes(), 'tag_bytes' calculating is
// affected by jbd2_has_feature_csum3(), jbd2_has_feature_csum3()
// returns false because 'j->j_format_version >= 2' is not true,
// then we get wrong next_log_block. The do_one_pass may exit
// early when encountering a non JBD2_MAGIC_NUMBER block at 'next_log_block'.
The filesystem is corrupted at this point: the journal is only partially
replayed, and the new journal sequence number has already been used by the last mount.
invalidate_bdev() can drop all buffer heads even when racing with a bare
reader of the block device (e.g. systemd-udev), so we can fix this by invalidating
the bdev in the error handling path of __ext4_fill_super().
A reproducer is available at [Link].
Link: https://bugzilla.kernel.org/show_bug.cgi?id=217171
Cc: <stable@kernel.org>
Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
---
fs/ext4/super.c | 15 ++++++++-------
1 file changed, 8 insertions(+), 7 deletions(-)
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 88f7b8a88c76..7e990637bc48 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1126,6 +1126,12 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
struct block_device *bdev;
bdev = sbi->s_journal_bdev;
if (bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * be hotswapped, and it breaks the `ro-after' testing code.
+ */
+ invalidate_bdev(bdev);
ext4_blkdev_put(bdev);
sbi->s_journal_bdev = NULL;
}
@@ -1271,14 +1277,8 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
+ if (sbi->s_journal_bdev) {
sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -5610,6 +5610,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
+ invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
return err ? err : ret;
}
--
2.31.1
Hi, Zhihao, On 3/10/23 12:52, Zhihao Cheng wrote: cut > Link: https://bugzilla.kernel.org/show_bug.cgi?id=217171 > Cc: <stable@kernel.org> Shouldn't this have been stable@vger.kernel.org instead? That's the address advertised at: https://www.kernel.org/doc/html/latest/process/stable-kernel-rules.html A Fixes tag would also be helpful. It would assist the stable kernel team, or people who have to backport your patch, in determining which stable versions should receive your fix. The same suggestion is made in https://www.kernel.org/doc/html/latest/process/submitting-patches.html Thanks! ta
On Fri 10-03-23 20:52:02, Zhihao Cheng wrote:
> Following process makes ext4 load stale buffer heads from last failed
> mounting in a new mounting operation:
> mount_bdev
> ext4_fill_super
> | ext4_load_and_init_journal
> | ext4_load_journal
> | jbd2_journal_load
> | load_superblock
> | journal_get_superblock
> | set_buffer_verified(bh) // buffer head is verified
> | jbd2_journal_recover // failed caused by EIO
> | goto failed_mount3a // skip 'sb->s_root' initialization
> deactivate_locked_super
> kill_block_super
> generic_shutdown_super
> if (sb->s_root)
> // false, skip ext4_put_super->invalidate_bdev->
> // invalidate_mapping_pages->mapping_evict_folio->
> // filemap_release_folio->try_to_free_buffers, which
> // cannot drop buffer head.
> blkdev_put
> blkdev_put_whole
> if (atomic_dec_and_test(&bdev->bd_openers))
> // false, systemd-udev happens to open the device. Then
> // blkdev_flush_mapping->kill_bdev->truncate_inode_pages->
> // truncate_inode_folio->truncate_cleanup_folio->
> // folio_invalidate->block_invalidate_folio->
> // filemap_release_folio->try_to_free_buffers will be skipped,
> // dropping buffer head is missed again.
>
> Second mount:
> ext4_fill_super
> ext4_load_and_init_journal
> ext4_load_journal
> ext4_get_journal
> jbd2_journal_init_inode
> journal_init_common
> bh = getblk_unmovable
> bh = __find_get_block // Found stale bh in last failed mounting
> journal->j_sb_buffer = bh
> jbd2_journal_load
> load_superblock
> journal_get_superblock
> if (buffer_verified(bh))
> // true, skip journal->j_format_version = 2, value is 0
> jbd2_journal_recover
> do_one_pass
> next_log_block += count_tags(journal, bh)
> // According to journal_tag_bytes(), 'tag_bytes' calculating is
> // affected by jbd2_has_feature_csum3(), jbd2_has_feature_csum3()
> // returns false because 'j->j_format_version >= 2' is not true,
> // then we get wrong next_log_block. The do_one_pass may exit
> // early whenoccuring non JBD2_MAGIC_NUMBER in 'next_log_block'.
>
> The filesystem is corrupted here, journal is partially replayed, and
> new journal sequence number actually is already used by last mounting.
>
> The invalidate_bdev() can drop all buffer heads even racing with bare
> reading block device(eg. systemd-udev), so we can fix it by invalidating
> bdev in error handling path in __ext4_fill_super().
>
> Fetch a reproducer in [Link].
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=217171
> Cc: <stable@kernel.org>
> Signed-off-by: Zhihao Cheng <chengzhihao1@huawei.com>
The fix looks good to me. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/super.c | 15 ++++++++-------
> 1 file changed, 8 insertions(+), 7 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 88f7b8a88c76..7e990637bc48 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -1126,6 +1126,12 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
> struct block_device *bdev;
> bdev = sbi->s_journal_bdev;
> if (bdev) {
> + /*
> + * Invalidate the journal device's buffers. We don't want them
> + * floating about in memory - the physical journal device may
> + * hotswapped, and it breaks the `ro-after' testing code.
> + */
> + invalidate_bdev(bdev);
> ext4_blkdev_put(bdev);
> sbi->s_journal_bdev = NULL;
> }
> @@ -1271,14 +1277,8 @@ static void ext4_put_super(struct super_block *sb)
>
> sync_blockdev(sb->s_bdev);
> invalidate_bdev(sb->s_bdev);
> - if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
> - /*
> - * Invalidate the journal device's buffers. We don't want them
> - * floating about in memory - the physical journal device may
> - * hotswapped, and it breaks the `ro-after' testing code.
> - */
> + if (sbi->s_journal_bdev) {
> sync_blockdev(sbi->s_journal_bdev);
> - invalidate_bdev(sbi->s_journal_bdev);
> ext4_blkdev_remove(sbi);
> }
>
> @@ -5610,6 +5610,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
> brelse(sbi->s_sbh);
> ext4_blkdev_remove(sbi);
> out_fail:
> + invalidate_bdev(sb->s_bdev);
> sb->s_fs_info = NULL;
> return err ? err : ret;
> }
> --
> 2.31.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
© 2016 - 2026 Red Hat, Inc.