fs/ext4/block_validity.c | 23 ++++++++++++++++------- fs/ext4/ext4.h | 1 + fs/ext4/inode.c | 19 +++++++++++++++---- fs/ext4/super.c | 5 ++++- 4 files changed, 36 insertions(+), 12 deletions(-)
Currently, we access journal ino through sbi->s_es->s_journal_inum,
which directly reads from the ext4 sb buffer head. If someone modifies
this underneath us then the s_journal_inum field might get corrupted.
Although direct block device modifications can be expected to cause
issues in the FS, let's cache s_journal_inum in sbi->s_journal_ino so
our checks can be more resillient.
Suggested-by: Baokun Li <libaokun1@huawei.com>
Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
---
fs/ext4/block_validity.c | 23 ++++++++++++++++-------
fs/ext4/ext4.h | 1 +
fs/ext4/inode.c | 19 +++++++++++++++----
fs/ext4/super.c | 5 ++++-
4 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 87ee3a17bd29..54e6f3499263 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -247,9 +247,9 @@ int ext4_setup_system_zone(struct super_block *sb)
if (ret)
goto err;
}
- if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
+ if (ext4_has_feature_journal(sb) && sbi->s_journal_ino) {
ret = ext4_protect_reserved_inode(sb, system_blks,
- le32_to_cpu(sbi->s_es->s_journal_inum));
+ sbi->s_journal_ino);
if (ret)
goto err;
}
@@ -351,11 +351,20 @@ int ext4_check_blockref(const char *function, unsigned int line,
{
__le32 *bref = p;
unsigned int blk;
-
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
- return 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (ext4_has_feature_journal(inode->i_sb)) {
+ /* If we are called from journal init path then
+ * sbi->s_journal_ino would be 0
+ */
+ u32 journal_ino = sbi->s_journal_ino ?
+ sbi->s_journal_ino :
+ sbi->s_es->s_journal_inum;
+ WARN_ON_ONCE(journal_ino == 0);
+
+ if (inode->i_ino == journal_ino)
+ return 0;
+ }
while (bref < p+max) {
blk = le32_to_cpu(*bref++);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 2b7d781bfcad..648b0459e9fd 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1556,6 +1556,7 @@ struct ext4_sb_info {
u32 s_max_batch_time;
u32 s_min_batch_time;
struct file *s_journal_bdev_file;
+ u32 s_journal_ino;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 365d31004bd0..50961bc0c94d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -384,10 +384,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
- if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
- return 0;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+
+ if (ext4_has_feature_journal(inode->i_sb)) {
+ /*
+ * If we are called from journal init path then
+ * sbi->s_journal_ino would be 0
+ */
+ u32 journal_ino = sbi->s_journal_ino ?
+ sbi->s_journal_ino :
+ sbi->s_es->s_journal_inum;
+ WARN_ON_ONCE(journal_ino == 0);
+
+ if (inode->i_ino == journal_ino)
+ return 0;
+ }
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
"lblock %lu mapped to illegal pblock %llu "
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a963ffda692a..44e79db7f12a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -4162,7 +4162,8 @@ int ext4_calculate_overhead(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
struct inode *j_inode;
- unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
+ unsigned int j_blocks;
+ u32 j_inum = sbi->s_journal_ino;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
ext4_fsblk_t overhead = 0;
char *buf = (char *) get_zeroed_page(GFP_NOFS);
@@ -6091,6 +6092,8 @@ static int ext4_load_journal(struct super_block *sb,
ext4_commit_super(sb);
}
+ EXT4_SB(sb)->s_journal_ino = le32_to_cpu(es->s_journal_inum);
+
return 0;
err_out:
--
2.48.1
On Fri, Mar 14, 2025 at 05:11:43PM +0530, Ojaswin Mujoo wrote: > Currently, we access journal ino through sbi->s_es->s_journal_inum, > which directly reads from the ext4 sb buffer head. If someone modifies > this underneath us then the s_journal_inum field might get corrupted. > > Although direct block device modifications can be expected to cause > issues in the FS, let's cache s_journal_inum in sbi->s_journal_ino so > our checks can be more resillient. The reason why the block validity checks need to check against s_journal_ino is to exempt the lookups done by ext4_journal_bmap() from running afoul of the system zone checks, since the journal's data blocks are considered part of the system zone. So this is something we need to do if the journal is actived, and if it's active, then sbi->s_journal will be non-NULL, and so we can just check to see if inode == sbi->s_journal instead. This will simplify the code, without needing to expand the ext4_sb_info structure. Cheers, - Ted
On Sat, Mar 15, 2025 at 09:41:28PM -0400, Theodore Ts'o wrote: > On Fri, Mar 14, 2025 at 05:11:43PM +0530, Ojaswin Mujoo wrote: > > Currently, we access journal ino through sbi->s_es->s_journal_inum, > > which directly reads from the ext4 sb buffer head. If someone modifies > > this underneath us then the s_journal_inum field might get corrupted. > > > > Although direct block device modifications can be expected to cause > > issues in the FS, let's cache s_journal_inum in sbi->s_journal_ino so > > our checks can be more resillient. > > The reason why the block validity checks need to check against > s_journal_ino is to exempt the lookups done by ext4_journal_bmap() > from running afoul of the system zone checks, since the journal's data > blocks are considered part of the system zone. > > So this is something we need to do if the journal is actived, and if > it's active, then sbi->s_journal will be non-NULL, and so we can just > check to see if inode == sbi->s_journal instead. This will simplify I believe you mean inode == sbi->s_journal->j_inode here right? since that is enough to confirm if we are called via ext4_journal_bmap so we can avoid validity checks. Regards, ojaswin > the code, without needing to expand the ext4_sb_info structure. > > Cheers, > > - Ted
On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote: > > So this is something we need to do if the journal is actived, and if > > it's active, then sbi->s_journal will be non-NULL, and so we can just > > check to see if inode == sbi->s_journal instead. This will simplify > > I believe you mean inode == sbi->s_journal->j_inode here right? Yes, that's what I meant; sorry for the not catching this before I sent my reply. Cheers, - Ted
On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote: > On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote: > > > So this is something we need to do if the journal is actived, and if > > > it's active, then sbi->s_journal will be non-NULL, and so we can just > > > check to see if inode == sbi->s_journal instead. This will simplify > > > > I believe you mean inode == sbi->s_journal->j_inode here right? > > Yes, that's what I meant; sorry for the not catching this before I > sent my reply. > > Cheers, > > - Ted Hi Ted, Baokun, I got some time to revisit this. Seems like checking against s_journal->j_inode is not enough. This is because both ext4_check_blockref() and check_block_validity() can be called even before journal->j_inode is set: ext4_open_inode_journal ext4_get_journal_inode __ext4_iget ext4_ind_check_inode ext4_check_blockref /* j_inode not set */ journal = jbd2_journal_init_inode bmap ext4_bmap iomap_bmap ext4_iomap_begin ext4_map_blocks check_block_validity journal->j_inode = inode Now, I think in this case the best solution might be to use the extra field like we do in this patch but set EXT4_SB(sb)->s_journal_ino sufficiently early. Thoughts?
On 2025/3/26 1:57, Ojaswin Mujoo wrote:
> On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
>> On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
>>>> So this is something we need to do if the journal is actived, and if
>>>> it's active, then sbi->s_journal will be non-NULL, and so we can just
>>>> check to see if inode == sbi->s_journal instead. This will simplify
>>> I believe you mean inode == sbi->s_journal->j_inode here right?
>> Yes, that's what I meant; sorry for the not catching this before I
>> sent my reply.
>>
>> Cheers,
>>
>> - Ted
> Hi Ted, Baokun,
>
> I got some time to revisit this. Seems like checking against
> s_journal->j_inode is not enough. This is because both
> ext4_check_blockref() and check_block_validity() can be called even
> before journal->j_inode is set:
>
> ext4_open_inode_journal
> ext4_get_journal_inode
> __ext4_iget
> ext4_ind_check_inode
> ext4_check_blockref /* j_inode not set */
>
> journal = jbd2_journal_init_inode
> bmap
> ext4_bmap
> iomap_bmap
> ext4_iomap_begin
> ext4_map_blocks
> check_block_validity
>
> journal->j_inode = inode
>
>
> Now, I think in this case the best solution might be to use the extra
> field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
> sufficiently early.
>
> Thoughts?
Because system zone setup happens after the journal are loaded, I think we
can skip the check if the journal haven't been loaded yet, like this:
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d04d8a7f12e7..38dc72ff7e78 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,9 +383,10 @@ static int __check_block_validity(struct inode
*inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ (!journal || inode == journal->j_inode))
return 0;
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
If any part of the journal area overlaps with the system zone, we'll catch
it when we add the journal area to the system zone later.
Cheers,
Baokun
On 2025/3/26 10:16, Baokun Li wrote:
> On 2025/3/26 1:57, Ojaswin Mujoo wrote:
>> On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
>>> On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
>>>>> So this is something we need to do if the journal is actived, and if
>>>>> it's active, then sbi->s_journal will be non-NULL, and so we can just
>>>>> check to see if inode == sbi->s_journal instead. This will simplify
>>>> I believe you mean inode == sbi->s_journal->j_inode here right?
>>> Yes, that's what I meant; sorry for the not catching this before I
>>> sent my reply.
>>>
>>> Cheers,
>>>
>>> - Ted
>> Hi Ted, Baokun,
>>
>> I got some time to revisit this. Seems like checking against
>> s_journal->j_inode is not enough. This is because both
>> ext4_check_blockref() and check_block_validity() can be called even
>> before journal->j_inode is set:
>>
>> ext4_open_inode_journal
>> ext4_get_journal_inode
>> __ext4_iget
>> ext4_ind_check_inode
>> ext4_check_blockref /* j_inode not set */
>>
>> journal = jbd2_journal_init_inode
>> bmap
>> ext4_bmap
>> iomap_bmap
>> ext4_iomap_begin
>> ext4_map_blocks
>> check_block_validity
>>
>> journal->j_inode = inode
>>
>>
>> Now, I think in this case the best solution might be to use the extra
>> field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
>> sufficiently early.
>>
>> Thoughts?
>
> Because system zone setup happens after the journal are loaded, I think we
> can skip the check if the journal haven't been loaded yet, like this:
>
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index d04d8a7f12e7..38dc72ff7e78 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -383,9 +383,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
> unsigned int line,
> struct ext4_map_blocks *map)
> {
> + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +
> if (ext4_has_feature_journal(inode->i_sb) &&
> - (inode->i_ino ==
> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> + (!journal || inode == journal->j_inode))
> return 0;
> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> ext4_error_inode(inode, func, line, map->m_pblk,
>
> If any part of the journal area overlaps with the system zone, we'll catch
> it when we add the journal area to the system zone later.
>
>
Since the creation of the system zone relies on the journal being
loaded, I think there is no risk in proceeding to call
ext4_inode_block_valid() to perform a basic block range check for
the journal inode, or even better.
Thanks,
Yi.
On Wed, Mar 26, 2025 at 12:01:45PM +0800, Zhang Yi wrote:
> On 2025/3/26 10:16, Baokun Li wrote:
> > On 2025/3/26 1:57, Ojaswin Mujoo wrote:
> >> On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
> >>> On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
> >>>>> So this is something we need to do if the journal is actived, and if
> >>>>> it's active, then sbi->s_journal will be non-NULL, and so we can just
> >>>>> check to see if inode == sbi->s_journal instead. This will simplify
> >>>> I believe you mean inode == sbi->s_journal->j_inode here right?
> >>> Yes, that's what I meant; sorry for the not catching this before I
> >>> sent my reply.
> >>>
> >>> Cheers,
> >>>
> >>> - Ted
> >> Hi Ted, Baokun,
> >>
> >> I got some time to revisit this. Seems like checking against
> >> s_journal->j_inode is not enough. This is because both
> >> ext4_check_blockref() and check_block_validity() can be called even
> >> before journal->j_inode is set:
> >>
> >> ext4_open_inode_journal
> >> ext4_get_journal_inode
> >> __ext4_iget
> >> ext4_ind_check_inode
> >> ext4_check_blockref /* j_inode not set */
> >>
> >> journal = jbd2_journal_init_inode
> >> bmap
> >> ext4_bmap
> >> iomap_bmap
> >> ext4_iomap_begin
> >> ext4_map_blocks
> >> check_block_validity
> >>
> >> journal->j_inode = inode
> >>
> >>
> >> Now, I think in this case the best solution might be to use the extra
> >> field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
> >> sufficiently early.
> >>
> >> Thoughts?
> >
> > Because system zone setup happens after the journal are loaded, I think we
> > can skip the check if the journal haven't been loaded yet, like this:
> >
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index d04d8a7f12e7..38dc72ff7e78 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -383,9 +383,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
> > unsigned int line,
> > struct ext4_map_blocks *map)
> > {
> > + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> > +
> > if (ext4_has_feature_journal(inode->i_sb) &&
> > - (inode->i_ino ==
> > - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> > + (!journal || inode == journal->j_inode))
> > return 0;
> > if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> > ext4_error_inode(inode, func, line, map->m_pblk,
> >
> > If any part of the journal area overlaps with the system zone, we'll catch
> > it when we add the journal area to the system zone later.
> >
> >
>
> Since the creation of the system zone relies on the journal being
> loaded, I think there is no risk in proceeding to call
> ext4_inode_block_valid() to perform a basic block range check for
> the journal inode, or even better.
>
> Thanks,
> Yi.
Got it Yi, makes sense to me. So I believe you are suggesting something
like:
@@ -384,9 +384,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
unsigned int line,
struct ext4_map_blocks *map)
{
+ journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
+
if (ext4_has_feature_journal(inode->i_sb) &&
- (inode->i_ino ==
- le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
+ (journal && journal->j_inode == inode))
return 0;
if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
ext4_error_inode(inode, func, line, map->m_pblk,
>
So that even if it is a journal inode we can go ahead and perform some basic checks
as the system zone rbtree will anyways be NULL at this point. From a cursory look,
it seems that __ext4_iget(..., journal_inode) -> ext4_ext_check_inode() already relies
on the fact that system zone is NULL, so we should be okay here as well.
If this looks good, I'll send a v2 with the suggested changes.
Thanks,
ojaswin
On 2025/3/26 14:39, Ojaswin Mujoo wrote:
> On Wed, Mar 26, 2025 at 12:01:45PM +0800, Zhang Yi wrote:
>> On 2025/3/26 10:16, Baokun Li wrote:
>>> On 2025/3/26 1:57, Ojaswin Mujoo wrote:
>>>> On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
>>>>> On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
>>>>>>> So this is something we need to do if the journal is actived, and if
>>>>>>> it's active, then sbi->s_journal will be non-NULL, and so we can just
>>>>>>> check to see if inode == sbi->s_journal instead. This will simplify
>>>>>> I believe you mean inode == sbi->s_journal->j_inode here right?
>>>>> Yes, that's what I meant; sorry for the not catching this before I
>>>>> sent my reply.
>>>>>
>>>>> Cheers,
>>>>>
>>>>> - Ted
>>>> Hi Ted, Baokun,
>>>>
>>>> I got some time to revisit this. Seems like checking against
>>>> s_journal->j_inode is not enough. This is because both
>>>> ext4_check_blockref() and check_block_validity() can be called even
>>>> before journal->j_inode is set:
>>>>
>>>> ext4_open_inode_journal
>>>> ext4_get_journal_inode
>>>> __ext4_iget
>>>> ext4_ind_check_inode
>>>> ext4_check_blockref /* j_inode not set */
>>>>
>>>> journal = jbd2_journal_init_inode
>>>> bmap
>>>> ext4_bmap
>>>> iomap_bmap
>>>> ext4_iomap_begin
>>>> ext4_map_blocks
>>>> check_block_validity
>>>>
>>>> journal->j_inode = inode
>>>>
>>>>
>>>> Now, I think in this case the best solution might be to use the extra
>>>> field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
>>>> sufficiently early.
>>>>
>>>> Thoughts?
>>>
>>> Because system zone setup happens after the journal are loaded, I think we
>>> can skip the check if the journal haven't been loaded yet, like this:
>>>
>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>> index d04d8a7f12e7..38dc72ff7e78 100644
>>> --- a/fs/ext4/inode.c
>>> +++ b/fs/ext4/inode.c
>>> @@ -383,9 +383,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
>>> unsigned int line,
>>> struct ext4_map_blocks *map)
>>> {
>>> + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>>> +
>>> if (ext4_has_feature_journal(inode->i_sb) &&
>>> - (inode->i_ino ==
>>> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
>>> + (!journal || inode == journal->j_inode))
>>> return 0;
>>> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
>>> ext4_error_inode(inode, func, line, map->m_pblk,
>>>
>>> If any part of the journal area overlaps with the system zone, we'll catch
>>> it when we add the journal area to the system zone later.
>>>
>>>
>>
>> Since the creation of the system zone relies on the journal being
>> loaded, I think there is no risk in proceeding to call
>> ext4_inode_block_valid() to perform a basic block range check for
>> the journal inode, or even better.
>>
>> Thanks,
>> Yi.
>
> Got it Yi, makes sense to me. So I believe you are suggesting something
> like:
>
> @@ -384,9 +384,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
> unsigned int line,
> struct ext4_map_blocks *map)
> {
> + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> +
> if (ext4_has_feature_journal(inode->i_sb) &&
We are going to check ->s_journal, so I suppose we could drop this
feature check as well. Others looks good to me.
> - (inode->i_ino ==
> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> + (journal && journal->j_inode == inode))
> return 0;
> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> ext4_error_inode(inode, func, line, map->m_pblk,
>
>>
>
> So that even if it is a journal inode we can go ahead and perform some basic checks
> as the system zone rbtree will anyways be NULL at this point. From a cursory look,
> it seems that __ext4_iget(..., journal_inode) -> ext4_ext_check_inode() already relies
> on the fact that system zone is NULL, so we should be okay here as well.
Yeah, that's right. :)
Cheers,
Yi.
>
> If this looks good, I'll send a v2 with the suggested changes.
>
> Thanks,
> ojaswin
On 2025/3/26 16:33, Zhang Yi wrote:
> On 2025/3/26 14:39, Ojaswin Mujoo wrote:
>> On Wed, Mar 26, 2025 at 12:01:45PM +0800, Zhang Yi wrote:
>>> On 2025/3/26 10:16, Baokun Li wrote:
>>>> On 2025/3/26 1:57, Ojaswin Mujoo wrote:
>>>>> On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
>>>>>> On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
>>>>>>>> So this is something we need to do if the journal is actived, and if
>>>>>>>> it's active, then sbi->s_journal will be non-NULL, and so we can just
>>>>>>>> check to see if inode == sbi->s_journal instead. This will simplify
>>>>>>> I believe you mean inode == sbi->s_journal->j_inode here right?
>>>>>> Yes, that's what I meant; sorry for the not catching this before I
>>>>>> sent my reply.
>>>>>>
>>>>>> Cheers,
>>>>>>
>>>>>> - Ted
>>>>> Hi Ted, Baokun,
>>>>>
>>>>> I got some time to revisit this. Seems like checking against
>>>>> s_journal->j_inode is not enough. This is because both
>>>>> ext4_check_blockref() and check_block_validity() can be called even
>>>>> before journal->j_inode is set:
>>>>>
>>>>> ext4_open_inode_journal
>>>>> ext4_get_journal_inode
>>>>> __ext4_iget
>>>>> ext4_ind_check_inode
>>>>> ext4_check_blockref /* j_inode not set */
>>>>>
>>>>> journal = jbd2_journal_init_inode
>>>>> bmap
>>>>> ext4_bmap
>>>>> iomap_bmap
>>>>> ext4_iomap_begin
>>>>> ext4_map_blocks
>>>>> check_block_validity
>>>>>
>>>>> journal->j_inode = inode
>>>>>
>>>>>
>>>>> Now, I think in this case the best solution might be to use the extra
>>>>> field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
>>>>> sufficiently early.
>>>>>
>>>>> Thoughts?
>>>> Because system zone setup happens after the journal are loaded, I think we
>>>> can skip the check if the journal haven't been loaded yet, like this:
>>>>
>>>> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
>>>> index d04d8a7f12e7..38dc72ff7e78 100644
>>>> --- a/fs/ext4/inode.c
>>>> +++ b/fs/ext4/inode.c
>>>> @@ -383,9 +383,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
>>>> unsigned int line,
>>>> struct ext4_map_blocks *map)
>>>> {
>>>> + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>>>> +
>>>> if (ext4_has_feature_journal(inode->i_sb) &&
>>>> - (inode->i_ino ==
>>>> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
>>>> + (!journal || inode == journal->j_inode))
>>>> return 0;
>>>> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
>>>> ext4_error_inode(inode, func, line, map->m_pblk,
>>>>
>>>> If any part of the journal area overlaps with the system zone, we'll catch
>>>> it when we add the journal area to the system zone later.
>>>>
>>>>
>>> Since the creation of the system zone relies on the journal being
>>> loaded, I think there is no risk in proceeding to call
>>> ext4_inode_block_valid() to perform a basic block range check for
>>> the journal inode, or even better.
Indeed, performing some basic anomaly checks in advance can prevent
journal replay from worsening the situation in abnormal cases. Moreover,
since s_journal is NULL at this point, we won't schedule s_sb_upd_work
even if the check fails, which is safe.
>>>
>>> Thanks,
>>> Yi.
>> Got it Yi, makes sense to me. So I believe you are suggesting something
>> like:
>>
>> @@ -384,9 +384,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
>> unsigned int line,
>> struct ext4_map_blocks *map)
>> {
>> + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
>> +
>> if (ext4_has_feature_journal(inode->i_sb) &&
> We are going to check ->s_journal, so I suppose we could drop this
> feature check as well. Others looks good to me.
Seconded.
>
>> - (inode->i_ino ==
>> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
>> + (journal && journal->j_inode == inode))
>> return 0;
>> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
>> ext4_error_inode(inode, func, line, map->m_pblk,
>>
>> So that even if it is a journal inode we can go ahead and perform some basic checks
>> as the system zone rbtree will anyways be NULL at this point. From a cursory look,
>> it seems that __ext4_iget(..., journal_inode) -> ext4_ext_check_inode() already relies
>> on the fact that system zone is NULL, so we should be okay here as well.
> Yeah, that's right. :)
>
> Cheers,
> Yi.
>
>> If this looks good, I'll send a v2 with the suggested changes.
>>
>> Thanks,
>> ojaswin
Please mention in the commit message that we're now doing some basic
checks on the journal area.
Cheers,
Baokun
On Wed, Mar 26, 2025 at 05:26:19PM +0800, Baokun Li wrote:
> On 2025/3/26 16:33, Zhang Yi wrote:
> > On 2025/3/26 14:39, Ojaswin Mujoo wrote:
> > > On Wed, Mar 26, 2025 at 12:01:45PM +0800, Zhang Yi wrote:
> > > > On 2025/3/26 10:16, Baokun Li wrote:
> > > > > On 2025/3/26 1:57, Ojaswin Mujoo wrote:
> > > > > > On Tue, Mar 18, 2025 at 10:31:29PM -0400, Theodore Ts'o wrote:
> > > > > > > On Tue, Mar 18, 2025 at 01:42:31PM +0530, Ojaswin Mujoo wrote:
> > > > > > > > > So this is something we need to do if the journal is actived, and if
> > > > > > > > > it's active, then sbi->s_journal will be non-NULL, and so we can just
> > > > > > > > > check to see if inode == sbi->s_journal instead. This will simplify
> > > > > > > > I believe you mean inode == sbi->s_journal->j_inode here right?
> > > > > > > Yes, that's what I meant; sorry for the not catching this before I
> > > > > > > sent my reply.
> > > > > > >
> > > > > > > Cheers,
> > > > > > >
> > > > > > > - Ted
> > > > > > Hi Ted, Baokun,
> > > > > >
> > > > > > I got some time to revisit this. Seems like checking against
> > > > > > s_journal->j_inode is not enough. This is because both
> > > > > > ext4_check_blockref() and check_block_validity() can be called even
> > > > > > before journal->j_inode is set:
> > > > > >
> > > > > > ext4_open_inode_journal
> > > > > > ext4_get_journal_inode
> > > > > > __ext4_iget
> > > > > > ext4_ind_check_inode
> > > > > > ext4_check_blockref /* j_inode not set */
> > > > > >
> > > > > > journal = jbd2_journal_init_inode
> > > > > > bmap
> > > > > > ext4_bmap
> > > > > > iomap_bmap
> > > > > > ext4_iomap_begin
> > > > > > ext4_map_blocks
> > > > > > check_block_validity
> > > > > >
> > > > > > journal->j_inode = inode
> > > > > >
> > > > > >
> > > > > > Now, I think in this case the best solution might be to use the extra
> > > > > > field like we do in this patch but set EXT4_SB(sb)->s_journal_ino
> > > > > > sufficiently early.
> > > > > >
> > > > > > Thoughts?
> > > > > Because system zone setup happens after the journal are loaded, I think we
> > > > > can skip the check if the journal haven't been loaded yet, like this:
> > > > >
> > > > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > > > index d04d8a7f12e7..38dc72ff7e78 100644
> > > > > --- a/fs/ext4/inode.c
> > > > > +++ b/fs/ext4/inode.c
> > > > > @@ -383,9 +383,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
> > > > > unsigned int line,
> > > > > struct ext4_map_blocks *map)
> > > > > {
> > > > > + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> > > > > +
> > > > > if (ext4_has_feature_journal(inode->i_sb) &&
> > > > > - (inode->i_ino ==
> > > > > - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> > > > > + (!journal || inode == journal->j_inode))
> > > > > return 0;
> > > > > if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> > > > > ext4_error_inode(inode, func, line, map->m_pblk,
> > > > >
> > > > > If any part of the journal area overlaps with the system zone, we'll catch
> > > > > it when we add the journal area to the system zone later.
> > > > >
> > > > >
> > > > Since the creation of the system zone relies on the journal being
> > > > loaded, I think there is no risk in proceeding to call
> > > > ext4_inode_block_valid() to perform a basic block range check for
> > > > the journal inode, or even better.
> Indeed, performing some basic anomaly checks in advance can prevent
> journal replay from worsening the situation in abnormal cases. Moreover,
> since s_journal is NULL at this point, we won't schedule s_sb_upd_work
> even if the check fails, which is safe.
> > > >
> > > > Thanks,
> > > > Yi.
> > > Got it Yi, makes sense to me. So I believe you are suggesting something
> > > like:
> > >
> > > @@ -384,9 +384,10 @@ static int __check_block_validity(struct inode *inode, const char *func,
> > > unsigned int line,
> > > struct ext4_map_blocks *map)
> > > {
> > > + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
> > > +
> > > if (ext4_has_feature_journal(inode->i_sb) &&
> > We are going to check ->s_journal, so I suppose we could drop this
> > feature check as well. Others looks good to me.
> Seconded.
> >
> > > - (inode->i_ino ==
> > > - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> > > + (journal && journal->j_inode == inode))
> > > return 0;
> > > if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> > > ext4_error_inode(inode, func, line, map->m_pblk,
> > >
> > > So that even if it is a journal inode we can go ahead and perform some basic checks
> > > as the system zone rbtree will anyways be NULL at this point. From a cursory look,
> > > it seems that __ext4_iget(..., journal_inode) -> ext4_ext_check_inode() already relies
> > > on the fact that system zone is NULL, so we should be okay here as well.
> > Yeah, that's right. :)
> >
> > Cheers,
> > Yi.
> >
> > > If this looks good, I'll send a v2 with the suggested changes.
> > >
> > > Thanks,
> > > ojaswin
>
> Please mention in the commit message that we're now doing some basic
> checks on the journal area.
>
>
> Cheers,
> Baokun
Got it, I'll send a v2 with the changes. Thanks Baokun, Yi
Regards,
ojaswin
>
On 2025/3/16 9:41, Theodore Ts'o wrote: > On Fri, Mar 14, 2025 at 05:11:43PM +0530, Ojaswin Mujoo wrote: >> Currently, we access journal ino through sbi->s_es->s_journal_inum, >> which directly reads from the ext4 sb buffer head. If someone modifies >> this underneath us then the s_journal_inum field might get corrupted. >> >> Although direct block device modifications can be expected to cause >> issues in the FS, let's cache s_journal_inum in sbi->s_journal_ino so >> our checks can be more resillient. > The reason why the block validity checks need to check against > s_journal_ino is to exempt the lookups done by ext4_journal_bmap() > from running afoul of the system zone checks, since the journal's data > blocks are considered part of the system zone. > > So this is something we need to do if the journal is actived, and if > it's active, then sbi->s_journal will be non-NULL, and so we can just > check to see if inode == sbi->s_journal instead. This will simplify > the code, without needing to expand the ext4_sb_info structure. > > Cheers, > > - Ted > This looks good! It's a much more direct approach, avoiding extra field and complex code. Cheers, Baokun
On 2025/3/14 19:41, Ojaswin Mujoo wrote:
> Currently, we access journal ino through sbi->s_es->s_journal_inum,
> which directly reads from the ext4 sb buffer head. If someone modifies
> this underneath us then the s_journal_inum field might get corrupted.
>
> Although direct block device modifications can be expected to cause
> issues in the FS, let's cache s_journal_inum in sbi->s_journal_ino so
> our checks can be more resillient.
>
> Suggested-by: Baokun Li <libaokun1@huawei.com>
> Signed-off-by: Ojaswin Mujoo <ojaswin@linux.ibm.com>
> ---
> fs/ext4/block_validity.c | 23 ++++++++++++++++-------
> fs/ext4/ext4.h | 1 +
> fs/ext4/inode.c | 19 +++++++++++++++----
> fs/ext4/super.c | 5 ++++-
> 4 files changed, 36 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
> index 87ee3a17bd29..54e6f3499263 100644
> --- a/fs/ext4/block_validity.c
> +++ b/fs/ext4/block_validity.c
> @@ -247,9 +247,9 @@ int ext4_setup_system_zone(struct super_block *sb)
> if (ret)
> goto err;
> }
> - if (ext4_has_feature_journal(sb) && sbi->s_es->s_journal_inum) {
> + if (ext4_has_feature_journal(sb) && sbi->s_journal_ino) {
> ret = ext4_protect_reserved_inode(sb, system_blks,
> - le32_to_cpu(sbi->s_es->s_journal_inum));
> + sbi->s_journal_ino);
> if (ret)
> goto err;
> }
> @@ -351,11 +351,20 @@ int ext4_check_blockref(const char *function, unsigned int line,
> {
> __le32 *bref = p;
> unsigned int blk;
> -
> - if (ext4_has_feature_journal(inode->i_sb) &&
> - (inode->i_ino ==
> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> - return 0;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +
> + if (ext4_has_feature_journal(inode->i_sb)) {
> + /* If we are called from journal init path then
> + * sbi->s_journal_ino would be 0
> + */
> + u32 journal_ino = sbi->s_journal_ino ?
> + sbi->s_journal_ino :
> + sbi->s_es->s_journal_inum;
> + WARN_ON_ONCE(journal_ino == 0);
> +
> + if (inode->i_ino == journal_ino)
> + return 0;
> + }
>
Hello Ojaswin,
I'd suggested to assign s_journal_ino in ext4_open_inode_journal(), so
that we can drop these two complex code snippets in ext4_check_blockref()
and __check_block_validity().
@@ -5856,6 +5856,7 @@ static journal_t *ext4_open_inode_journal(struct super_block *sb,
return ERR_CAST(journal);
}
journal->j_private = sb;
+ EXT4_SB(sb)->s_journal_ino = journal_inum;
journal->j_bmap = ext4_journal_bmap;
ext4_init_journal_params(sb, journal);
return journal;
Thanks,
Yi.
> while (bref < p+max) {
> blk = le32_to_cpu(*bref++);
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 2b7d781bfcad..648b0459e9fd 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1556,6 +1556,7 @@ struct ext4_sb_info {
> u32 s_max_batch_time;
> u32 s_min_batch_time;
> struct file *s_journal_bdev_file;
> + u32 s_journal_ino;
> #ifdef CONFIG_QUOTA
> /* Names of quota files with journalled quota */
> char __rcu *s_qf_names[EXT4_MAXQUOTAS];
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 365d31004bd0..50961bc0c94d 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -384,10 +384,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
> unsigned int line,
> struct ext4_map_blocks *map)
> {
> - if (ext4_has_feature_journal(inode->i_sb) &&
> - (inode->i_ino ==
> - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum)))
> - return 0;
> + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
> +
> + if (ext4_has_feature_journal(inode->i_sb)) {
> + /*
> + * If we are called from journal init path then
> + * sbi->s_journal_ino would be 0
> + */
> + u32 journal_ino = sbi->s_journal_ino ?
> + sbi->s_journal_ino :
> + sbi->s_es->s_journal_inum;
> + WARN_ON_ONCE(journal_ino == 0);
> +
> + if (inode->i_ino == journal_ino)
> + return 0;
> + }
> if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) {
> ext4_error_inode(inode, func, line, map->m_pblk,
> "lblock %lu mapped to illegal pblock %llu "
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index a963ffda692a..44e79db7f12a 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -4162,7 +4162,8 @@ int ext4_calculate_overhead(struct super_block *sb)
> struct ext4_sb_info *sbi = EXT4_SB(sb);
> struct ext4_super_block *es = sbi->s_es;
> struct inode *j_inode;
> - unsigned int j_blocks, j_inum = le32_to_cpu(es->s_journal_inum);
> + unsigned int j_blocks;
> + u32 j_inum = sbi->s_journal_ino;
> ext4_group_t i, ngroups = ext4_get_groups_count(sb);
> ext4_fsblk_t overhead = 0;
> char *buf = (char *) get_zeroed_page(GFP_NOFS);
> @@ -6091,6 +6092,8 @@ static int ext4_load_journal(struct super_block *sb,
> ext4_commit_super(sb);
> }
>
> + EXT4_SB(sb)->s_journal_ino = le32_to_cpu(es->s_journal_inum);
> +
> return 0;
>
> err_out:
© 2016 - 2025 Red Hat, Inc.