From: Baokun Li <libaokun1@huawei.com>
Currently, ext4_set_inode_mapping_order() does not set max folio order
for files with the data journalling flag. For files that already have
large folios enabled, ext4_inode_journal_mode() ignores the data
journalling flag once max folio order is set.
This is not because data journalling cannot work with large folios, but
because credit estimates will go through the roof if there are too many
blocks per folio.
Since the real constraint is blocks-per-folio, to support data=journal
under LBS, we now set max folio order to be equal to min folio order for
files with the journalling flag. When LBS is disabled, the max folio order
remains unset as before.
Therefore, before ext4_change_inode_journal_flag() switches the journalling
mode, we call truncate_pagecache() to drop all page cache for that inode,
and filemap_write_and_wait() is called unconditionally.
After that, once the journalling mode has been switched, we can safely
reset the inode mapping order, and the mapping_large_folio_support() check
in ext4_inode_journal_mode() can be removed.
Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
fs/ext4/ext4_jbd2.c | 3 +--
fs/ext4/inode.c | 32 ++++++++++++++++++--------------
2 files changed, 19 insertions(+), 16 deletions(-)
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index a0e66bc10093..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC) &&
- !mapping_large_folio_support(inode->i_mapping))) {
+ !test_opt(inode->i_sb, DELALLOC))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22d215f90c64..613a989bf750 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
if (!S_ISREG(inode->i_mode))
return false;
- if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
- ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
- return false;
if (ext4_has_feature_verity(sb))
return false;
if (ext4_has_feature_encrypt(sb))
@@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
void ext4_set_inode_mapping_order(struct inode *inode)
{
+ u32 max_order;
+
if (!ext4_should_enable_large_folio(inode))
return;
+ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
+ else
+ max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
+
mapping_set_folio_order_range(inode->i_mapping,
EXT4_SB(inode->i_sb)->s_min_folio_order,
- EXT4_MAX_PAGECACHE_ORDER(inode));
+ max_order);
}
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
* dirty data which can be converted only after flushing the dirty
* data (and journalled aops don't know how to handle these cases).
*/
- if (val) {
- filemap_invalidate_lock(inode->i_mapping);
- err = filemap_write_and_wait(inode->i_mapping);
- if (err < 0) {
- filemap_invalidate_unlock(inode->i_mapping);
- return err;
- }
+ filemap_invalidate_lock(inode->i_mapping);
+ err = filemap_write_and_wait(inode->i_mapping);
+ if (err < 0) {
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
+ /* Before switch the inode journalling mode evict all the page cache. */
+ truncate_pagecache(inode, 0);
alloc_ctx = ext4_writepages_down_write(inode->i_sb);
jbd2_journal_lock_updates(journal);
@@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
}
ext4_set_aops(inode);
+ ext4_set_inode_mapping_order(inode);
jbd2_journal_unlock_updates(journal);
ext4_writepages_up_write(inode->i_sb, alloc_ctx);
-
- if (val)
- filemap_invalidate_unlock(inode->i_mapping);
+ filemap_invalidate_unlock(inode->i_mapping);
/* Finally we can mark the inode as dirty. */
--
2.46.1
Hi,
kernel test robot noticed the following build warnings:
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/libaokun-huaweicloud-com/ext4-remove-page-offset-calculation-in-ext4_block_zero_page_range/20251111-224944
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/r/20251111142634.3301616-22-libaokun%40huaweicloud.com
patch subject: [PATCH v3 21/24] ext4: make data=journal support large block size
config: arm64-randconfig-r071-20251114 (https://download.01.org/0day-ci/archive/20251116/202511161433.qI6uGU0m-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 0bba1e76581bad04e7d7f09f5115ae5e2989e0d9)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202511161433.qI6uGU0m-lkp@intel.com/
New smatch warnings:
fs/ext4/inode.c:6612 ext4_change_inode_journal_flag() warn: inconsistent returns '&inode->i_mapping->invalidate_lock'.
vim +6612 fs/ext4/inode.c
617ba13b31fbf5 Mingming Cao 2006-10-11 6527 int ext4_change_inode_journal_flag(struct inode *inode, int val)
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6528 {
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6529 journal_t *journal;
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6530 handle_t *handle;
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6531 int err;
00d873c17e29cc Jan Kara 2023-05-04 6532 int alloc_ctx;
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6533
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6534 /*
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6535 * We have to be very careful here: changing a data block's
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6536 * journaling status dynamically is dangerous. If we write a
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6537 * data block to the journal, change the status and then delete
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6538 * that block, we risk forgetting to revoke the old log record
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6539 * from the journal and so a subsequent replay can corrupt data.
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6540 * So, first we make sure that the journal is empty and that
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6541 * nobody is changing anything.
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6542 */
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6543
617ba13b31fbf5 Mingming Cao 2006-10-11 6544 journal = EXT4_JOURNAL(inode);
0390131ba84fd3 Frank Mayhar 2009-01-07 6545 if (!journal)
0390131ba84fd3 Frank Mayhar 2009-01-07 6546 return 0;
d699594dc151c6 Dave Hansen 2007-07-18 6547 if (is_journal_aborted(journal))
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6548 return -EROFS;
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6549
17335dcc471199 Dmitry Monakhov 2012-09-29 6550 /* Wait for all existing dio workers */
17335dcc471199 Dmitry Monakhov 2012-09-29 6551 inode_dio_wait(inode);
17335dcc471199 Dmitry Monakhov 2012-09-29 6552
4c54659269ecb7 Daeho Jeong 2016-04-25 6553 /*
4c54659269ecb7 Daeho Jeong 2016-04-25 6554 * Before flushing the journal and switching inode's aops, we have
4c54659269ecb7 Daeho Jeong 2016-04-25 6555 * to flush all dirty data the inode has. There can be outstanding
4c54659269ecb7 Daeho Jeong 2016-04-25 6556 * delayed allocations, there can be unwritten extents created by
4c54659269ecb7 Daeho Jeong 2016-04-25 6557 * fallocate or buffered writes in dioread_nolock mode covered by
4c54659269ecb7 Daeho Jeong 2016-04-25 6558 * dirty data which can be converted only after flushing the dirty
4c54659269ecb7 Daeho Jeong 2016-04-25 6559 * data (and journalled aops don't know how to handle these cases).
4c54659269ecb7 Daeho Jeong 2016-04-25 6560 */
d4f5258eae7b38 Jan Kara 2021-02-04 6561 filemap_invalidate_lock(inode->i_mapping);
4c54659269ecb7 Daeho Jeong 2016-04-25 6562 err = filemap_write_and_wait(inode->i_mapping);
4c54659269ecb7 Daeho Jeong 2016-04-25 6563 if (err < 0) {
d4f5258eae7b38 Jan Kara 2021-02-04 6564 filemap_invalidate_unlock(inode->i_mapping);
4c54659269ecb7 Daeho Jeong 2016-04-25 6565 return err;
4c54659269ecb7 Daeho Jeong 2016-04-25 6566 }
f893fb965834e9 Baokun Li 2025-11-11 6567 /* Before switch the inode journalling mode evict all the page cache. */
f893fb965834e9 Baokun Li 2025-11-11 6568 truncate_pagecache(inode, 0);
4c54659269ecb7 Daeho Jeong 2016-04-25 6569
00d873c17e29cc Jan Kara 2023-05-04 6570 alloc_ctx = ext4_writepages_down_write(inode->i_sb);
dab291af8d6307 Mingming Cao 2006-10-11 6571 jbd2_journal_lock_updates(journal);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6572
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6573 /*
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6574 * OK, there are no updates running now, and all cached data is
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6575 * synced to disk. We are now in a completely consistent state
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6576 * which doesn't have anything in the journal, and we know that
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6577 * no filesystem updates are running, so it is safe to modify
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6578 * the inode's in-core data-journaling state flag now.
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6579 */
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6580
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6581 if (val)
12e9b892002d9a Dmitry Monakhov 2010-05-16 6582 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaaf05bf2 Yongqiang Yang 2011-12-28 6583 else {
01d5d96542fd4e Leah Rumancik 2021-05-18 6584 err = jbd2_journal_flush(journal, 0);
4f879ca687a5f2 Jan Kara 2014-10-30 6585 if (err < 0) {
4f879ca687a5f2 Jan Kara 2014-10-30 6586 jbd2_journal_unlock_updates(journal);
00d873c17e29cc Jan Kara 2023-05-04 6587 ext4_writepages_up_write(inode->i_sb, alloc_ctx);
4f879ca687a5f2 Jan Kara 2014-10-30 6588 return err;
filemap_invalidate_unlock(inode->i_mapping) before returning?
4f879ca687a5f2 Jan Kara 2014-10-30 6589 }
12e9b892002d9a Dmitry Monakhov 2010-05-16 6590 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaaf05bf2 Yongqiang Yang 2011-12-28 6591 }
617ba13b31fbf5 Mingming Cao 2006-10-11 6592 ext4_set_aops(inode);
f893fb965834e9 Baokun Li 2025-11-11 6593 ext4_set_inode_mapping_order(inode);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6594
dab291af8d6307 Mingming Cao 2006-10-11 6595 jbd2_journal_unlock_updates(journal);
00d873c17e29cc Jan Kara 2023-05-04 6596 ext4_writepages_up_write(inode->i_sb, alloc_ctx);
d4f5258eae7b38 Jan Kara 2021-02-04 6597 filemap_invalidate_unlock(inode->i_mapping);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6598
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6599 /* Finally we can mark the inode as dirty. */
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6600
9924a92a8c2175 Theodore Ts'o 2013-02-08 6601 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6602 if (IS_ERR(handle))
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6603 return PTR_ERR(handle);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6604
aa75f4d3daaeb1 Harshad Shirwadkar 2020-10-15 6605 ext4_fc_mark_ineligible(inode->i_sb,
e85c81ba8859a4 Xin Yin 2022-01-17 6606 EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
617ba13b31fbf5 Mingming Cao 2006-10-11 6607 err = ext4_mark_inode_dirty(handle, inode);
0390131ba84fd3 Frank Mayhar 2009-01-07 6608 ext4_handle_sync(handle);
617ba13b31fbf5 Mingming Cao 2006-10-11 6609 ext4_journal_stop(handle);
617ba13b31fbf5 Mingming Cao 2006-10-11 6610 ext4_std_error(inode->i_sb, err);
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6611
ac27a0ec112a08 Dave Kleikamp 2006-10-11 @6612 return err;
ac27a0ec112a08 Dave Kleikamp 2006-10-11 6613 }
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
On 2025-11-19 20:41, Dan Carpenter wrote:
> Hi,
>
> kernel test robot noticed the following build warnings:
>
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url: https://github.com/intel-lab-lkp/linux/commits/libaokun-huaweicloud-com/ext4-remove-page-offset-calculation-in-ext4_block_zero_page_range/20251111-224944
> base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
> patch link: https://lore.kernel.org/r/20251111142634.3301616-22-libaokun%40huaweicloud.com
> patch subject: [PATCH v3 21/24] ext4: make data=journal support large block size
> config: arm64-randconfig-r071-20251114 (https://download.01.org/0day-ci/archive/20251116/202511161433.qI6uGU0m-lkp@intel.com/config)
> compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 0bba1e76581bad04e7d7f09f5115ae5e2989e0d9)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> | Closes: https://lore.kernel.org/r/202511161433.qI6uGU0m-lkp@intel.com/
>
> New smatch warnings:
> fs/ext4/inode.c:6612 ext4_change_inode_journal_flag() warn: inconsistent returns '&inode->i_mapping->invalidate_lock'.
>
> vim +6612 fs/ext4/inode.c
>
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6527 int ext4_change_inode_journal_flag(struct inode *inode, int val)
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6528 {
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6529 journal_t *journal;
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6530 handle_t *handle;
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6531 int err;
> 00d873c17e29cc Jan Kara 2023-05-04 6532 int alloc_ctx;
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6533
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6534 /*
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6535 * We have to be very careful here: changing a data block's
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6536 * journaling status dynamically is dangerous. If we write a
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6537 * data block to the journal, change the status and then delete
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6538 * that block, we risk forgetting to revoke the old log record
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6539 * from the journal and so a subsequent replay can corrupt data.
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6540 * So, first we make sure that the journal is empty and that
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6541 * nobody is changing anything.
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6542 */
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6543
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6544 journal = EXT4_JOURNAL(inode);
> 0390131ba84fd3 Frank Mayhar 2009-01-07 6545 if (!journal)
> 0390131ba84fd3 Frank Mayhar 2009-01-07 6546 return 0;
> d699594dc151c6 Dave Hansen 2007-07-18 6547 if (is_journal_aborted(journal))
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6548 return -EROFS;
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6549
> 17335dcc471199 Dmitry Monakhov 2012-09-29 6550 /* Wait for all existing dio workers */
> 17335dcc471199 Dmitry Monakhov 2012-09-29 6551 inode_dio_wait(inode);
> 17335dcc471199 Dmitry Monakhov 2012-09-29 6552
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6553 /*
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6554 * Before flushing the journal and switching inode's aops, we have
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6555 * to flush all dirty data the inode has. There can be outstanding
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6556 * delayed allocations, there can be unwritten extents created by
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6557 * fallocate or buffered writes in dioread_nolock mode covered by
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6558 * dirty data which can be converted only after flushing the dirty
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6559 * data (and journalled aops don't know how to handle these cases).
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6560 */
> d4f5258eae7b38 Jan Kara 2021-02-04 6561 filemap_invalidate_lock(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6562 err = filemap_write_and_wait(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6563 if (err < 0) {
> d4f5258eae7b38 Jan Kara 2021-02-04 6564 filemap_invalidate_unlock(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6565 return err;
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6566 }
> f893fb965834e9 Baokun Li 2025-11-11 6567 /* Before switch the inode journalling mode evict all the page cache. */
> f893fb965834e9 Baokun Li 2025-11-11 6568 truncate_pagecache(inode, 0);
> 4c54659269ecb7 Daeho Jeong 2016-04-25 6569
> 00d873c17e29cc Jan Kara 2023-05-04 6570 alloc_ctx = ext4_writepages_down_write(inode->i_sb);
> dab291af8d6307 Mingming Cao 2006-10-11 6571 jbd2_journal_lock_updates(journal);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6572
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6573 /*
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6574 * OK, there are no updates running now, and all cached data is
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6575 * synced to disk. We are now in a completely consistent state
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6576 * which doesn't have anything in the journal, and we know that
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6577 * no filesystem updates are running, so it is safe to modify
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6578 * the inode's in-core data-journaling state flag now.
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6579 */
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6580
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6581 if (val)
> 12e9b892002d9a Dmitry Monakhov 2010-05-16 6582 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> 5872ddaaf05bf2 Yongqiang Yang 2011-12-28 6583 else {
> 01d5d96542fd4e Leah Rumancik 2021-05-18 6584 err = jbd2_journal_flush(journal, 0);
> 4f879ca687a5f2 Jan Kara 2014-10-30 6585 if (err < 0) {
> 4f879ca687a5f2 Jan Kara 2014-10-30 6586 jbd2_journal_unlock_updates(journal);
> 00d873c17e29cc Jan Kara 2023-05-04 6587 ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> 4f879ca687a5f2 Jan Kara 2014-10-30 6588 return err;
>
> filemap_invalidate_unlock(inode->i_mapping) before returning?
Oops! You nailed it. My bad, I totally forgot that unlock here, which
definitely left the lock unbalanced. I'll get that fixed up in v3.
Thanks a ton for doing the testing!
Cheers,
Baokun
>
> 4f879ca687a5f2 Jan Kara 2014-10-30 6589 }
> 12e9b892002d9a Dmitry Monakhov 2010-05-16 6590 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> 5872ddaaf05bf2 Yongqiang Yang 2011-12-28 6591 }
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6592 ext4_set_aops(inode);
> f893fb965834e9 Baokun Li 2025-11-11 6593 ext4_set_inode_mapping_order(inode);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6594
> dab291af8d6307 Mingming Cao 2006-10-11 6595 jbd2_journal_unlock_updates(journal);
> 00d873c17e29cc Jan Kara 2023-05-04 6596 ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> d4f5258eae7b38 Jan Kara 2021-02-04 6597 filemap_invalidate_unlock(inode->i_mapping);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6598
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6599 /* Finally we can mark the inode as dirty. */
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6600
> 9924a92a8c2175 Theodore Ts'o 2013-02-08 6601 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6602 if (IS_ERR(handle))
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6603 return PTR_ERR(handle);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6604
> aa75f4d3daaeb1 Harshad Shirwadkar 2020-10-15 6605 ext4_fc_mark_ineligible(inode->i_sb,
> e85c81ba8859a4 Xin Yin 2022-01-17 6606 EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6607 err = ext4_mark_inode_dirty(handle, inode);
> 0390131ba84fd3 Frank Mayhar 2009-01-07 6608 ext4_handle_sync(handle);
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6609 ext4_journal_stop(handle);
> 617ba13b31fbf5 Mingming Cao 2006-10-11 6610 ext4_std_error(inode->i_sb, err);
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6611
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 @6612 return err;
> ac27a0ec112a08 Dave Kleikamp 2006-10-11 6613 }
>
On Thu, Nov 20, 2025 at 09:21:23AM +0800, Baokun Li wrote:
>
> Oops! You nailed it. My bad, I totally forgot that unlock here, which
> definitely left the lock unbalanced. I'll get that fixed up in v3.
I think you meant v4 (since the current patch series are v3 :-). When
do you think you might be able to get the next version of this patch
series ready? I think we're almost ready to land this feature!
- Ted
On 2025-11-20 23:41, Theodore Tso wrote: > On Thu, Nov 20, 2025 at 09:21:23AM +0800, Baokun Li wrote: >> Oops! You nailed it. My bad, I totally forgot that unlock here, which >> definitely left the lock unbalanced. I'll get that fixed up in v3. > I think you meant v4 (since the current patch series are v3 :-). Haha, yes, , I messed up the version number. 😅 > When > do you think you might be able to get the next version of this patch > series ready? I think we're almost ready to land this feature! > > - Ted > Yep, the current tests look clean! Good news on the dependencies too: [1] and [2] are already merged to next. [1]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=ee040cbd6e48 [2]: https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=50b2a4f19b22 I'll be sending out v4 today to fix the issue Dan mentioned, and then I think this feature is ready to land! [P.S.: I noticed Christoph Hellwig and Eric Biggers are cleaning up the fscrypt API. That might clear the way for us to ditch the "no fscrypt support for ext4 LBS" restriction later on. I'm also looking into speeding up large block checksums. But I think these extra features and improvements can evolve independently from the work we’re doing now.] Cheers, Baokun
On Tue 11-11-25 22:26:31, libaokun@huaweicloud.com wrote:
> From: Baokun Li <libaokun1@huawei.com>
>
> Currently, ext4_set_inode_mapping_order() does not set max folio order
> for files with the data journalling flag. For files that already have
> large folios enabled, ext4_inode_journal_mode() ignores the data
> journalling flag once max folio order is set.
>
> This is not because data journalling cannot work with large folios, but
> because credit estimates will go through the roof if there are too many
> blocks per folio.
>
> Since the real constraint is blocks-per-folio, to support data=journal
> under LBS, we now set max folio order to be equal to min folio order for
> files with the journalling flag. When LBS is disabled, the max folio order
> remains unset as before.
>
> Therefore, before ext4_change_inode_journal_flag() switches the journalling
> mode, we call truncate_pagecache() to drop all page cache for that inode,
> and filemap_write_and_wait() is called unconditionally.
>
> After that, once the journalling mode has been switched, we can safely
> reset the inode mapping order, and the mapping_large_folio_support() check
> in ext4_inode_journal_mode() can be removed.
>
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Baokun Li <libaokun1@huawei.com>
Looks good. Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/ext4_jbd2.c | 3 +--
> fs/ext4/inode.c | 32 ++++++++++++++++++--------------
> 2 files changed, 19 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index a0e66bc10093..05e5946ed9b3 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
> ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
> test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> - !test_opt(inode->i_sb, DELALLOC) &&
> - !mapping_large_folio_support(inode->i_mapping))) {
> + !test_opt(inode->i_sb, DELALLOC))) {
> /* We do not support data journalling for encrypted data */
> if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
> return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22d215f90c64..613a989bf750 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>
> if (!S_ISREG(inode->i_mode))
> return false;
> - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> - return false;
> if (ext4_has_feature_verity(sb))
> return false;
> if (ext4_has_feature_encrypt(sb))
> @@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
> umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
> void ext4_set_inode_mapping_order(struct inode *inode)
> {
> + u32 max_order;
> +
> if (!ext4_should_enable_large_folio(inode))
> return;
>
> + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> + max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
> + else
> + max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
> +
> mapping_set_folio_order_range(inode->i_mapping,
> EXT4_SB(inode->i_sb)->s_min_folio_order,
> - EXT4_MAX_PAGECACHE_ORDER(inode));
> + max_order);
> }
>
> struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
> @@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> * dirty data which can be converted only after flushing the dirty
> * data (and journalled aops don't know how to handle these cases).
> */
> - if (val) {
> - filemap_invalidate_lock(inode->i_mapping);
> - err = filemap_write_and_wait(inode->i_mapping);
> - if (err < 0) {
> - filemap_invalidate_unlock(inode->i_mapping);
> - return err;
> - }
> + filemap_invalidate_lock(inode->i_mapping);
> + err = filemap_write_and_wait(inode->i_mapping);
> + if (err < 0) {
> + filemap_invalidate_unlock(inode->i_mapping);
> + return err;
> }
> + /* Before switch the inode journalling mode evict all the page cache. */
> + truncate_pagecache(inode, 0);
>
> alloc_ctx = ext4_writepages_down_write(inode->i_sb);
> jbd2_journal_lock_updates(journal);
> @@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> }
> ext4_set_aops(inode);
> + ext4_set_inode_mapping_order(inode);
>
> jbd2_journal_unlock_updates(journal);
> ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> -
> - if (val)
> - filemap_invalidate_unlock(inode->i_mapping);
> + filemap_invalidate_unlock(inode->i_mapping);
>
> /* Finally we can mark the inode as dirty. */
>
> --
> 2.46.1
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
On 11/11/2025 10:26 PM, libaokun@huaweicloud.com wrote:
> From: Baokun Li <libaokun1@huawei.com>
>
> Currently, ext4_set_inode_mapping_order() does not set max folio order
> for files with the data journalling flag. For files that already have
> large folios enabled, ext4_inode_journal_mode() ignores the data
> journalling flag once max folio order is set.
>
> This is not because data journalling cannot work with large folios, but
> because credit estimates will go through the roof if there are too many
> blocks per folio.
>
> Since the real constraint is blocks-per-folio, to support data=journal
> under LBS, we now set max folio order to be equal to min folio order for
> files with the journalling flag. When LBS is disabled, the max folio order
> remains unset as before.
>
> Therefore, before ext4_change_inode_journal_flag() switches the journalling
> mode, we call truncate_pagecache() to drop all page cache for that inode,
> and filemap_write_and_wait() is called unconditionally.
>
> After that, once the journalling mode has been switched, we can safely
> reset the inode mapping order, and the mapping_large_folio_support() check
> in ext4_inode_journal_mode() can be removed.
>
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Baokun Li <libaokun1@huawei.com>
Looks good to me.
Reviewed-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> fs/ext4/ext4_jbd2.c | 3 +--
> fs/ext4/inode.c | 32 ++++++++++++++++++--------------
> 2 files changed, 19 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index a0e66bc10093..05e5946ed9b3 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
> ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
> test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> - !test_opt(inode->i_sb, DELALLOC) &&
> - !mapping_large_folio_support(inode->i_mapping))) {
> + !test_opt(inode->i_sb, DELALLOC))) {
> /* We do not support data journalling for encrypted data */
> if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
> return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22d215f90c64..613a989bf750 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>
> if (!S_ISREG(inode->i_mode))
> return false;
> - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> - ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> - return false;
> if (ext4_has_feature_verity(sb))
> return false;
> if (ext4_has_feature_encrypt(sb))
> @@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
> umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
> void ext4_set_inode_mapping_order(struct inode *inode)
> {
> + u32 max_order;
> +
> if (!ext4_should_enable_large_folio(inode))
> return;
>
> + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> + max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
> + else
> + max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
> +
> mapping_set_folio_order_range(inode->i_mapping,
> EXT4_SB(inode->i_sb)->s_min_folio_order,
> - EXT4_MAX_PAGECACHE_ORDER(inode));
> + max_order);
> }
>
> struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
> @@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> * dirty data which can be converted only after flushing the dirty
> * data (and journalled aops don't know how to handle these cases).
> */
> - if (val) {
> - filemap_invalidate_lock(inode->i_mapping);
> - err = filemap_write_and_wait(inode->i_mapping);
> - if (err < 0) {
> - filemap_invalidate_unlock(inode->i_mapping);
> - return err;
> - }
> + filemap_invalidate_lock(inode->i_mapping);
> + err = filemap_write_and_wait(inode->i_mapping);
> + if (err < 0) {
> + filemap_invalidate_unlock(inode->i_mapping);
> + return err;
> }
> + /* Before switch the inode journalling mode evict all the page cache. */
> + truncate_pagecache(inode, 0);
>
> alloc_ctx = ext4_writepages_down_write(inode->i_sb);
> jbd2_journal_lock_updates(journal);
> @@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> }
> ext4_set_aops(inode);
> + ext4_set_inode_mapping_order(inode);
>
> jbd2_journal_unlock_updates(journal);
> ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> -
> - if (val)
> - filemap_invalidate_unlock(inode->i_mapping);
> + filemap_invalidate_unlock(inode->i_mapping);
>
> /* Finally we can mark the inode as dirty. */
>
© 2016 - 2026 Red Hat, Inc.