[PATCH v3 21/24] ext4: make data=journal support large block size

libaokun@huaweicloud.com posted 24 patches 3 months ago
There is a newer version of this series
[PATCH v3 21/24] ext4: make data=journal support large block size
Posted by libaokun@huaweicloud.com 3 months ago
From: Baokun Li <libaokun1@huawei.com>

Currently, ext4_set_inode_mapping_order() does not set max folio order
for files with the data journalling flag. For files that already have
large folios enabled, ext4_inode_journal_mode() ignores the data
journalling flag once max folio order is set.

This is not because data journalling cannot work with large folios, but
because credit estimates will go through the roof if there are too many
blocks per folio.

Since the real constraint is blocks-per-folio, to support data=journal
under LBS, we now set max folio order to be equal to min folio order for
files with the journalling flag. When LBS is disabled, the max folio order
remains unset as before.

Therefore, before ext4_change_inode_journal_flag() switches the journalling
mode, we call truncate_pagecache() to drop all page cache for that inode,
and filemap_write_and_wait() is called unconditionally.

After that, once the journalling mode has been switched, we can safely
reset the inode mapping order, and the mapping_large_folio_support() check
in ext4_inode_journal_mode() can be removed.

Suggested-by: Jan Kara <jack@suse.cz>
Signed-off-by: Baokun Li <libaokun1@huawei.com>
---
 fs/ext4/ext4_jbd2.c |  3 +--
 fs/ext4/inode.c     | 32 ++++++++++++++++++--------------
 2 files changed, 19 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index a0e66bc10093..05e5946ed9b3 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
 	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
 	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
 	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
-	    !test_opt(inode->i_sb, DELALLOC) &&
-	    !mapping_large_folio_support(inode->i_mapping))) {
+	    !test_opt(inode->i_sb, DELALLOC))) {
 		/* We do not support data journalling for encrypted data */
 		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
 			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 22d215f90c64..613a989bf750 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
 
 	if (!S_ISREG(inode->i_mode))
 		return false;
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
-	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return false;
 	if (ext4_has_feature_verity(sb))
 		return false;
 	if (ext4_has_feature_encrypt(sb))
@@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
 		umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
 void ext4_set_inode_mapping_order(struct inode *inode)
 {
+	u32 max_order;
+
 	if (!ext4_should_enable_large_folio(inode))
 		return;
 
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
+	else
+		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
+
 	mapping_set_folio_order_range(inode->i_mapping,
 				      EXT4_SB(inode->i_sb)->s_min_folio_order,
-				      EXT4_MAX_PAGECACHE_ORDER(inode));
+				      max_order);
 }
 
 struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
@@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	 * dirty data which can be converted only after flushing the dirty
 	 * data (and journalled aops don't know how to handle these cases).
 	 */
-	if (val) {
-		filemap_invalidate_lock(inode->i_mapping);
-		err = filemap_write_and_wait(inode->i_mapping);
-		if (err < 0) {
-			filemap_invalidate_unlock(inode->i_mapping);
-			return err;
-		}
+	filemap_invalidate_lock(inode->i_mapping);
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (err < 0) {
+		filemap_invalidate_unlock(inode->i_mapping);
+		return err;
 	}
+	/* Before switch the inode journalling mode evict all the page cache. */
+	truncate_pagecache(inode, 0);
 
 	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
 	jbd2_journal_lock_updates(journal);
@@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
 	}
 	ext4_set_aops(inode);
+	ext4_set_inode_mapping_order(inode);
 
 	jbd2_journal_unlock_updates(journal);
 	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
-
-	if (val)
-		filemap_invalidate_unlock(inode->i_mapping);
+	filemap_invalidate_unlock(inode->i_mapping);
 
 	/* Finally we can mark the inode as dirty. */
 
-- 
2.46.1
Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Dan Carpenter 2 months, 3 weeks ago
Hi,

kernel test robot noticed the following build warnings:

https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/libaokun-huaweicloud-com/ext4-remove-page-offset-calculation-in-ext4_block_zero_page_range/20251111-224944
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
patch link:    https://lore.kernel.org/r/20251111142634.3301616-22-libaokun%40huaweicloud.com
patch subject: [PATCH v3 21/24] ext4: make data=journal support large block size
config: arm64-randconfig-r071-20251114 (https://download.01.org/0day-ci/archive/20251116/202511161433.qI6uGU0m-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 0bba1e76581bad04e7d7f09f5115ae5e2989e0d9)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202511161433.qI6uGU0m-lkp@intel.com/

New smatch warnings:
fs/ext4/inode.c:6612 ext4_change_inode_journal_flag() warn: inconsistent returns '&inode->i_mapping->invalidate_lock'.

vim +6612 fs/ext4/inode.c

617ba13b31fbf5 Mingming Cao       2006-10-11  6527  int ext4_change_inode_journal_flag(struct inode *inode, int val)
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6528  {
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6529  	journal_t *journal;
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6530  	handle_t *handle;
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6531  	int err;
00d873c17e29cc Jan Kara           2023-05-04  6532  	int alloc_ctx;
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6533  
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6534  	/*
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6535  	 * We have to be very careful here: changing a data block's
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6536  	 * journaling status dynamically is dangerous.  If we write a
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6537  	 * data block to the journal, change the status and then delete
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6538  	 * that block, we risk forgetting to revoke the old log record
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6539  	 * from the journal and so a subsequent replay can corrupt data.
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6540  	 * So, first we make sure that the journal is empty and that
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6541  	 * nobody is changing anything.
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6542  	 */
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6543  
617ba13b31fbf5 Mingming Cao       2006-10-11  6544  	journal = EXT4_JOURNAL(inode);
0390131ba84fd3 Frank Mayhar       2009-01-07  6545  	if (!journal)
0390131ba84fd3 Frank Mayhar       2009-01-07  6546  		return 0;
d699594dc151c6 Dave Hansen        2007-07-18  6547  	if (is_journal_aborted(journal))
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6548  		return -EROFS;
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6549  
17335dcc471199 Dmitry Monakhov    2012-09-29  6550  	/* Wait for all existing dio workers */
17335dcc471199 Dmitry Monakhov    2012-09-29  6551  	inode_dio_wait(inode);
17335dcc471199 Dmitry Monakhov    2012-09-29  6552  
4c54659269ecb7 Daeho Jeong        2016-04-25  6553  	/*
4c54659269ecb7 Daeho Jeong        2016-04-25  6554  	 * Before flushing the journal and switching inode's aops, we have
4c54659269ecb7 Daeho Jeong        2016-04-25  6555  	 * to flush all dirty data the inode has. There can be outstanding
4c54659269ecb7 Daeho Jeong        2016-04-25  6556  	 * delayed allocations, there can be unwritten extents created by
4c54659269ecb7 Daeho Jeong        2016-04-25  6557  	 * fallocate or buffered writes in dioread_nolock mode covered by
4c54659269ecb7 Daeho Jeong        2016-04-25  6558  	 * dirty data which can be converted only after flushing the dirty
4c54659269ecb7 Daeho Jeong        2016-04-25  6559  	 * data (and journalled aops don't know how to handle these cases).
4c54659269ecb7 Daeho Jeong        2016-04-25  6560  	 */
d4f5258eae7b38 Jan Kara           2021-02-04  6561  	filemap_invalidate_lock(inode->i_mapping);
4c54659269ecb7 Daeho Jeong        2016-04-25  6562  	err = filemap_write_and_wait(inode->i_mapping);
4c54659269ecb7 Daeho Jeong        2016-04-25  6563  	if (err < 0) {
d4f5258eae7b38 Jan Kara           2021-02-04  6564  		filemap_invalidate_unlock(inode->i_mapping);
4c54659269ecb7 Daeho Jeong        2016-04-25  6565  		return err;
4c54659269ecb7 Daeho Jeong        2016-04-25  6566  	}
f893fb965834e9 Baokun Li          2025-11-11  6567  	/* Before switch the inode journalling mode evict all the page cache. */
f893fb965834e9 Baokun Li          2025-11-11  6568  	truncate_pagecache(inode, 0);
4c54659269ecb7 Daeho Jeong        2016-04-25  6569  
00d873c17e29cc Jan Kara           2023-05-04  6570  	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
dab291af8d6307 Mingming Cao       2006-10-11  6571  	jbd2_journal_lock_updates(journal);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6572  
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6573  	/*
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6574  	 * OK, there are no updates running now, and all cached data is
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6575  	 * synced to disk.  We are now in a completely consistent state
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6576  	 * which doesn't have anything in the journal, and we know that
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6577  	 * no filesystem updates are running, so it is safe to modify
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6578  	 * the inode's in-core data-journaling state flag now.
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6579  	 */
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6580  
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6581  	if (val)
12e9b892002d9a Dmitry Monakhov    2010-05-16  6582  		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaaf05bf2 Yongqiang Yang     2011-12-28  6583  	else {
01d5d96542fd4e Leah Rumancik      2021-05-18  6584  		err = jbd2_journal_flush(journal, 0);
4f879ca687a5f2 Jan Kara           2014-10-30  6585  		if (err < 0) {
4f879ca687a5f2 Jan Kara           2014-10-30  6586  			jbd2_journal_unlock_updates(journal);
00d873c17e29cc Jan Kara           2023-05-04  6587  			ext4_writepages_up_write(inode->i_sb, alloc_ctx);
4f879ca687a5f2 Jan Kara           2014-10-30  6588  			return err;

filemap_invalidate_unlock(inode->i_mapping) before returning?

4f879ca687a5f2 Jan Kara           2014-10-30  6589  		}
12e9b892002d9a Dmitry Monakhov    2010-05-16  6590  		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5872ddaaf05bf2 Yongqiang Yang     2011-12-28  6591  	}
617ba13b31fbf5 Mingming Cao       2006-10-11  6592  	ext4_set_aops(inode);
f893fb965834e9 Baokun Li          2025-11-11  6593  	ext4_set_inode_mapping_order(inode);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6594  
dab291af8d6307 Mingming Cao       2006-10-11  6595  	jbd2_journal_unlock_updates(journal);
00d873c17e29cc Jan Kara           2023-05-04  6596  	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
d4f5258eae7b38 Jan Kara           2021-02-04  6597  	filemap_invalidate_unlock(inode->i_mapping);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6598  
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6599  	/* Finally we can mark the inode as dirty. */
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6600  
9924a92a8c2175 Theodore Ts'o      2013-02-08  6601  	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6602  	if (IS_ERR(handle))
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6603  		return PTR_ERR(handle);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6604  
aa75f4d3daaeb1 Harshad Shirwadkar 2020-10-15  6605  	ext4_fc_mark_ineligible(inode->i_sb,
e85c81ba8859a4 Xin Yin            2022-01-17  6606  		EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
617ba13b31fbf5 Mingming Cao       2006-10-11  6607  	err = ext4_mark_inode_dirty(handle, inode);
0390131ba84fd3 Frank Mayhar       2009-01-07  6608  	ext4_handle_sync(handle);
617ba13b31fbf5 Mingming Cao       2006-10-11  6609  	ext4_journal_stop(handle);
617ba13b31fbf5 Mingming Cao       2006-10-11  6610  	ext4_std_error(inode->i_sb, err);
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6611  
ac27a0ec112a08 Dave Kleikamp      2006-10-11 @6612  	return err;
ac27a0ec112a08 Dave Kleikamp      2006-10-11  6613  }

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Baokun Li 2 months, 3 weeks ago
On 2025-11-19 20:41, Dan Carpenter wrote:
> Hi,
>
> kernel test robot noticed the following build warnings:
>
> https://git-scm.com/docs/git-format-patch#_base_tree_information]
>
> url:    https://github.com/intel-lab-lkp/linux/commits/libaokun-huaweicloud-com/ext4-remove-page-offset-calculation-in-ext4_block_zero_page_range/20251111-224944
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
> patch link:    https://lore.kernel.org/r/20251111142634.3301616-22-libaokun%40huaweicloud.com
> patch subject: [PATCH v3 21/24] ext4: make data=journal support large block size
> config: arm64-randconfig-r071-20251114 (https://download.01.org/0day-ci/archive/20251116/202511161433.qI6uGU0m-lkp@intel.com/config)
> compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 0bba1e76581bad04e7d7f09f5115ae5e2989e0d9)
>
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
> | Closes: https://lore.kernel.org/r/202511161433.qI6uGU0m-lkp@intel.com/
>
> New smatch warnings:
> fs/ext4/inode.c:6612 ext4_change_inode_journal_flag() warn: inconsistent returns '&inode->i_mapping->invalidate_lock'.
>
> vim +6612 fs/ext4/inode.c
>
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6527  int ext4_change_inode_journal_flag(struct inode *inode, int val)
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6528  {
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6529  	journal_t *journal;
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6530  	handle_t *handle;
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6531  	int err;
> 00d873c17e29cc Jan Kara           2023-05-04  6532  	int alloc_ctx;
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6533  
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6534  	/*
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6535  	 * We have to be very careful here: changing a data block's
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6536  	 * journaling status dynamically is dangerous.  If we write a
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6537  	 * data block to the journal, change the status and then delete
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6538  	 * that block, we risk forgetting to revoke the old log record
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6539  	 * from the journal and so a subsequent replay can corrupt data.
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6540  	 * So, first we make sure that the journal is empty and that
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6541  	 * nobody is changing anything.
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6542  	 */
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6543  
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6544  	journal = EXT4_JOURNAL(inode);
> 0390131ba84fd3 Frank Mayhar       2009-01-07  6545  	if (!journal)
> 0390131ba84fd3 Frank Mayhar       2009-01-07  6546  		return 0;
> d699594dc151c6 Dave Hansen        2007-07-18  6547  	if (is_journal_aborted(journal))
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6548  		return -EROFS;
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6549  
> 17335dcc471199 Dmitry Monakhov    2012-09-29  6550  	/* Wait for all existing dio workers */
> 17335dcc471199 Dmitry Monakhov    2012-09-29  6551  	inode_dio_wait(inode);
> 17335dcc471199 Dmitry Monakhov    2012-09-29  6552  
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6553  	/*
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6554  	 * Before flushing the journal and switching inode's aops, we have
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6555  	 * to flush all dirty data the inode has. There can be outstanding
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6556  	 * delayed allocations, there can be unwritten extents created by
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6557  	 * fallocate or buffered writes in dioread_nolock mode covered by
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6558  	 * dirty data which can be converted only after flushing the dirty
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6559  	 * data (and journalled aops don't know how to handle these cases).
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6560  	 */
> d4f5258eae7b38 Jan Kara           2021-02-04  6561  	filemap_invalidate_lock(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6562  	err = filemap_write_and_wait(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6563  	if (err < 0) {
> d4f5258eae7b38 Jan Kara           2021-02-04  6564  		filemap_invalidate_unlock(inode->i_mapping);
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6565  		return err;
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6566  	}
> f893fb965834e9 Baokun Li          2025-11-11  6567  	/* Before switch the inode journalling mode evict all the page cache. */
> f893fb965834e9 Baokun Li          2025-11-11  6568  	truncate_pagecache(inode, 0);
> 4c54659269ecb7 Daeho Jeong        2016-04-25  6569  
> 00d873c17e29cc Jan Kara           2023-05-04  6570  	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
> dab291af8d6307 Mingming Cao       2006-10-11  6571  	jbd2_journal_lock_updates(journal);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6572  
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6573  	/*
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6574  	 * OK, there are no updates running now, and all cached data is
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6575  	 * synced to disk.  We are now in a completely consistent state
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6576  	 * which doesn't have anything in the journal, and we know that
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6577  	 * no filesystem updates are running, so it is safe to modify
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6578  	 * the inode's in-core data-journaling state flag now.
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6579  	 */
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6580  
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6581  	if (val)
> 12e9b892002d9a Dmitry Monakhov    2010-05-16  6582  		ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> 5872ddaaf05bf2 Yongqiang Yang     2011-12-28  6583  	else {
> 01d5d96542fd4e Leah Rumancik      2021-05-18  6584  		err = jbd2_journal_flush(journal, 0);
> 4f879ca687a5f2 Jan Kara           2014-10-30  6585  		if (err < 0) {
> 4f879ca687a5f2 Jan Kara           2014-10-30  6586  			jbd2_journal_unlock_updates(journal);
> 00d873c17e29cc Jan Kara           2023-05-04  6587  			ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> 4f879ca687a5f2 Jan Kara           2014-10-30  6588  			return err;
>
> filemap_invalidate_unlock(inode->i_mapping) before returning?

Oops! You nailed it. My bad, I totally forgot that unlock here, which
definitely left the lock unbalanced. I'll get that fixed up in v3.

Thanks a ton for doing the testing!


Cheers,
Baokun

>
> 4f879ca687a5f2 Jan Kara           2014-10-30  6589  		}
> 12e9b892002d9a Dmitry Monakhov    2010-05-16  6590  		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
> 5872ddaaf05bf2 Yongqiang Yang     2011-12-28  6591  	}
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6592  	ext4_set_aops(inode);
> f893fb965834e9 Baokun Li          2025-11-11  6593  	ext4_set_inode_mapping_order(inode);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6594  
> dab291af8d6307 Mingming Cao       2006-10-11  6595  	jbd2_journal_unlock_updates(journal);
> 00d873c17e29cc Jan Kara           2023-05-04  6596  	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> d4f5258eae7b38 Jan Kara           2021-02-04  6597  	filemap_invalidate_unlock(inode->i_mapping);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6598  
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6599  	/* Finally we can mark the inode as dirty. */
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6600  
> 9924a92a8c2175 Theodore Ts'o      2013-02-08  6601  	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6602  	if (IS_ERR(handle))
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6603  		return PTR_ERR(handle);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6604  
> aa75f4d3daaeb1 Harshad Shirwadkar 2020-10-15  6605  	ext4_fc_mark_ineligible(inode->i_sb,
> e85c81ba8859a4 Xin Yin            2022-01-17  6606  		EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, handle);
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6607  	err = ext4_mark_inode_dirty(handle, inode);
> 0390131ba84fd3 Frank Mayhar       2009-01-07  6608  	ext4_handle_sync(handle);
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6609  	ext4_journal_stop(handle);
> 617ba13b31fbf5 Mingming Cao       2006-10-11  6610  	ext4_std_error(inode->i_sb, err);
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6611  
> ac27a0ec112a08 Dave Kleikamp      2006-10-11 @6612  	return err;
> ac27a0ec112a08 Dave Kleikamp      2006-10-11  6613  }
>
Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Theodore Tso 2 months, 2 weeks ago
On Thu, Nov 20, 2025 at 09:21:23AM +0800, Baokun Li wrote:
> 
> Oops! You nailed it. My bad, I totally forgot that unlock here, which
> definitely left the lock unbalanced. I'll get that fixed up in v3.

I think you meant v4 (since the current patch series are v3 :-).  When
do you think you might be able to get the next version of this patch
series ready?  I think we're almost ready to land this feature!

       	       	       	     	    	     - Ted
Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Baokun Li 2 months, 2 weeks ago
On 2025-11-20 23:41, Theodore Tso wrote:
> On Thu, Nov 20, 2025 at 09:21:23AM +0800, Baokun Li wrote:
>> Oops! You nailed it. My bad, I totally forgot that unlock here, which
>> definitely left the lock unbalanced. I'll get that fixed up in v3.
> I think you meant v4 (since the current patch series are v3 :-). 

Haha, yes, , I messed up the version number. 😅

>  When
> do you think you might be able to get the next version of this patch
> series ready?  I think we're almost ready to land this feature!
>
>        	       	       	     	    	     - Ted
>
Yep, the current tests look clean! Good news on the dependencies too:
[1] and [2] are already merged to next.

[1]:
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=ee040cbd6e48
[2]:
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git/commit/?id=50b2a4f19b22

I'll be sending out v4 today to fix the issue Dan mentioned, and then
I think this feature is ready to land! 

[P.S.: I noticed Christoph Hellwig and Eric Biggers are cleaning up the
 fscrypt API. That might clear the way for us to ditch the "no fscrypt
 support for ext4 LBS" restriction later on. I'm also looking into
 speeding up large block checksums. But I think these extra features and
 improvements can evolve independently from the work we’re doing now.]


Cheers,
Baokun

Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Jan Kara 2 months, 4 weeks ago
On Tue 11-11-25 22:26:31, libaokun@huaweicloud.com wrote:
> From: Baokun Li <libaokun1@huawei.com>
> 
> Currently, ext4_set_inode_mapping_order() does not set max folio order
> for files with the data journalling flag. For files that already have
> large folios enabled, ext4_inode_journal_mode() ignores the data
> journalling flag once max folio order is set.
> 
> This is not because data journalling cannot work with large folios, but
> because credit estimates will go through the roof if there are too many
> blocks per folio.
> 
> Since the real constraint is blocks-per-folio, to support data=journal
> under LBS, we now set max folio order to be equal to min folio order for
> files with the journalling flag. When LBS is disabled, the max folio order
> remains unset as before.
> 
> Therefore, before ext4_change_inode_journal_flag() switches the journalling
> mode, we call truncate_pagecache() to drop all page cache for that inode,
> and filemap_write_and_wait() is called unconditionally.
> 
> After that, once the journalling mode has been switched, we can safely
> reset the inode mapping order, and the mapping_large_folio_support() check
> in ext4_inode_journal_mode() can be removed.
> 
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/ext4_jbd2.c |  3 +--
>  fs/ext4/inode.c     | 32 ++++++++++++++++++--------------
>  2 files changed, 19 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index a0e66bc10093..05e5946ed9b3 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
>  	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>  	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>  	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> -	    !test_opt(inode->i_sb, DELALLOC) &&
> -	    !mapping_large_folio_support(inode->i_mapping))) {
> +	    !test_opt(inode->i_sb, DELALLOC))) {
>  		/* We do not support data journalling for encrypted data */
>  		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>  			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22d215f90c64..613a989bf750 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>  
>  	if (!S_ISREG(inode->i_mode))
>  		return false;
> -	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> -	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> -		return false;
>  	if (ext4_has_feature_verity(sb))
>  		return false;
>  	if (ext4_has_feature_encrypt(sb))
> @@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>  		umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
>  void ext4_set_inode_mapping_order(struct inode *inode)
>  {
> +	u32 max_order;
> +
>  	if (!ext4_should_enable_large_folio(inode))
>  		return;
>  
> +	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> +		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
> +	else
> +		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
> +
>  	mapping_set_folio_order_range(inode->i_mapping,
>  				      EXT4_SB(inode->i_sb)->s_min_folio_order,
> -				      EXT4_MAX_PAGECACHE_ORDER(inode));
> +				      max_order);
>  }
>  
>  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
> @@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	 * dirty data which can be converted only after flushing the dirty
>  	 * data (and journalled aops don't know how to handle these cases).
>  	 */
> -	if (val) {
> -		filemap_invalidate_lock(inode->i_mapping);
> -		err = filemap_write_and_wait(inode->i_mapping);
> -		if (err < 0) {
> -			filemap_invalidate_unlock(inode->i_mapping);
> -			return err;
> -		}
> +	filemap_invalidate_lock(inode->i_mapping);
> +	err = filemap_write_and_wait(inode->i_mapping);
> +	if (err < 0) {
> +		filemap_invalidate_unlock(inode->i_mapping);
> +		return err;
>  	}
> +	/* Before switch the inode journalling mode evict all the page cache. */
> +	truncate_pagecache(inode, 0);
>  
>  	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
>  	jbd2_journal_lock_updates(journal);
> @@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
>  	}
>  	ext4_set_aops(inode);
> +	ext4_set_inode_mapping_order(inode);
>  
>  	jbd2_journal_unlock_updates(journal);
>  	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> -
> -	if (val)
> -		filemap_invalidate_unlock(inode->i_mapping);
> +	filemap_invalidate_unlock(inode->i_mapping);
>  
>  	/* Finally we can mark the inode as dirty. */
>  
> -- 
> 2.46.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH v3 21/24] ext4: make data=journal support large block size
Posted by Zhang Yi 2 months, 4 weeks ago
On 11/11/2025 10:26 PM, libaokun@huaweicloud.com wrote:
> From: Baokun Li <libaokun1@huawei.com>
> 
> Currently, ext4_set_inode_mapping_order() does not set max folio order
> for files with the data journalling flag. For files that already have
> large folios enabled, ext4_inode_journal_mode() ignores the data
> journalling flag once max folio order is set.
> 
> This is not because data journalling cannot work with large folios, but
> because credit estimates will go through the roof if there are too many
> blocks per folio.
> 
> Since the real constraint is blocks-per-folio, to support data=journal
> under LBS, we now set max folio order to be equal to min folio order for
> files with the journalling flag. When LBS is disabled, the max folio order
> remains unset as before.
> 
> Therefore, before ext4_change_inode_journal_flag() switches the journalling
> mode, we call truncate_pagecache() to drop all page cache for that inode,
> and filemap_write_and_wait() is called unconditionally.
> 
> After that, once the journalling mode has been switched, we can safely
> reset the inode mapping order, and the mapping_large_folio_support() check
> in ext4_inode_journal_mode() can be removed.
> 
> Suggested-by: Jan Kara <jack@suse.cz>
> Signed-off-by: Baokun Li <libaokun1@huawei.com>

Looks good to me.

Reviewed-by: Zhang Yi <yi.zhang@huawei.com>

> ---
>  fs/ext4/ext4_jbd2.c |  3 +--
>  fs/ext4/inode.c     | 32 ++++++++++++++++++--------------
>  2 files changed, 19 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
> index a0e66bc10093..05e5946ed9b3 100644
> --- a/fs/ext4/ext4_jbd2.c
> +++ b/fs/ext4/ext4_jbd2.c
> @@ -16,8 +16,7 @@ int ext4_inode_journal_mode(struct inode *inode)
>  	    ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
>  	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
>  	    (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
> -	    !test_opt(inode->i_sb, DELALLOC) &&
> -	    !mapping_large_folio_support(inode->i_mapping))) {
> +	    !test_opt(inode->i_sb, DELALLOC))) {
>  		/* We do not support data journalling for encrypted data */
>  		if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
>  			return EXT4_INODE_ORDERED_DATA_MODE;  /* ordered */
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 22d215f90c64..613a989bf750 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -5152,9 +5152,6 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>  
>  	if (!S_ISREG(inode->i_mode))
>  		return false;
> -	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> -	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> -		return false;
>  	if (ext4_has_feature_verity(sb))
>  		return false;
>  	if (ext4_has_feature_encrypt(sb))
> @@ -5172,12 +5169,20 @@ static bool ext4_should_enable_large_folio(struct inode *inode)
>  		umin(MAX_PAGECACHE_ORDER, (11 + (i)->i_blkbits - PAGE_SHIFT))
>  void ext4_set_inode_mapping_order(struct inode *inode)
>  {
> +	u32 max_order;
> +
>  	if (!ext4_should_enable_large_folio(inode))
>  		return;
>  
> +	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
> +	    ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
> +		max_order = EXT4_SB(inode->i_sb)->s_min_folio_order;
> +	else
> +		max_order = EXT4_MAX_PAGECACHE_ORDER(inode);
> +
>  	mapping_set_folio_order_range(inode->i_mapping,
>  				      EXT4_SB(inode->i_sb)->s_min_folio_order,
> -				      EXT4_MAX_PAGECACHE_ORDER(inode));
> +				      max_order);
>  }
>  
>  struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
> @@ -6553,14 +6558,14 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	 * dirty data which can be converted only after flushing the dirty
>  	 * data (and journalled aops don't know how to handle these cases).
>  	 */
> -	if (val) {
> -		filemap_invalidate_lock(inode->i_mapping);
> -		err = filemap_write_and_wait(inode->i_mapping);
> -		if (err < 0) {
> -			filemap_invalidate_unlock(inode->i_mapping);
> -			return err;
> -		}
> +	filemap_invalidate_lock(inode->i_mapping);
> +	err = filemap_write_and_wait(inode->i_mapping);
> +	if (err < 0) {
> +		filemap_invalidate_unlock(inode->i_mapping);
> +		return err;
>  	}
> +	/* Before switch the inode journalling mode evict all the page cache. */
> +	truncate_pagecache(inode, 0);
>  
>  	alloc_ctx = ext4_writepages_down_write(inode->i_sb);
>  	jbd2_journal_lock_updates(journal);
> @@ -6585,12 +6590,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
>  	}
>  	ext4_set_aops(inode);
> +	ext4_set_inode_mapping_order(inode);
>  
>  	jbd2_journal_unlock_updates(journal);
>  	ext4_writepages_up_write(inode->i_sb, alloc_ctx);
> -
> -	if (val)
> -		filemap_invalidate_unlock(inode->i_mapping);
> +	filemap_invalidate_unlock(inode->i_mapping);
>  
>  	/* Finally we can mark the inode as dirty. */
>