[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle

Zhang Yi posted 13 patches 6 days, 6 hours ago
[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle
Posted by Zhang Yi 6 days, 6 hours ago
From: Zhang Yi <yi.zhang@huawei.com>

In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under
an active handle and may also hold folio lock if the block size is
smaller than the folio size. This also breaks the "folio lock ->
transaction start" lock ordering for the upcoming iomap buffered I/O
path.

Therefore, move pagecache_isize_extended() outside of an active handle.
Additionally, it is unnecessary to update the file length during each
iteration of the allocation loop. Instead, update the file length only
to the position where the allocation is successful. Postpone updating
the inode size until after the allocation loop completes or is
interrupted due to an error.

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
 fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7abe47f923c0..f13f604b1f67 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4553,7 +4553,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
 	ext4_lblk_t len_lblk;
 	struct ext4_map_blocks map;
 	unsigned int credits;
-	loff_t epos, old_size = i_size_read(inode);
+	loff_t epos = 0, old_size = i_size_read(inode);
 	unsigned int blkbits = inode->i_blkbits;
 	bool alloc_zero = false;
 
@@ -4618,44 +4618,60 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
 			ext4_journal_stop(handle);
 			break;
 		}
+		ext4_update_inode_fsync_trans(handle, inode, 1);
+		ret = ext4_journal_stop(handle);
+		if (unlikely(ret))
+			break;
+
 		/*
 		 * allow a full retry cycle for any remaining allocations
 		 */
 		retries = 0;
-		epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
-		if (new_size) {
-			if (epos > new_size)
-				epos = new_size;
-			ext4_update_inode_size(inode, epos);
-			if (epos > old_size)
-				pagecache_isize_extended(inode, old_size, epos);
-		}
-		ret2 = ext4_mark_inode_dirty(handle, inode);
-		ext4_update_inode_fsync_trans(handle, inode, 1);
-		ret3 = ext4_journal_stop(handle);
-		ret2 = ret3 ? ret3 : ret2;
-		if (unlikely(ret2))
-			break;
 
 		if (alloc_zero &&
 		    (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
-			ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
-						  map.m_len);
-			if (likely(!ret2))
-				ret2 = ext4_convert_unwritten_extents(NULL,
+			ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+						 map.m_len);
+			if (likely(!ret))
+				ret = ext4_convert_unwritten_extents(NULL,
 					inode, (loff_t)map.m_lblk << blkbits,
 					(loff_t)map.m_len << blkbits);
-			if (ret2)
+			if (ret)
 				break;
 		}
 
-		map.m_lblk += ret;
-		map.m_len = len_lblk = len_lblk - ret;
+		map.m_lblk += map.m_len;
+		map.m_len = len_lblk = len_lblk - map.m_len;
+		epos = EXT4_LBLK_TO_B(inode, map.m_lblk);
 	}
+
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
 		goto retry;
 
-	return ret > 0 ? ret2 : ret;
+	if (!epos || !new_size)
+		return ret;
+
+	/*
+	 * Allocate blocks, update the file size to match the size of the
+	 * already successfully allocated blocks.
+	 */
+	if (epos > new_size)
+		epos = new_size;
+
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+	if (IS_ERR(handle))
+		return ret ? ret : PTR_ERR(handle);
+
+	ext4_update_inode_size(inode, epos);
+	ret2 = ext4_mark_inode_dirty(handle, inode);
+	ext4_update_inode_fsync_trans(handle, inode, 1);
+	ret3 = ext4_journal_stop(handle);
+	ret2 = ret3 ? ret3 : ret2;
+
+	if (epos > old_size)
+		pagecache_isize_extended(inode, old_size, epos);
+
+	return ret ? ret : ret2;
 }
 
 static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-- 
2.52.0
Re: [PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle
Posted by Jan Kara 1 day ago
On Fri 27-03-26 18:29:38, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under
> an active handle and may also hold folio lock if the block size is
> smaller than the folio size. This also breaks the "folio lock ->
> transaction start" lock ordering for the upcoming iomap buffered I/O
> path.
> 
> Therefore, move pagecache_isize_extended() outside of an active handle.
> Additionally, it is unnecessary to update the file length during each
> iteration of the allocation loop. Instead, update the file length only
> to the position where the allocation is successful. Postpone updating
> the inode size until after the allocation loop completes or is
> interrupted due to an error.
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good! Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------
>  1 file changed, 39 insertions(+), 23 deletions(-)
> 
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 7abe47f923c0..f13f604b1f67 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4553,7 +4553,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
>  	ext4_lblk_t len_lblk;
>  	struct ext4_map_blocks map;
>  	unsigned int credits;
> -	loff_t epos, old_size = i_size_read(inode);
> +	loff_t epos = 0, old_size = i_size_read(inode);
>  	unsigned int blkbits = inode->i_blkbits;
>  	bool alloc_zero = false;
>  
> @@ -4618,44 +4618,60 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
>  			ext4_journal_stop(handle);
>  			break;
>  		}
> +		ext4_update_inode_fsync_trans(handle, inode, 1);
> +		ret = ext4_journal_stop(handle);
> +		if (unlikely(ret))
> +			break;
> +
>  		/*
>  		 * allow a full retry cycle for any remaining allocations
>  		 */
>  		retries = 0;
> -		epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
> -		if (new_size) {
> -			if (epos > new_size)
> -				epos = new_size;
> -			ext4_update_inode_size(inode, epos);
> -			if (epos > old_size)
> -				pagecache_isize_extended(inode, old_size, epos);
> -		}
> -		ret2 = ext4_mark_inode_dirty(handle, inode);
> -		ext4_update_inode_fsync_trans(handle, inode, 1);
> -		ret3 = ext4_journal_stop(handle);
> -		ret2 = ret3 ? ret3 : ret2;
> -		if (unlikely(ret2))
> -			break;
>  
>  		if (alloc_zero &&
>  		    (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
> -			ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> -						  map.m_len);
> -			if (likely(!ret2))
> -				ret2 = ext4_convert_unwritten_extents(NULL,
> +			ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> +						 map.m_len);
> +			if (likely(!ret))
> +				ret = ext4_convert_unwritten_extents(NULL,
>  					inode, (loff_t)map.m_lblk << blkbits,
>  					(loff_t)map.m_len << blkbits);
> -			if (ret2)
> +			if (ret)
>  				break;
>  		}
>  
> -		map.m_lblk += ret;
> -		map.m_len = len_lblk = len_lblk - ret;
> +		map.m_lblk += map.m_len;
> +		map.m_len = len_lblk = len_lblk - map.m_len;
> +		epos = EXT4_LBLK_TO_B(inode, map.m_lblk);
>  	}
> +
>  	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
>  		goto retry;
>  
> -	return ret > 0 ? ret2 : ret;
> +	if (!epos || !new_size)
> +		return ret;
> +
> +	/*
> +	 * Allocate blocks, update the file size to match the size of the
> +	 * already successfully allocated blocks.
> +	 */
> +	if (epos > new_size)
> +		epos = new_size;
> +
> +	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
> +	if (IS_ERR(handle))
> +		return ret ? ret : PTR_ERR(handle);
> +
> +	ext4_update_inode_size(inode, epos);
> +	ret2 = ext4_mark_inode_dirty(handle, inode);
> +	ext4_update_inode_fsync_trans(handle, inode, 1);
> +	ret3 = ext4_journal_stop(handle);
> +	ret2 = ret3 ? ret3 : ret2;
> +
> +	if (epos > old_size)
> +		pagecache_isize_extended(inode, old_size, epos);
> +
> +	return ret ? ret : ret2;
>  }
>  
>  static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
> -- 
> 2.52.0
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR
Re: [PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle
Posted by kernel test robot 2 days, 3 hours ago

Hello,

kernel test robot noticed a 22.1% improvement of fio.write_iops on:


commit: 7cdd1682909c26fb8784755a37688215e8bfbc50 ("[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle")
url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Yi/ext4-add-did_zero-output-parameter-to-ext4_block_zero_page_range/20260328-162522
base: https://git.kernel.org/cgit/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/all/20260327102939.1095257-13-yi.zhang@huaweicloud.com/
patch subject: [PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle

testcase: fio-basic
config: x86_64-rhel-9.4
compiler: gcc-14
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:

	runtime: 300s
	disk: 1HDD
	fs: ext4
	nr_task: 1
	test_size: 128G
	rw: write
	bs: 4k
	ioengine: falloc
	cpufreq_governor: performance



Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20260331/202603312104.b8cce0af-lkp@intel.com

=========================================================================================
bs/compiler/cpufreq_governor/disk/fs/ioengine/kconfig/nr_task/rootfs/runtime/rw/tbox_group/test_size/testcase:
  4k/gcc-14/performance/1HDD/ext4/falloc/x86_64-rhel-9.4/1/debian-13-x86_64-20250902.cgz/300s/write/lkp-icl-2sp9/128G/fio-basic

commit: 
  ac5088c694 ("ext4: remove ctime/mtime update from ext4_alloc_file_blocks()")
  7cdd168290 ("[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle")

ac5088c694659c49 7cdd1682909c26fb8784755a376 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      0.01            -0.0        0.00        fio.latency_250us%
      0.11 ±  3%      -0.0        0.08 ±  7%  fio.latency_2us%
      0.07 ±  2%      -0.0        0.05 ±  3%  fio.latency_4us%
     34.85 ±  2%     -18.0%      28.58        fio.time.elapsed_time
     34.85 ±  2%     -18.0%      28.58        fio.time.elapsed_time.max
     24.00           -24.1%      18.22        fio.time.system_time
      3744 ±  2%     -17.8%       3078        fio.time.voluntary_context_switches
      3807 ±  2%     +22.1%       4649        fio.write_bw_MBps
    846.67           -20.9%     669.33        fio.write_clat_90%_ns
    854.67           -21.1%     674.67        fio.write_clat_95%_ns
    870.67           -21.0%     688.00        fio.write_clat_99%_ns
    834.60           -21.3%     656.75        fio.write_clat_mean_ns
    974606 ±  2%     +22.1%    1190280        fio.write_iops
 2.331e+09 ±  2%     -17.4%  1.926e+09        cpuidle..time
      1.10            -5.8%       1.03        iostat.cpu.system
    112890 ±  2%     -14.9%      96120        turbostat.IRQ
      1472 ±  4%     +15.6%       1702        vmstat.io.bo
      1.10            -0.1        1.04        mpstat.cpu.all.sys%
      0.54 ±  2%      +0.1        0.64 ±  3%  mpstat.cpu.all.usr%
     49.52 ± 21%     -31.3%      34.03 ± 33%  perf-sched.total_wait_time.max.ms
     49.52 ± 21%     -31.3%      34.03 ± 33%  perf-sched.wait_time.max.ms.[unknown].[unknown].[unknown].[unknown].[unknown]
    123.26            -5.1%     116.96        uptime.boot
      7526            -5.3%       7128        uptime.idle
      0.03 ±  7%     +17.9%       0.03 ±  5%  perf-stat.i.MPKI
      0.29 ±  8%      +0.1        0.38        perf-stat.i.branch-miss-rate%
   4759878 ±  7%     +33.3%    6346860        perf-stat.i.branch-misses
    201390 ±  4%     +19.8%     241185 ±  6%  perf-stat.i.cache-misses
    891976           +14.7%    1023336        perf-stat.i.cache-references
      0.48 ±  2%      +3.1%       0.49        perf-stat.i.cpi
 3.807e+09            +1.5%  3.866e+09        perf-stat.i.cpu-cycles
     70.29            +2.0%      71.71        perf-stat.i.cpu-migrations
    382.22 ±  3%     +19.4%     456.21 ±  4%  perf-stat.i.minor-faults
    382.22 ±  3%     +19.4%     456.20 ±  4%  perf-stat.i.page-faults
      0.03 ±  6%     +21.4%       0.03 ±  6%  perf-stat.overall.MPKI
      0.28 ±  9%      +0.1        0.38        perf-stat.overall.branch-miss-rate%
     18830 ±  5%     -15.0%      16001 ±  5%  perf-stat.overall.cycles-between-cache-misses
      8436           -19.3%       6810        perf-stat.overall.path-length
   4633589 ±  7%     +32.5%    6139073        perf-stat.ps.branch-misses
    197159 ±  5%     +18.9%     234444 ±  6%  perf-stat.ps.cache-misses
    871990           +13.9%     993519        perf-stat.ps.cache-references
     68.47            +1.4%      69.43        perf-stat.ps.cpu-migrations
    379.37 ±  3%     +18.3%     448.62 ±  3%  perf-stat.ps.minor-faults
    379.37 ±  3%     +18.3%     448.62 ±  3%  perf-stat.ps.page-faults
 2.831e+11           -19.3%  2.285e+11        perf-stat.total.instructions
      8.95 ± 91%      -6.7        2.23 ±141%  perf-profile.calltrace.cycles-pp.do_file_open.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
      8.95 ± 91%      -6.7        2.23 ±141%  perf-profile.calltrace.cycles-pp.path_openat.do_file_open.do_sys_openat2.__x64_sys_openat.do_syscall_64
      6.48 ± 86%      -5.4        1.04 ±223%  perf-profile.calltrace.cycles-pp.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
      6.48 ± 86%      -5.4        1.04 ±223%  perf-profile.calltrace.cycles-pp.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
      5.42 ±109%      -4.6        0.79 ±223%  perf-profile.calltrace.cycles-pp.lruvec_stat_mod_folio.folio_remove_rmap_ptes.zap_present_ptes.zap_pte_range.zap_pmd_range
      5.46 ± 47%      -3.6        1.84 ±143%  perf-profile.calltrace.cycles-pp.tlb_finish_mmu.exit_mmap.__mmput.exit_mm.do_exit
      4.28 ± 73%      -3.2        1.04 ±223%  perf-profile.calltrace.cycles-pp.open_last_lookups.path_openat.do_file_open.do_sys_openat2.__x64_sys_openat
      4.28 ± 73%      -2.4        1.84 ±143%  perf-profile.calltrace.cycles-pp.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput.exit_mm
      4.28 ± 73%      -2.4        1.84 ±143%  perf-profile.calltrace.cycles-pp.free_pages_and_swap_cache.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
      3.40 ±101%      -2.4        1.04 ±223%  perf-profile.calltrace.cycles-pp.lookup_open.open_last_lookups.path_openat.do_file_open.do_sys_openat2
      2.99 ±101%      -1.2        1.84 ±143%  perf-profile.calltrace.cycles-pp.folios_put_refs.free_pages_and_swap_cache.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap
      2.99 ±101%      -0.6        2.43 ±143%  perf-profile.calltrace.cycles-pp.vfs_statx.vfs_fstatat.__do_sys_newfstatat.do_syscall_64.entry_SYSCALL_64_after_hwframe
      8.96 ± 91%      -6.7        2.23 ±141%  perf-profile.children.cycles-pp.do_file_open
      8.96 ± 91%      -6.7        2.23 ±141%  perf-profile.children.cycles-pp.path_openat
      5.46 ± 47%      -3.6        1.84 ±143%  perf-profile.children.cycles-pp.tlb_finish_mmu
      4.28 ± 73%      -3.2        1.04 ±223%  perf-profile.children.cycles-pp.open_last_lookups
      3.40 ±101%      -2.4        1.04 ±223%  perf-profile.children.cycles-pp.lookup_open
      3.40 ±101%      -2.2        1.19 ±223%  perf-profile.children.cycles-pp.link_path_walk
      4.14 ±102%      -2.0        2.18 ±149%  perf-profile.children.cycles-pp.lruvec_stat_mod_folio
      3.88 ±105%      -1.7        2.18 ±149%  perf-profile.children.cycles-pp.kernfs_fop_readdir
      3.92 ±109%      -1.5        2.43 ±143%  perf-profile.children.cycles-pp.__do_sys_newfstatat
      3.92 ±109%      -1.5        2.43 ±143%  perf-profile.children.cycles-pp.vfs_fstatat
      2.99 ±101%      -0.6        2.43 ±143%  perf-profile.children.cycles-pp.vfs_statx
      5.80 ± 74%      +6.5       12.30 ± 22%  perf-profile.children.cycles-pp.exc_page_fault
      3.88 ±105%      -3.9        0.00        perf-profile.self.cycles-pp.kernfs_fop_readdir




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki