From: Zhang Yi <yi.zhang@huawei.com>
In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under
an active handle and may also hold folio lock if the block size is
smaller than the folio size. This also breaks the "folio lock ->
transaction start" lock ordering for the upcoming iomap buffered I/O
path.
Therefore, move pagecache_isize_extended() outside of an active handle.
Additionally, it is unnecessary to update the file length on each
iteration of the allocation loop. Instead, postpone updating the inode
size until after the allocation loop completes or is interrupted by an
error, and extend it only up to the end of the range that was
successfully allocated.
Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------
1 file changed, 39 insertions(+), 23 deletions(-)
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7abe47f923c0..f13f604b1f67 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4553,7 +4553,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
ext4_lblk_t len_lblk;
struct ext4_map_blocks map;
unsigned int credits;
- loff_t epos, old_size = i_size_read(inode);
+ loff_t epos = 0, old_size = i_size_read(inode);
unsigned int blkbits = inode->i_blkbits;
bool alloc_zero = false;
@@ -4618,44 +4618,60 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
ext4_journal_stop(handle);
break;
}
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ ret = ext4_journal_stop(handle);
+ if (unlikely(ret))
+ break;
+
/*
* allow a full retry cycle for any remaining allocations
*/
retries = 0;
- epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
- if (new_size) {
- if (epos > new_size)
- epos = new_size;
- ext4_update_inode_size(inode, epos);
- if (epos > old_size)
- pagecache_isize_extended(inode, old_size, epos);
- }
- ret2 = ext4_mark_inode_dirty(handle, inode);
- ext4_update_inode_fsync_trans(handle, inode, 1);
- ret3 = ext4_journal_stop(handle);
- ret2 = ret3 ? ret3 : ret2;
- if (unlikely(ret2))
- break;
if (alloc_zero &&
(map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
- ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
- map.m_len);
- if (likely(!ret2))
- ret2 = ext4_convert_unwritten_extents(NULL,
+ ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
+ map.m_len);
+ if (likely(!ret))
+ ret = ext4_convert_unwritten_extents(NULL,
inode, (loff_t)map.m_lblk << blkbits,
(loff_t)map.m_len << blkbits);
- if (ret2)
+ if (ret)
break;
}
- map.m_lblk += ret;
- map.m_len = len_lblk = len_lblk - ret;
+ map.m_lblk += map.m_len;
+ map.m_len = len_lblk = len_lblk - map.m_len;
+ epos = EXT4_LBLK_TO_B(inode, map.m_lblk);
}
+
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
- return ret > 0 ? ret2 : ret;
+ if (!epos || !new_size)
+ return ret;
+
+ /*
+ * After allocating the blocks, update the file size to match the
+ * size of the range that was successfully allocated.
+ */
+ if (epos > new_size)
+ epos = new_size;
+
+ handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+ if (IS_ERR(handle))
+ return ret ? ret : PTR_ERR(handle);
+
+ ext4_update_inode_size(inode, epos);
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+ ext4_update_inode_fsync_trans(handle, inode, 1);
+ ret3 = ext4_journal_stop(handle);
+ ret2 = ret3 ? ret3 : ret2;
+
+ if (epos > old_size)
+ pagecache_isize_extended(inode, old_size, epos);
+
+ return ret ? ret : ret2;
}
static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
--
2.52.0
On Fri 27-03-26 18:29:38, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
>
> In ext4_alloc_file_blocks(), pagecache_isize_extended() is called under
> an active handle and may also hold folio lock if the block size is
> smaller than the folio size. This also breaks the "folio lock ->
> transaction start" lock ordering for the upcoming iomap buffered I/O
> path.
>
> Therefore, move pagecache_isize_extended() outside of an active handle.
> Additionally, it is unnecessary to update the file length during each
> iteration of the allocation loop. Instead, update the file length only
> to the position where the allocation is successful. Postpone updating
> the inode size until after the allocation loop completes or is
> interrupted due to an error.
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
Looks good! Feel free to add:
Reviewed-by: Jan Kara <jack@suse.cz>
Honza
> ---
> fs/ext4/extents.c | 62 +++++++++++++++++++++++++++++------------------
> 1 file changed, 39 insertions(+), 23 deletions(-)
>
> diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
> index 7abe47f923c0..f13f604b1f67 100644
> --- a/fs/ext4/extents.c
> +++ b/fs/ext4/extents.c
> @@ -4553,7 +4553,7 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
> ext4_lblk_t len_lblk;
> struct ext4_map_blocks map;
> unsigned int credits;
> - loff_t epos, old_size = i_size_read(inode);
> + loff_t epos = 0, old_size = i_size_read(inode);
> unsigned int blkbits = inode->i_blkbits;
> bool alloc_zero = false;
>
> @@ -4618,44 +4618,60 @@ static int ext4_alloc_file_blocks(struct file *file, loff_t offset, loff_t len,
> ext4_journal_stop(handle);
> break;
> }
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> + ret = ext4_journal_stop(handle);
> + if (unlikely(ret))
> + break;
> +
> /*
> * allow a full retry cycle for any remaining allocations
> */
> retries = 0;
> - epos = EXT4_LBLK_TO_B(inode, map.m_lblk + ret);
> - if (new_size) {
> - if (epos > new_size)
> - epos = new_size;
> - ext4_update_inode_size(inode, epos);
> - if (epos > old_size)
> - pagecache_isize_extended(inode, old_size, epos);
> - }
> - ret2 = ext4_mark_inode_dirty(handle, inode);
> - ext4_update_inode_fsync_trans(handle, inode, 1);
> - ret3 = ext4_journal_stop(handle);
> - ret2 = ret3 ? ret3 : ret2;
> - if (unlikely(ret2))
> - break;
>
> if (alloc_zero &&
> (map.m_flags & (EXT4_MAP_MAPPED | EXT4_MAP_UNWRITTEN))) {
> - ret2 = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> - map.m_len);
> - if (likely(!ret2))
> - ret2 = ext4_convert_unwritten_extents(NULL,
> + ret = ext4_issue_zeroout(inode, map.m_lblk, map.m_pblk,
> + map.m_len);
> + if (likely(!ret))
> + ret = ext4_convert_unwritten_extents(NULL,
> inode, (loff_t)map.m_lblk << blkbits,
> (loff_t)map.m_len << blkbits);
> - if (ret2)
> + if (ret)
> break;
> }
>
> - map.m_lblk += ret;
> - map.m_len = len_lblk = len_lblk - ret;
> + map.m_lblk += map.m_len;
> + map.m_len = len_lblk = len_lblk - map.m_len;
> + epos = EXT4_LBLK_TO_B(inode, map.m_lblk);
> }
> +
> if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
> goto retry;
>
> - return ret > 0 ? ret2 : ret;
> + if (!epos || !new_size)
> + return ret;
> +
> + /*
> + * After allocating the blocks, update the file size to match the
> + * size of the range that was successfully allocated.
> + */
> + if (epos > new_size)
> + epos = new_size;
> +
> + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
> + if (IS_ERR(handle))
> + return ret ? ret : PTR_ERR(handle);
> +
> + ext4_update_inode_size(inode, epos);
> + ret2 = ext4_mark_inode_dirty(handle, inode);
> + ext4_update_inode_fsync_trans(handle, inode, 1);
> + ret3 = ext4_journal_stop(handle);
> + ret2 = ret3 ? ret3 : ret2;
> +
> + if (epos > old_size)
> + pagecache_isize_extended(inode, old_size, epos);
> +
> + return ret ? ret : ret2;
> }
>
> static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
> --
> 2.52.0
>
--
Jan Kara <jack@suse.com>
SUSE Labs, CR
Hello,
kernel test robot noticed a 22.1% improvement of fio.write_iops on:
commit: 7cdd1682909c26fb8784755a37688215e8bfbc50 ("[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle")
url: https://github.com/intel-lab-lkp/linux/commits/Zhang-Yi/ext4-add-did_zero-output-parameter-to-ext4_block_zero_page_range/20260328-162522
base: https://git.kernel.org/cgit/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/all/20260327102939.1095257-13-yi.zhang@huaweicloud.com/
patch subject: [PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle
testcase: fio-basic
config: x86_64-rhel-9.4
compiler: gcc-14
test machine: 64 threads 2 sockets Intel(R) Xeon(R) Gold 6346 CPU @ 3.10GHz (Ice Lake) with 256G memory
parameters:
runtime: 300s
disk: 1HDD
fs: ext4
nr_task: 1
test_size: 128G
rw: write
bs: 4k
ioengine: falloc
cpufreq_governor: performance
Details are as below:
-------------------------------------------------------------------------------------------------->
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20260331/202603312104.b8cce0af-lkp@intel.com
=========================================================================================
bs/compiler/cpufreq_governor/disk/fs/ioengine/kconfig/nr_task/rootfs/runtime/rw/tbox_group/test_size/testcase:
4k/gcc-14/performance/1HDD/ext4/falloc/x86_64-rhel-9.4/1/debian-13-x86_64-20250902.cgz/300s/write/lkp-icl-2sp9/128G/fio-basic
commit:
ac5088c694 ("ext4: remove ctime/mtime update from ext4_alloc_file_blocks()")
7cdd168290 ("[PATCH v4 12/13] ext4: move pagecache_isize_extended() out of active handle")
ac5088c694659c49 7cdd1682909c26fb8784755a376
---------------- ---------------------------
%stddev %change %stddev
\ | \
0.01 -0.0 0.00 fio.latency_250us%
0.11 ± 3% -0.0 0.08 ± 7% fio.latency_2us%
0.07 ± 2% -0.0 0.05 ± 3% fio.latency_4us%
34.85 ± 2% -18.0% 28.58 fio.time.elapsed_time
34.85 ± 2% -18.0% 28.58 fio.time.elapsed_time.max
24.00 -24.1% 18.22 fio.time.system_time
3744 ± 2% -17.8% 3078 fio.time.voluntary_context_switches
3807 ± 2% +22.1% 4649 fio.write_bw_MBps
846.67 -20.9% 669.33 fio.write_clat_90%_ns
854.67 -21.1% 674.67 fio.write_clat_95%_ns
870.67 -21.0% 688.00 fio.write_clat_99%_ns
834.60 -21.3% 656.75 fio.write_clat_mean_ns
974606 ± 2% +22.1% 1190280 fio.write_iops
2.331e+09 ± 2% -17.4% 1.926e+09 cpuidle..time
1.10 -5.8% 1.03 iostat.cpu.system
112890 ± 2% -14.9% 96120 turbostat.IRQ
1472 ± 4% +15.6% 1702 vmstat.io.bo
1.10 -0.1 1.04 mpstat.cpu.all.sys%
0.54 ± 2% +0.1 0.64 ± 3% mpstat.cpu.all.usr%
49.52 ± 21% -31.3% 34.03 ± 33% perf-sched.total_wait_time.max.ms
49.52 ± 21% -31.3% 34.03 ± 33% perf-sched.wait_time.max.ms.[unknown].[unknown].[unknown].[unknown].[unknown]
123.26 -5.1% 116.96 uptime.boot
7526 -5.3% 7128 uptime.idle
0.03 ± 7% +17.9% 0.03 ± 5% perf-stat.i.MPKI
0.29 ± 8% +0.1 0.38 perf-stat.i.branch-miss-rate%
4759878 ± 7% +33.3% 6346860 perf-stat.i.branch-misses
201390 ± 4% +19.8% 241185 ± 6% perf-stat.i.cache-misses
891976 +14.7% 1023336 perf-stat.i.cache-references
0.48 ± 2% +3.1% 0.49 perf-stat.i.cpi
3.807e+09 +1.5% 3.866e+09 perf-stat.i.cpu-cycles
70.29 +2.0% 71.71 perf-stat.i.cpu-migrations
382.22 ± 3% +19.4% 456.21 ± 4% perf-stat.i.minor-faults
382.22 ± 3% +19.4% 456.20 ± 4% perf-stat.i.page-faults
0.03 ± 6% +21.4% 0.03 ± 6% perf-stat.overall.MPKI
0.28 ± 9% +0.1 0.38 perf-stat.overall.branch-miss-rate%
18830 ± 5% -15.0% 16001 ± 5% perf-stat.overall.cycles-between-cache-misses
8436 -19.3% 6810 perf-stat.overall.path-length
4633589 ± 7% +32.5% 6139073 perf-stat.ps.branch-misses
197159 ± 5% +18.9% 234444 ± 6% perf-stat.ps.cache-misses
871990 +13.9% 993519 perf-stat.ps.cache-references
68.47 +1.4% 69.43 perf-stat.ps.cpu-migrations
379.37 ± 3% +18.3% 448.62 ± 3% perf-stat.ps.minor-faults
379.37 ± 3% +18.3% 448.62 ± 3% perf-stat.ps.page-faults
2.831e+11 -19.3% 2.285e+11 perf-stat.total.instructions
8.95 ± 91% -6.7 2.23 ±141% perf-profile.calltrace.cycles-pp.do_file_open.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
8.95 ± 91% -6.7 2.23 ±141% perf-profile.calltrace.cycles-pp.path_openat.do_file_open.do_sys_openat2.__x64_sys_openat.do_syscall_64
6.48 ± 86% -5.4 1.04 ±223% perf-profile.calltrace.cycles-pp.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
6.48 ± 86% -5.4 1.04 ±223% perf-profile.calltrace.cycles-pp.do_sys_openat2.__x64_sys_openat.do_syscall_64.entry_SYSCALL_64_after_hwframe
5.42 ±109% -4.6 0.79 ±223% perf-profile.calltrace.cycles-pp.lruvec_stat_mod_folio.folio_remove_rmap_ptes.zap_present_ptes.zap_pte_range.zap_pmd_range
5.46 ± 47% -3.6 1.84 ±143% perf-profile.calltrace.cycles-pp.tlb_finish_mmu.exit_mmap.__mmput.exit_mm.do_exit
4.28 ± 73% -3.2 1.04 ±223% perf-profile.calltrace.cycles-pp.open_last_lookups.path_openat.do_file_open.do_sys_openat2.__x64_sys_openat
4.28 ± 73% -2.4 1.84 ±143% perf-profile.calltrace.cycles-pp.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput.exit_mm
4.28 ± 73% -2.4 1.84 ±143% perf-profile.calltrace.cycles-pp.free_pages_and_swap_cache.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap.__mmput
3.40 ±101% -2.4 1.04 ±223% perf-profile.calltrace.cycles-pp.lookup_open.open_last_lookups.path_openat.do_file_open.do_sys_openat2
2.99 ±101% -1.2 1.84 ±143% perf-profile.calltrace.cycles-pp.folios_put_refs.free_pages_and_swap_cache.__tlb_batch_free_encoded_pages.tlb_finish_mmu.exit_mmap
2.99 ±101% -0.6 2.43 ±143% perf-profile.calltrace.cycles-pp.vfs_statx.vfs_fstatat.__do_sys_newfstatat.do_syscall_64.entry_SYSCALL_64_after_hwframe
8.96 ± 91% -6.7 2.23 ±141% perf-profile.children.cycles-pp.do_file_open
8.96 ± 91% -6.7 2.23 ±141% perf-profile.children.cycles-pp.path_openat
5.46 ± 47% -3.6 1.84 ±143% perf-profile.children.cycles-pp.tlb_finish_mmu
4.28 ± 73% -3.2 1.04 ±223% perf-profile.children.cycles-pp.open_last_lookups
3.40 ±101% -2.4 1.04 ±223% perf-profile.children.cycles-pp.lookup_open
3.40 ±101% -2.2 1.19 ±223% perf-profile.children.cycles-pp.link_path_walk
4.14 ±102% -2.0 2.18 ±149% perf-profile.children.cycles-pp.lruvec_stat_mod_folio
3.88 ±105% -1.7 2.18 ±149% perf-profile.children.cycles-pp.kernfs_fop_readdir
3.92 ±109% -1.5 2.43 ±143% perf-profile.children.cycles-pp.__do_sys_newfstatat
3.92 ±109% -1.5 2.43 ±143% perf-profile.children.cycles-pp.vfs_fstatat
2.99 ±101% -0.6 2.43 ±143% perf-profile.children.cycles-pp.vfs_statx
5.80 ± 74% +6.5 12.30 ± 22% perf-profile.children.cycles-pp.exc_page_fault
3.88 ±105% -3.9 0.00 perf-profile.self.cycles-pp.kernfs_fop_readdir
Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Intel Corporation