When stripe width (io_opt) is configured, align the predicted
preallocation size to stripe boundaries. This ensures optimal I/O
performance on RAID and other striped storage devices by avoiding
partial stripe operations.
The current implementation uses hardcoded size predictions (16KB, 32KB,
64KB, etc.) that are not stripe-aware. This causes physical block
offsets on disk to be misaligned to stripe boundaries, leading to
read-modify-write penalties on RAID arrays and reduced performance.
This patch makes size prediction stripe-aware by using multiples of
stripe size (1x, 2x, 4x, 8x, 16x, 32x) when s_stripe is set; requests
larger than 32x are rounded up to the next stripe multiple.
Additionally, the start offset is aligned to stripe boundaries using
rounddown(), which works correctly for both power-of-2 and non-power-of-2
stripe sizes. For devices without stripe configuration, or when the
stripe is larger than the max chunk size, the original behavior is
preserved.
The predicted size is limited to the max free chunk size (2 << bsbits
blocks, converted to bytes) to ensure reasonable allocation requests,
with the limit rounded down to a stripe multiple so that stripe
alignment is maintained.
Test case:
Device: 32-disk RAID5, 64KB chunk size
Stripe: 496 blocks (31 data disks × 16 blocks/disk)
Before patch (misaligned physical offsets):
ext: logical_offset: physical_offset: length:
0: 0.. 63487: 34816.. 98303: 63488
1: 63488..126975: 100352..163839: 63488
2: 126976..190463: 165888..229375: 63488
3: 190464..253951: 231424..294911: 63488
4: 253952..262143: 296960..305151: 8192
Physical offsets: 34816 % 496 = 96 (misaligned)
100352 % 496 = 160 (misaligned)
165888 % 496 = 224 (misaligned)
→ Causes partial stripe writes on RAID
After patch (aligned physical offsets):
ext: logical_offset: physical_offset: length:
0: 0.. 17855: 9920.. 27775: 17856
1: 17856.. 42159: 34224.. 58527: 24304
2: 42160.. 73407: 65968.. 97215: 31248
3: 73408.. 97711: 99696..123999: 24304
... (all extents aligned until EOF)
Physical offsets: 9920 % 496 = 0 (aligned)
34224 % 496 = 0 (aligned)
65968 % 496 = 0 (aligned)
Extent lengths: 17856=496×36, 24304=496×49, 31248=496×63
→ Optimal RAID performance, no partial stripe writes
Benefits:
- Eliminates read-modify-write operations on RAID arrays
- Improves sequential write performance on striped devices
- Maintains proper alignment throughout file lifetime
- Works with any stripe size (power-of-2 or not)
Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
fs/ext4/mballoc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 58 insertions(+), 2 deletions(-)
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eb46a4f5fb4f..dbd0b239cc96 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4500,7 +4500,10 @@ static inline bool ext4_mb_check_size(loff_t req, loff_t size,
/*
* Predict file size for preallocation. Returns the predicted size
- * in bytes and sets start_off if alignment is needed for large files.
+ * in bytes. When stripe width (io_opt) is configured, returns sizes
+ * that are multiples of stripe for optimal RAID performance.
+ *
+ * Sets start_off if alignment is needed for large files.
*/
static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
struct ext4_allocation_context *ac,
@@ -4511,6 +4514,59 @@ static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
*start_off = 0;
+ /*
+ * For RAID/striped devices, align preallocation size to stripe
+ * width (io_opt) for optimal I/O performance. Use power-of-2
+ * multiples of stripe size for size prediction.
+ */
+ if (sbi->s_stripe) {
+ loff_t stripe_bytes = (loff_t)sbi->s_stripe << bsbits;
+ loff_t max_size = (loff_t)max << bsbits;
+
+ /*
+ * TODO: If stripe is larger than max chunk size, we can't
+ * do stripe-aligned allocation. Fall back to traditional
+ * size prediction. This can happen with very large stripe
+ * configurations on small block sizes.
+ */
+ if (stripe_bytes > max_size)
+ goto no_stripe;
+
+ if (size <= stripe_bytes) {
+ size = stripe_bytes;
+ } else if (size <= stripe_bytes * 2) {
+ size = stripe_bytes * 2;
+ } else if (size <= stripe_bytes * 4) {
+ size = stripe_bytes * 4;
+ } else if (size <= stripe_bytes * 8) {
+ size = stripe_bytes * 8;
+ } else if (size <= stripe_bytes * 16) {
+ size = stripe_bytes * 16;
+ } else if (size <= stripe_bytes * 32) {
+ size = stripe_bytes * 32;
+ } else {
+ size = roundup(size, stripe_bytes);
+ }
+
+ /*
+ * Limit size to max free chunk size, rounded down to
+ * stripe alignment.
+ */
+ if (size > max_size)
+ size = rounddown(max_size, stripe_bytes);
+
+ /*
+ * Align start offset to stripe boundary for large allocations
+ * to ensure both start and size are stripe-aligned.
+ */
+ *start_off = rounddown((loff_t)ac->ac_o_ex.fe_logical << bsbits,
+ stripe_bytes);
+
+ return size;
+ }
+
+no_stripe:
+ /* No stripe: use traditional hardcoded size prediction */
if (size <= 16 * 1024) {
size = 16 * 1024;
} else if (size <= 32 * 1024) {
@@ -4556,7 +4612,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_super_block *es = sbi->s_es;
- int bsbits, max;
+ int bsbits;
loff_t size, start_off = 0, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
--
2.51.0
Hi Yu,
kernel test robot noticed the following build errors:
[auto build test ERROR on tytso-ext4/dev]
[also build test ERROR on linus/master v6.18 next-20251211]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/ext4-refactor-size-prediction-into-helper-functions/20251208-163553
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/r/20251208083246.320965-3-yukuai%40fnnas.com
patch subject: [PATCH 2/2] ext4: align preallocation size to stripe width
config: arm-randconfig-r072-20251210 (https://download.01.org/0day-ci/archive/20251212/202512120613.mM5COVWV-lkp@intel.com/config)
compiler: arm-linux-gnueabi-gcc (GCC) 12.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251212/202512120613.mM5COVWV-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512120613.mM5COVWV-lkp@intel.com/
All errors (new ones prefixed by >>):
arm-linux-gnueabi-ld: fs/ext4/mballoc.o: in function `ext4_mb_predict_file_size':
>> mballoc.c:(.text+0x242): undefined reference to `__aeabi_ldivmod'
arm-linux-gnueabi-ld: (__aeabi_ldivmod): Unknown destination type (ARM/Thumb) in fs/ext4/mballoc.o
>> mballoc.c:(.text+0x242): dangerous relocation: unsupported relocation
>> arm-linux-gnueabi-ld: mballoc.c:(.text+0x268): undefined reference to `__aeabi_ldivmod'
arm-linux-gnueabi-ld: (__aeabi_ldivmod): Unknown destination type (ARM/Thumb) in fs/ext4/mballoc.o
mballoc.c:(.text+0x268): dangerous relocation: unsupported relocation
arm-linux-gnueabi-ld: mballoc.c:(.text+0x29c): undefined reference to `__aeabi_ldivmod'
arm-linux-gnueabi-ld: (__aeabi_ldivmod): Unknown destination type (ARM/Thumb) in fs/ext4/mballoc.o
mballoc.c:(.text+0x29c): dangerous relocation: unsupported relocation
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Yu,
kernel test robot noticed the following build errors:
[auto build test ERROR on tytso-ext4/dev]
[also build test ERROR on linus/master v6.18 next-20251210]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/ext4-refactor-size-prediction-into-helper-functions/20251208-163553
base: https://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4.git dev
patch link: https://lore.kernel.org/r/20251208083246.320965-3-yukuai%40fnnas.com
patch subject: [PATCH 2/2] ext4: align preallocation size to stripe width
config: i386-randconfig-001-20251210 (https://download.01.org/0day-ci/archive/20251210/202512102331.yweFnVTU-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20251210/202512102331.yweFnVTU-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202512102331.yweFnVTU-lkp@intel.com/
All errors (new ones prefixed by >>):
ld: fs/ext4/mballoc.o: in function `ext4_mb_predict_file_size':
>> fs/ext4/mballoc.c:4570:(.text+0xc37): undefined reference to `__moddi3'
>> ld: fs/ext4/mballoc.c:4578:(.text+0xc7c): undefined reference to `__moddi3'
ld: fs/ext4/mballoc.c:4584:(.text+0xcf2): undefined reference to `__moddi3'
vim +4570 fs/ext4/mballoc.c
4522
4523 /*
4524 * Predict file size for preallocation. Returns the predicted size
4525 * in bytes. When stripe width (io_opt) is configured, returns sizes
4526 * that are multiples of stripe for optimal RAID performance.
4527 *
4528 * Sets start_off if alignment is needed for large files.
4529 */
4530 static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
4531 struct ext4_allocation_context *ac,
4532 loff_t size, loff_t *start_off)
4533 {
4534 int bsbits = ac->ac_sb->s_blocksize_bits;
4535 int max = 2 << bsbits;
4536
4537 *start_off = 0;
4538
4539 /*
4540 * For RAID/striped devices, align preallocation size to stripe
4541 * width (io_opt) for optimal I/O performance. Use power-of-2
4542 * multiples of stripe size for size prediction.
4543 */
4544 if (sbi->s_stripe) {
4545 loff_t stripe_bytes = (loff_t)sbi->s_stripe << bsbits;
4546 loff_t max_size = (loff_t)max << bsbits;
4547
4548 /*
4549 * TODO: If stripe is larger than max chunk size, we can't
4550 * do stripe-aligned allocation. Fall back to traditional
4551 * size prediction. This can happen with very large stripe
4552 * configurations on small block sizes.
4553 */
4554 if (stripe_bytes > max_size)
4555 goto no_stripe;
4556
4557 if (size <= stripe_bytes) {
4558 size = stripe_bytes;
4559 } else if (size <= stripe_bytes * 2) {
4560 size = stripe_bytes * 2;
4561 } else if (size <= stripe_bytes * 4) {
4562 size = stripe_bytes * 4;
4563 } else if (size <= stripe_bytes * 8) {
4564 size = stripe_bytes * 8;
4565 } else if (size <= stripe_bytes * 16) {
4566 size = stripe_bytes * 16;
4567 } else if (size <= stripe_bytes * 32) {
4568 size = stripe_bytes * 32;
4569 } else {
> 4570 size = roundup(size, stripe_bytes);
4571 }
4572
4573 /*
4574 * Limit size to max free chunk size, rounded down to
4575 * stripe alignment.
4576 */
4577 if (size > max_size)
> 4578 size = rounddown(max_size, stripe_bytes);
4579
4580 /*
4581 * Align start offset to stripe boundary for large allocations
4582 * to ensure both start and size are stripe-aligned.
4583 */
4584 *start_off = rounddown((loff_t)ac->ac_o_ex.fe_logical << bsbits,
4585 stripe_bytes);
4586
4587 return size;
4588 }
4589
4590 no_stripe:
4591 /* No stripe: use traditional hardcoded size prediction */
4592 if (size <= 16 * 1024) {
4593 size = 16 * 1024;
4594 } else if (size <= 32 * 1024) {
4595 size = 32 * 1024;
4596 } else if (size <= 64 * 1024) {
4597 size = 64 * 1024;
4598 } else if (size <= 128 * 1024) {
4599 size = 128 * 1024;
4600 } else if (size <= 256 * 1024) {
4601 size = 256 * 1024;
4602 } else if (size <= 512 * 1024) {
4603 size = 512 * 1024;
4604 } else if (size <= 1024 * 1024) {
4605 size = 1024 * 1024;
4606 } else if (ext4_mb_check_size(size, 4 * 1024 * 1024, max, 2 * 1024)) {
4607 *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4608 (21 - bsbits)) << 21;
4609 size = 2 * 1024 * 1024;
4610 } else if (ext4_mb_check_size(size, 8 * 1024 * 1024, max, 4 * 1024)) {
4611 *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4612 (22 - bsbits)) << 22;
4613 size = 4 * 1024 * 1024;
4614 } else if (ext4_mb_check_size(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
4615 (8<<20)>>bsbits, max, 8 * 1024)) {
4616 *start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
4617 (23 - bsbits)) << 23;
4618 size = 8 * 1024 * 1024;
4619 } else {
4620 *start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
4621 size = (loff_t)EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits;
4622 }
4623
4624 return size;
4625 }
4626
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2025 Red Hat, Inc.