From nobody Fri Dec 19 11:32:51 2025
From: Yu Kuai <yukuai@fnnas.com>
To: tytso@mit.edu, adilger.kernel@dilger.ca, linux-ext4@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, yukuai@fnnas.com
Subject: [PATCH 1/2] ext4: refactor size prediction into helper functions
Date: Mon, 8 Dec 2025 16:32:45 +0800
Message-ID: <20251208083246.320965-2-yukuai@fnnas.com>
In-Reply-To: <20251208083246.320965-1-yukuai@fnnas.com>
References: <20251208083246.320965-1-yukuai@fnnas.com>

ext4_mb_normalize_request() contains a large if-else ladder for
predicting file size and relies on the NRL_CHECK_SIZE macro. Factor
both out into proper helper functions to improve code readability and
maintainability.

This patch introduces:
- ext4_mb_check_size(): a static inline function replacing the
  NRL_CHECK_SIZE macro
- ext4_mb_predict_file_size(): extracts the size prediction logic

No functional change.

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 fs/ext4/mballoc.c | 101 +++++++++++++++++++++++++++-------------------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 9087183602e4..eb46a4f5fb4f 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4489,6 +4489,63 @@ ext4_mb_pa_adjust_overlap(struct ext4_allocation_context *ac,
 	*end = new_end;
 }
 
+/*
+ * Check if the request size allows for chunk-based allocation
+ */
+static inline bool ext4_mb_check_size(loff_t req, loff_t size,
+				      int max, int chunk_size)
+{
+	return (req <= size) || (max <= chunk_size);
+}
+
+/*
+ * Predict file size for preallocation. Returns the predicted size
+ * in bytes and sets start_off if alignment is needed for large files.
+ */
+static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
+					struct ext4_allocation_context *ac,
+					loff_t size, loff_t *start_off)
+{
+	int bsbits = ac->ac_sb->s_blocksize_bits;
+	int max = 2 << bsbits;
+
+	*start_off = 0;
+
+	if (size <= 16 * 1024) {
+		size = 16 * 1024;
+	} else if (size <= 32 * 1024) {
+		size = 32 * 1024;
+	} else if (size <= 64 * 1024) {
+		size = 64 * 1024;
+	} else if (size <= 128 * 1024) {
+		size = 128 * 1024;
+	} else if (size <= 256 * 1024) {
+		size = 256 * 1024;
+	} else if (size <= 512 * 1024) {
+		size = 512 * 1024;
+	} else if (size <= 1024 * 1024) {
+		size = 1024 * 1024;
+	} else if (ext4_mb_check_size(size, 4 * 1024 * 1024, max, 2 * 1024)) {
+		*start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+			      (21 - bsbits)) << 21;
+		size = 2 * 1024 * 1024;
+	} else if (ext4_mb_check_size(size, 8 * 1024 * 1024, max, 4 * 1024)) {
+		*start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+			      (22 - bsbits)) << 22;
+		size = 4 * 1024 * 1024;
+	} else if (ext4_mb_check_size(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
+				      (8<<20)>>bsbits, max, 8 * 1024)) {
+		*start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
+			      (23 - bsbits)) << 23;
+		size = 8 * 1024 * 1024;
+	} else {
+		*start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits;
+		size = (loff_t)EXT4_C2B(sbi, ac->ac_o_ex.fe_len) << bsbits;
+	}
+
+	return size;
+}
+
 /*
  * Normalization means making request better in terms of
  * size and alignment
@@ -4500,7 +4557,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_super_block *es = sbi->s_es;
 	int bsbits, max;
-	loff_t size, start_off, end;
+	loff_t size, start_off = 0, end;
 	loff_t orig_size __maybe_unused;
 	ext4_lblk_t start;
 
@@ -4533,47 +4590,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 	size = i_size_read(ac->ac_inode);
 	orig_size = size;
 
-	/* max size of free chunks */
-	max = 2 << bsbits;
+	/* Predict file size for preallocation */
+	size = ext4_mb_predict_file_size(sbi, ac, size, &start_off);
 
-#define NRL_CHECK_SIZE(req, size, max, chunk_size)	\
-	(req <= (size) || max <= (chunk_size))
-
-	/* first, try to predict filesize */
-	/* XXX: should this table be tunable? */
-	start_off = 0;
-	if (size <= 16 * 1024) {
-		size = 16 * 1024;
-	} else if (size <= 32 * 1024) {
-		size = 32 * 1024;
-	} else if (size <= 64 * 1024) {
-		size = 64 * 1024;
-	} else if (size <= 128 * 1024) {
-		size = 128 * 1024;
-	} else if (size <= 256 * 1024) {
-		size = 256 * 1024;
-	} else if (size <= 512 * 1024) {
-		size = 512 * 1024;
-	} else if (size <= 1024 * 1024) {
-		size = 1024 * 1024;
-	} else if (NRL_CHECK_SIZE(size, 4 * 1024 * 1024, max, 2 * 1024)) {
-		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-						(21 - bsbits)) << 21;
-		size = 2 * 1024 * 1024;
-	} else if (NRL_CHECK_SIZE(size, 8 * 1024 * 1024, max, 4 * 1024)) {
-		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-							(22 - bsbits)) << 22;
-		size = 4 * 1024 * 1024;
-	} else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
-					(8<<20)>>bsbits, max, 8 * 1024)) {
-		start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
-							(23 - bsbits)) << 23;
-		size = 8 * 1024 * 1024;
-	} else {
-		start_off = (loff_t) ac->ac_o_ex.fe_logical << bsbits;
-		size	  = (loff_t) EXT4_C2B(sbi,
-					ac->ac_o_ex.fe_len) << bsbits;
-	}
 	size = size >> bsbits;
 	start = start_off >> bsbits;
 
-- 
2.51.0

From nobody Fri Dec 19 11:32:51 2025
From: Yu Kuai <yukuai@fnnas.com>
To: tytso@mit.edu, adilger.kernel@dilger.ca, linux-ext4@vger.kernel.org
Cc: linux-kernel@vger.kernel.org, yukuai@fnnas.com
Subject: [PATCH 2/2] ext4: align preallocation size to stripe width
Date: Mon, 8 Dec 2025 16:32:46 +0800
Message-ID: <20251208083246.320965-3-yukuai@fnnas.com>
In-Reply-To: <20251208083246.320965-1-yukuai@fnnas.com>
References: <20251208083246.320965-1-yukuai@fnnas.com>

When a stripe width (io_opt) is configured, align the predicted
preallocation size to stripe boundaries. This ensures optimal I/O
performance on RAID and other striped storage devices by avoiding
partial stripe operations.

The current implementation uses hardcoded size predictions (16KB,
32KB, 64KB, etc.) that are not stripe-aware. This causes physical
block offsets on disk to be misaligned to stripe boundaries, leading
to read-modify-write penalties on RAID arrays and reduced
performance.
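For instance, to make the misalignment concrete: with 4 KiB blocks,
the legacy 8 MiB bucket aligns start_off to a 2048-block boundary,
and with the 496-block stripe used in the test case below,
2048 % 496 = 64, so every such extent begins 64 blocks past a stripe
boundary.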
This patch makes size prediction stripe-aware by using power-of-2
multiples of the stripe size (1x, 2x, 4x, 8x, 16x, 32x) when s_stripe
is set. Additionally, the start offset is aligned to a stripe
boundary using rounddown(), which works correctly for both power-of-2
and non-power-of-2 stripe sizes.

For devices without a stripe configuration, the original behavior is
preserved. The predicted size is limited to the max free chunk size
(2 << bsbits) to keep allocation requests reasonable, with the limit
rounded down to maintain stripe alignment.

Test case:
  Device: 32-disk RAID5, 64KB chunk size
  Stripe: 496 blocks (31 data disks × 16 blocks/disk)

Before patch (misaligned physical offsets):
  ext: logical_offset:   physical_offset:  length:
    0:      0..  63487:   34816..  98303:   63488
    1:  63488.. 126975:  100352.. 163839:   63488
    2: 126976.. 190463:  165888.. 229375:   63488
    3: 190464.. 253951:  231424.. 294911:   63488
    4: 253952.. 262143:  296960.. 305151:    8192

  Physical offsets:
    34816  % 496 = 96   (misaligned)
    100352 % 496 = 160  (misaligned)
    165888 % 496 = 224  (misaligned)
  → causes partial stripe writes on RAID

After patch (aligned physical offsets):
  ext: logical_offset:   physical_offset:  length:
    0:      0..  17855:    9920..  27775:   17856
    1:  17856..  42159:   34224..  58527:   24304
    2:  42160..  73407:   65968..  97215:   31248
    3:  73408..  97711:   99696.. 123999:   24304
  ... (all extents aligned until EOF)

  Physical offsets:
    9920  % 496 = 0  (aligned)
    34224 % 496 = 0  (aligned)
    65968 % 496 = 0  (aligned)
  Extent lengths: 17856 = 496 × 36, 24304 = 496 × 49, 31248 = 496 × 63
  → optimal RAID performance, no partial stripe writes

Benefits:
- Eliminates read-modify-write operations on RAID arrays
- Improves sequential write performance on striped devices
- Maintains proper alignment throughout the file's lifetime
- Works with any stripe size (power-of-2 or not)

Signed-off-by: Yu Kuai <yukuai@fnnas.com>
---
 fs/ext4/mballoc.c | 60 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index eb46a4f5fb4f..dbd0b239cc96 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4500,7 +4500,10 @@ static inline bool ext4_mb_check_size(loff_t req, loff_t size,
 
 /*
  * Predict file size for preallocation. Returns the predicted size
- * in bytes and sets start_off if alignment is needed for large files.
+ * in bytes. When a stripe width (io_opt) is configured, returns sizes
+ * that are multiples of the stripe for optimal RAID performance.
+ *
+ * Sets start_off if alignment is needed for large files.
  */
 static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
 					struct ext4_allocation_context *ac,
@@ -4511,6 +4514,59 @@ static loff_t ext4_mb_predict_file_size(struct ext4_sb_info *sbi,
 
 	*start_off = 0;
 
+	/*
+	 * For RAID/striped devices, align the preallocation size to the
+	 * stripe width (io_opt) for optimal I/O performance. Use
+	 * power-of-2 multiples of the stripe size for size prediction.
+	 */
+	if (sbi->s_stripe) {
+		loff_t stripe_bytes = (loff_t)sbi->s_stripe << bsbits;
+		loff_t max_size = (loff_t)max << bsbits;
+
+		/*
+		 * If the stripe is larger than the max chunk size, we
+		 * can't do stripe-aligned allocation; fall back to the
+		 * traditional size prediction. This can happen with very
+		 * large stripe configurations on small block sizes.
+		 */
+		if (stripe_bytes > max_size)
+			goto no_stripe;
+
+		if (size <= stripe_bytes) {
+			size = stripe_bytes;
+		} else if (size <= stripe_bytes * 2) {
+			size = stripe_bytes * 2;
+		} else if (size <= stripe_bytes * 4) {
+			size = stripe_bytes * 4;
+		} else if (size <= stripe_bytes * 8) {
+			size = stripe_bytes * 8;
+		} else if (size <= stripe_bytes * 16) {
+			size = stripe_bytes * 16;
+		} else if (size <= stripe_bytes * 32) {
+			size = stripe_bytes * 32;
+		} else {
+			size = roundup(size, stripe_bytes);
+		}
+
+		/*
+		 * Limit size to the max free chunk size, rounded down to
+		 * stripe alignment.
+		 */
+		if (size > max_size)
+			size = rounddown(max_size, stripe_bytes);
+
+		/*
+		 * Align the start offset to a stripe boundary for large
+		 * allocations to ensure both start and size are
+		 * stripe-aligned.
+		 */
+		*start_off = rounddown((loff_t)ac->ac_o_ex.fe_logical << bsbits,
+				       stripe_bytes);
+
+		return size;
+	}
+
+no_stripe:
+	/* No stripe: use the traditional hardcoded size prediction */
 	if (size <= 16 * 1024) {
 		size = 16 * 1024;
 	} else if (size <= 32 * 1024) {
@@ -4556,7 +4612,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_super_block *es = sbi->s_es;
-	int bsbits, max;
+	int bsbits;
 	loff_t size, start_off = 0, end;
 	loff_t orig_size __maybe_unused;
 	ext4_lblk_t start;
-- 
2.51.0
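As a footnote for reviewers, the stripe-size arithmetic can be
checked outside the kernel. Below is a minimal user-space sketch of
the prediction introduced above (illustrative only, not part of the
series): it assumes 4 KiB blocks and the 496-block stripe from the
test case, omits the max-free-chunk cap, and uses small helpers in
place of the kernel's rounddown()/roundup() macros.

    #include <stdio.h>

    static long long rounddown_ll(long long x, long long y)
    {
    	return x - (x % y);
    }

    static long long roundup_ll(long long x, long long y)
    {
    	return ((x + y - 1) / y) * y;
    }

    int main(void)
    {
    	const int bsbits = 12;				/* 4 KiB blocks */
    	const long long stripe_bytes = 496LL << bsbits;	/* ~1.94 MiB */
    	const long long sizes[] = { 1LL << 20, 3LL << 20, 100LL << 20 };

    	for (int i = 0; i < 3; i++) {
    		long long size = sizes[i], m = stripe_bytes;

    		/* power-of-2 stripe multiples up to 32x, as in the patch */
    		while (m < size && m < stripe_bytes * 32)
    			m *= 2;
    		size = size <= m ? m : roundup_ll(size, stripe_bytes);

    		printf("i_size %10lld -> prealloc %10lld (%lld stripes)\n",
    		       sizes[i], size, size / stripe_bytes);
    	}

    	/* the start offset is always rounded down to a stripe boundary */
    	printf("start %lld -> %lld\n", 123456789LL,
    	       rounddown_ll(123456789LL, stripe_bytes));
    	return 0;
    }

Compiled with plain cc, this shows a 1 MiB file padded to one stripe
(2031616 bytes), a 3 MiB file to two stripes, and a 100 MiB file
rounded up to 52 stripes, with any start offset snapped down to a
stripe boundary.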