From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 1/7] md/raid5: use mempool to allocate stripe_request_ctx
Date: Fri, 21 Nov 2025 13:13:59 +0800
Message-ID: <20251121051406.1316884-2-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

On the one hand, stripe_request_ctx is 72 bytes, which is rather large
for a stack variable. On the other hand, the sectors_to_do bitmap has a
fixed size, so max_hw_sectors_kb of a raid5 array is at most
256 stripes * 4KiB = 1MiB, which makes full-stripe IO impossible for any
array whose chunk_size * data_disks is bigger than that. Allocating the
ctx at runtime makes it possible to remove this limit.

Signed-off-by: Yu Kuai
---
 drivers/md/md.h       |  4 +++
 drivers/md/raid1-10.c |  5 ----
 drivers/md/raid5.c    | 58 +++++++++++++++++++++++++++----------------
 drivers/md/raid5.h    |  2 ++
 4 files changed, 42 insertions(+), 27 deletions(-)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 6985f2829bbd..75fd8c873b6f 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -22,6 +22,10 @@
 #include

 #define MaxSector (~(sector_t)0)
+/*
+ * Number of guaranteed raid bios in case of extreme VM load:
+ */
+#define NR_RAID_BIOS 256

 enum md_submodule_type {
 	MD_PERSONALITY = 0,
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 521625756128..c33099925f23 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -3,11 +3,6 @@
 #define RESYNC_BLOCK_SIZE (64*1024)
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

-/*
- * Number of guaranteed raid bios in case of extreme VM load:
- */
-#define NR_RAID_BIOS 256
-
 /* when we get a read error on a read-only array, we redirect to another
  * device without failing the first device, or trying to over-write to
  * correct the read error.  To keep track of bad blocks on a per-bio
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index cdbc7eba5c54..0ccb5907cd20 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6079,13 +6079,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struct r5conf *conf,
 static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 {
 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
-	bool on_wq;
 	struct r5conf *conf = mddev->private;
-	sector_t logical_sector;
-	struct stripe_request_ctx ctx = {};
 	const int rw = bio_data_dir(bi);
+	struct stripe_request_ctx *ctx;
+	sector_t logical_sector;
 	enum stripe_result res;
 	int s, stripe_cnt;
+	bool on_wq;

 	if (unlikely(bi->bi_opf & REQ_PREFLUSH)) {
 		int ret = log_handle_flush_request(conf, bi);
@@ -6097,11 +6097,6 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 			return true;
 		}
 		/* ret == -EAGAIN, fallback */
-		/*
-		 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
-		 * we need to flush journal device
-		 */
-		ctx.do_flush = bi->bi_opf & REQ_PREFLUSH;
 	}

 	md_write_start(mddev, bi);
@@ -6124,16 +6119,24 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	}

 	logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1);
-	ctx.first_sector = logical_sector;
-	ctx.last_sector = bio_end_sector(bi);
 	bi->bi_next = NULL;

-	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector,
+	ctx = mempool_alloc(conf->ctx_pool, GFP_NOIO | __GFP_ZERO);
+	ctx->first_sector = logical_sector;
+	ctx->last_sector = bio_end_sector(bi);
+	/*
+	 * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH,
+	 * we need to flush journal device
+	 */
+	if (unlikely(bi->bi_opf & REQ_PREFLUSH))
+		ctx->do_flush = true;
+
+	stripe_cnt = DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector,
 					   RAID5_STRIPE_SECTORS(conf));
-	bitmap_set(ctx.sectors_to_do, 0, stripe_cnt);
+	bitmap_set(ctx->sectors_to_do, 0, stripe_cnt);

 	pr_debug("raid456: %s, logical %llu to %llu\n", __func__,
-		 bi->bi_iter.bi_sector, ctx.last_sector);
+		 bi->bi_iter.bi_sector, ctx->last_sector);

 	/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
 	if ((bi->bi_opf & REQ_NOWAIT) &&
@@ -6141,6 +6144,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 		bio_wouldblock_error(bi);
 		if (rw == WRITE)
 			md_write_end(mddev);
+		mempool_free(ctx, conf->ctx_pool);
 		return true;
 	}
 	md_account_bio(mddev, &bi);
@@ -6159,10 +6163,10 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 		add_wait_queue(&conf->wait_for_reshape, &wait);
 		on_wq = true;
 	}
-	s = (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf);
+	s = (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf);

 	while (1) {
-		res = make_stripe_request(mddev, conf, &ctx, logical_sector,
+		res = make_stripe_request(mddev, conf, ctx, logical_sector,
 					  bi);
 		if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
 			break;
@@ -6179,9 +6183,9 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 			 * raid5_activate_delayed() from making progress
 			 * and thus deadlocking.
 			 */
-			if (ctx.batch_last) {
-				raid5_release_stripe(ctx.batch_last);
-				ctx.batch_last = NULL;
+			if (ctx->batch_last) {
+				raid5_release_stripe(ctx->batch_last);
+				ctx->batch_last = NULL;
 			}

 			wait_woken(&wait, TASK_UNINTERRUPTIBLE,
@@ -6189,21 +6193,23 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 			continue;
 		}

-		s = find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s);
+		s = find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s);
 		if (s == stripe_cnt)
 			break;

-		logical_sector = ctx.first_sector +
+		logical_sector = ctx->first_sector +
 			(s << RAID5_STRIPE_SHIFT(conf));
 	}
 	if (unlikely(on_wq))
 		remove_wait_queue(&conf->wait_for_reshape, &wait);

-	if (ctx.batch_last)
-		raid5_release_stripe(ctx.batch_last);
+	if (ctx->batch_last)
+		raid5_release_stripe(ctx->batch_last);

 	if (rw == WRITE)
 		md_write_end(mddev);
+
+	mempool_free(ctx, conf->ctx_pool);
 	if (res == STRIPE_WAIT_RESHAPE) {
 		md_free_cloned_bio(bi);
 		return false;
@@ -7370,6 +7376,7 @@ static void free_conf(struct r5conf *conf)
 	bioset_exit(&conf->bio_split);
 	kfree(conf->stripe_hashtbl);
 	kfree(conf->pending_data);
+	mempool_destroy(conf->ctx_pool);
 	kfree(conf);
 }

@@ -8053,6 +8060,13 @@ static int raid5_run(struct mddev *mddev)
 		goto abort;
 	}

+	conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
+					sizeof(struct stripe_request_ctx));
+	if (!conf->ctx_pool) {
+		ret = -ENOMEM;
+		goto abort;
+	}
+
 	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
 		goto abort;

diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index eafc6e9ed6ee..6e3f07119fa4 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -690,6 +690,8 @@ struct r5conf {
 	struct list_head	pending_list;
 	int			pending_data_cnt;
 	struct r5pending_data	*next_pending_data;
+
+	mempool_t		*ctx_pool;
 };

 #if PAGE_SIZE == DEFAULT_STRIPE_SIZE
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 2/7] md/raid5: make sure max_sectors is not less than io_opt
Date: Fri, 21 Nov 2025 13:14:01 +0800
Message-ID: <20251121051406.1316884-4-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

Otherwise, even if the user issues IO of io_opt size, it will be split
by max_sectors before it is submitted to raid5, and as a consequence
full-stripe IO is impossible. Note that dm-raid5 is not affected by this
patch and still has this problem.

Signed-off-by: Yu Kuai
---
 drivers/md/raid5.c | 35 ++++++++++++++++++++++++++---------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0ccb5907cd20..dc7bdbdb04b7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -773,14 +773,14 @@ struct stripe_request_ctx {
 	/* last sector in the request */
 	sector_t last_sector;

+	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
+	bool do_flush;
+
 	/*
 	 * bitmap to track stripe sectors that have been added to stripes
 	 * add one to account for unaligned requests
 	 */
-	DECLARE_BITMAP(sectors_to_do, RAID5_MAX_REQ_STRIPES + 1);
-
-	/* the request had REQ_PREFLUSH, cleared after the first stripe_head */
-	bool do_flush;
+	unsigned long sectors_to_do[];
 };

 /*
@@ -7732,6 +7732,24 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded)
 	return 0;
 }

+static int raid5_create_ctx_pool(struct r5conf *conf)
+{
+	struct stripe_request_ctx *ctx;
+	int size;
+
+	if (mddev_is_dm(conf->mddev))
+		size = BITS_TO_LONGS(RAID5_MAX_REQ_STRIPES);
+	else
+		size = BITS_TO_LONGS(
+			queue_max_hw_sectors(conf->mddev->gendisk->queue) >>
+			RAID5_STRIPE_SHIFT(conf));
+
+	conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
+				struct_size(ctx, sectors_to_do, size));
+
+	return conf->ctx_pool ? 0 : -ENOMEM;
+}
+
 static int raid5_set_limits(struct mddev *mddev)
 {
 	struct r5conf *conf = mddev->private;
@@ -7788,6 +7806,8 @@ static int raid5_set_limits(struct mddev *mddev)
 	 * Limit the max sectors based on this.
 	 */
 	lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
+	if ((lim.max_hw_sectors << 9) < lim.io_opt)
+		lim.max_hw_sectors = lim.io_opt >> 9;

 	/* No restrictions on the number of segments in the request */
 	lim.max_segments = USHRT_MAX;
@@ -8060,12 +8080,9 @@ static int raid5_run(struct mddev *mddev)
 		goto abort;
 	}

-	conf->ctx_pool = mempool_create_kmalloc_pool(NR_RAID_BIOS,
-					sizeof(struct stripe_request_ctx));
-	if (!conf->ctx_pool) {
-		ret = -ENOMEM;
+	ret = raid5_create_ctx_pool(conf);
+	if (ret)
 		goto abort;
-	}

 	if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
 		goto abort;
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 3/7] md: support to align bio to limits
Date: Fri, 21 Nov 2025 13:14:02 +0800
Message-ID: <20251121051406.1316884-5-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

For personalities that report an optimal IO size, users get the best IO
bandwidth when they issue IO of this size. There is, however, an
implicit condition: the IO must also be aligned to the optimal IO size.
Currently a bio is only split by limits; if its offset is not aligned to
the limits, none of the split bios will be aligned either. This patch
adds a new feature to align the bio to the limits first, and following
patches will enable it for each personality where necessary.
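To illustrate the split arithmetic, here is a minimal userspace sketch
with made-up numbers (the kernel code applies roundup()/rounddown() to
sector_t values in the same way):

#include <stdio.h>

typedef unsigned long long sector_t;

static sector_t roundup_s(sector_t x, sector_t a)   { return (x + a - 1) / a * a; }
static sector_t rounddown_s(sector_t x, sector_t a) { return x / a * a; }

int main(void)
{
	sector_t max_sectors = 2048;	/* e.g. 1MiB in 512-byte sectors */
	sector_t start = 1000;		/* unaligned bio offset */
	sector_t end = start + 4096;
	sector_t align_start = roundup_s(start, max_sectors);
	sector_t align_end = rounddown_s(end, max_sectors);

	/* mirrors __md_bio_align_to_limits(): split only when worthwhile */
	if (align_start != start && align_end > align_start)
		printf("head [%llu, %llu), aligned rest [%llu, %llu)\n",
		       start, align_start, align_start, end);
	return 0;
}

This prints "head [1000, 2048), aligned rest [2048, 5096)". Splitting
off only the unaligned head is enough: once the remainder starts on a
max_sectors boundary, every further split made by bio_split_to_limits()
stays aligned as well.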
Signed-off-by: Yu Kuai
---
 drivers/md/md.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h |  1 +
 2 files changed, 47 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7b5c5967568f..b09f87b27807 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -427,6 +427,48 @@ bool md_handle_request(struct mddev *mddev, struct bio *bio)
 }
 EXPORT_SYMBOL(md_handle_request);

+static struct bio *__md_bio_align_to_limits(struct mddev *mddev,
+					    struct bio *bio)
+{
+	unsigned int max_sectors = mddev->gendisk->queue->limits.max_sectors;
+	sector_t start = bio->bi_iter.bi_sector;
+	sector_t align_start = roundup(start, max_sectors);
+	sector_t end;
+	sector_t align_end;
+
+	/* already aligned */
+	if (align_start == start)
+		return bio;
+
+	end = start + bio_sectors(bio);
+	align_end = rounddown(end, max_sectors);
+
+	/* bio is too small to split */
+	if (align_end <= align_start)
+		return bio;
+
+	return bio_submit_split_bioset(bio, align_start - start,
+				       &mddev->gendisk->bio_split);
+}
+
+static struct bio *md_bio_align_to_limits(struct mddev *mddev, struct bio *bio)
+{
+	if (!mddev->bio_align_to_limits)
+		return bio;
+
+	/* atomic write can't split */
+	if (bio->bi_opf & REQ_ATOMIC)
+		return bio;
+
+	switch (bio_op(bio)) {
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
+		return __md_bio_align_to_limits(mddev, bio);
+	default:
+		return bio;
+	}
+}
+
 static void md_submit_bio(struct bio *bio)
 {
 	const int rw = bio_data_dir(bio);
@@ -442,6 +484,10 @@ static void md_submit_bio(struct bio *bio)
 		return;
 	}

+	bio = md_bio_align_to_limits(mddev, bio);
+	if (!bio)
+		return;
+
 	bio = bio_split_to_limits(bio);
 	if (!bio)
 		return;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 75fd8c873b6f..1ed90fd85ac4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -630,6 +630,7 @@ struct mddev {
 	bool	has_superblocks:1;
 	bool	fail_last_dev:1;
 	bool	serialize_policy:1;
+	bool	bio_align_to_limits:1;
 };

 enum recovery_flags {
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 4/7] md: add a helper md_config_align_limits()
Date: Fri, 21 Nov 2025 13:14:03 +0800
Message-ID: <20251121051406.1316884-6-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

This helper will be used by personalities that want to align bios to
io_opt to get the best IO bandwidth.

Signed-off-by: Yu Kuai
---
 drivers/md/md.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/drivers/md/md.h b/drivers/md/md.h
index 1ed90fd85ac4..c8190cf02701 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -1088,6 +1088,17 @@ static inline bool rdev_blocked(struct md_rdev *rdev)
 	return false;
 }

+static inline void md_config_align_limits(struct mddev *mddev,
+					  struct queue_limits *lim)
+{
+	if ((lim->max_hw_sectors << 9) < lim->io_opt)
+		lim->max_hw_sectors = lim->io_opt >> 9;
+	else
+		lim->max_hw_sectors = rounddown(lim->max_hw_sectors,
+						lim->io_opt >> 9);
+	mddev->bio_align_to_limits = true;
+}
+
 #define mddev_add_trace_msg(mddev, fmt, args...)			\
 do {									\
 	if (!mddev_is_dm(mddev))					\
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 5/7] md/raid5: align bio to io_opt
Date: Fri, 21 Nov 2025 13:14:04 +0800
Message-ID: <20251121051406.1316884-7-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

The raid5 internal implementation ensures that if a write bio is aligned
to io_opt, full-stripe writes are used, which is best for bandwidth
because no extra data has to be read to build the new xor (parity) data.
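For the array used in the test below, the optimal IO size works out as
follows (a raid5 array with n disks has n - 1 data disks per stripe):

  data_disks = 32 - 1 = 31
  io_opt     = data_disks * chunk_size = 31 * 64KiB = 1984KiB

A write of io_opt bytes starting at an io_opt-aligned offset covers
every data block of the stripes it touches, so the parity can be
computed from the incoming data alone, with no reads.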
A simple test in my VM, on a 32-disk raid5 array with 64KiB chunk size:

  dd if=/dev/zero of=/dev/md0 bs=100M oflag=direct

Before this patch: 782 MB/s
With this patch:   1.1 GB/s

Note that there are still other bottlenecks related to the stripe
handling that require further optimization.

Signed-off-by: Yu Kuai
---
 drivers/md/raid5.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index dc7bdbdb04b7..2db4e4fe913a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7806,8 +7806,7 @@ static int raid5_set_limits(struct mddev *mddev)
 	 * Limit the max sectors based on this.
 	 */
 	lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
-	if ((lim.max_hw_sectors << 9) < lim.io_opt)
-		lim.max_hw_sectors = lim.io_opt >> 9;
+	md_config_align_limits(mddev, &lim);

 	/* No restrictions on the number of segments in the request */
 	lim.max_segments = USHRT_MAX;
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 6/7] md/raid10: align bio to io_opt
Date: Fri, 21 Nov 2025 13:14:05 +0800
Message-ID: <20251121051406.1316884-8-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

The impact is less significant for raid10 than for raid5; however, it is
still preferable to issue IO evenly to the underlying disks.
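For example, assuming a near-2 layout with 4 disks and 512KiB chunks, one
stripe of data is 2 chunks = 1MiB: an io_opt-aligned 1MiB write places
exactly one chunk (plus its mirror) on every disk, while a misaligned one
straddles three chunks and loads the two mirror pairs unevenly.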
Signed-off-by: Yu Kuai
---
 drivers/md/raid10.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 84be4cc7e873..f6a4bb26fb4a 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4008,6 +4008,8 @@ static int raid10_set_queue_limits(struct mddev *mddev)
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
 		return err;
+
+	md_config_align_limits(mddev, &lim);
 	return queue_limits_set(mddev->gendisk->queue, &lim);
 }
-- 
2.51.0

From nobody Tue Dec 2 01:50:38 2025
From: Yu Kuai
To: song@kernel.org, yukuai@fnnas.com
Cc: linux-raid@vger.kernel.org, linux-kernel@vger.kernel.org, linan122@huawei.com, xni@redhat.com
Subject: [PATCH 7/7] md/raid0: align bio to io_opt
Date: Fri, 21 Nov 2025 13:14:06 +0800
Message-ID: <20251121051406.1316884-9-yukuai@fnnas.com>
In-Reply-To: <20251121051406.1316884-1-yukuai@fnnas.com>
References: <20251121051406.1316884-1-yukuai@fnnas.com>

The impact is less significant for raid0 than for raid5; however, it is
still preferable to issue IO evenly to the underlying disks.
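For example, assuming a 4-disk raid0 with 64KiB chunks, io_opt is
4 * 64KiB = 256KiB: a 256KiB write starting on a 256KiB boundary sends
exactly one chunk-sized request to each member disk, while a misaligned
one straddles five chunks and hits one disk twice.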
Signed-off-by: Yu Kuai
---
 drivers/md/raid0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 47aee1b1d4d1..332f413bcf51 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -388,6 +388,8 @@ static int raid0_set_limits(struct mddev *mddev)
 	err = mddev_stack_rdev_limits(mddev, &lim, MDDEV_STACK_INTEGRITY);
 	if (err)
 		return err;
+
+	md_config_align_limits(mddev, &lim);
 	return queue_limits_set(mddev->gendisk->queue, &lim);
 }
-- 
2.51.0