From nobody Tue Dec 2 01:07:58 2025 Received: from smtp.kernel.org (aws-us-west-2-korg-mail-1.web.codeaurora.org [10.30.226.201]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4883E2DEA68; Mon, 24 Nov 2025 06:32:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1763965936; cv=none; b=TveaJyihNGpY98cAckh6MyTjZ6XiArQlUFEPbgEP3bFdVPebn/7ZGBcnQjoglD6nVgbydAwugp31DJzxQ3s/ZZ8OI2erupZ7OcM2W1FKxyyRa5rupchYhZjGy+G3UE2b5gtY7oda5YI8ADXNHEChsKDX+lp/k9HMA7N0ym6OS+A= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1763965936; c=relaxed/simple; bh=IePfBPIxr+E6Ux/GVNLZOkeaVAPNGv0frUzzEzz0u6A=; h=From:To:Cc:Subject:Date:Message-ID:In-Reply-To:References: MIME-Version; b=W/7jkEUhvtiE355qb7HE+qNjYaPRvj5XVIqqMEjftMN6bkwcVIIeRNt70dsGtKXrARr3Hp9lvhSenfQf5VNCeJwz6DO8Mk62Je7D6tlbEHuzXsAeqmfjwrLSVbr10+mynTxqg3Wiwv+ysbiCO5zcANHIAKmxfXxfk4Yc8/AboRE= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; arc=none smtp.client-ip=10.30.226.201 Received: by smtp.kernel.org (Postfix) with ESMTPSA id 4C251C116C6; Mon, 24 Nov 2025 06:32:14 +0000 (UTC) From: Yu Kuai To: song@kernel.org, linux-raid@vger.kernel.org Cc: linux-kernel@vger.kernel.org, filippo@debian.org, colyli@fnnas.com, yukuai@fnnas.com Subject: [PATCH v2 04/11] md/raid5: use mempool to allocate stripe_request_ctx Date: Mon, 24 Nov 2025 14:31:56 +0800 Message-ID: <20251124063203.1692144-5-yukuai@fnnas.com> X-Mailer: git-send-email 2.51.0 In-Reply-To: <20251124063203.1692144-1-yukuai@fnnas.com> References: <20251124063203.1692144-1-yukuai@fnnas.com> Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" On the one 
hand, stripe_request_ctx is 72 bytes, which is rather large for a stack variable. On the other hand, the bitmap sectors_to_do has a fixed size, which limits max_hw_sectors_kb of a raid5 array to at most 256 * 4k =3D 1MB, and this makes full stripe IO impossible for arrays where chunk_size * data_disks is bigger. Allocating ctx at runtime makes it possible to get rid of this limit. Signed-off-by: Yu Kuai --- drivers/md/md.h | 4 +++ drivers/md/raid1-10.c | 5 ---- drivers/md/raid5.c | 61 +++++++++++++++++++++++++++---------------- drivers/md/raid5.h | 2 ++ 4 files changed, 45 insertions(+), 27 deletions(-) diff --git a/drivers/md/md.h b/drivers/md/md.h index 6ee18045f41c..b8c5dec12b62 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -22,6 +22,10 @@ #include =20 #define MaxSector (~(sector_t)0) +/* + * Number of guaranteed raid bios in case of extreme VM load: + */ +#define NR_RAID_BIOS 256 =20 enum md_submodule_type { MD_PERSONALITY =3D 0, diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 521625756128..c33099925f23 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -3,11 +3,6 @@ #define RESYNC_BLOCK_SIZE (64*1024) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) =20 -/* - * Number of guaranteed raid bios in case of extreme VM load: - */ -#define NR_RAID_BIOS 256 - /* when we get a read error on a read-only array, we redirect to another * device without failing the first device, or trying to over-write to * correct the read error. 
To keep track of bad blocks on a per-bio diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f405ba7b99a7..0080dec4a6ef 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6083,13 +6083,13 @@ static sector_t raid5_bio_lowest_chunk_sector(struc= t r5conf *conf, static bool raid5_make_request(struct mddev *mddev, struct bio * bi) { DEFINE_WAIT_FUNC(wait, woken_wake_function); - bool on_wq; struct r5conf *conf =3D mddev->private; - sector_t logical_sector; - struct stripe_request_ctx ctx =3D {}; const int rw =3D bio_data_dir(bi); + struct stripe_request_ctx *ctx; + sector_t logical_sector; enum stripe_result res; int s, stripe_cnt; + bool on_wq; =20 if (unlikely(bi->bi_opf & REQ_PREFLUSH)) { int ret =3D log_handle_flush_request(conf, bi); @@ -6101,11 +6101,6 @@ static bool raid5_make_request(struct mddev *mddev, = struct bio * bi) return true; } /* ret =3D=3D -EAGAIN, fallback */ - /* - * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, - * we need to flush journal device - */ - ctx.do_flush =3D bi->bi_opf & REQ_PREFLUSH; } =20 md_write_start(mddev, bi); @@ -6128,16 +6123,24 @@ static bool raid5_make_request(struct mddev *mddev,= struct bio * bi) } =20 logical_sector =3D bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTO= RS(conf)-1); - ctx.first_sector =3D logical_sector; - ctx.last_sector =3D bio_end_sector(bi); bi->bi_next =3D NULL; =20 - stripe_cnt =3D DIV_ROUND_UP_SECTOR_T(ctx.last_sector - logical_sector, + ctx =3D mempool_alloc(conf->ctx_pool, GFP_NOIO | __GFP_ZERO); + ctx->first_sector =3D logical_sector; + ctx->last_sector =3D bio_end_sector(bi); + /* + * if r5l_handle_flush_request() didn't clear REQ_PREFLUSH, + * we need to flush journal device + */ + if (unlikely(bi->bi_opf & REQ_PREFLUSH)) + ctx->do_flush =3D true; + + stripe_cnt =3D DIV_ROUND_UP_SECTOR_T(ctx->last_sector - logical_sector, RAID5_STRIPE_SECTORS(conf)); - bitmap_set(ctx.sectors_to_do, 0, stripe_cnt); + bitmap_set(ctx->sectors_to_do, 0, stripe_cnt); =20 
pr_debug("raid456: %s, logical %llu to %llu\n", __func__, - bi->bi_iter.bi_sector, ctx.last_sector); + bi->bi_iter.bi_sector, ctx->last_sector); =20 /* Bail out if conflicts with reshape and REQ_NOWAIT is set */ if ((bi->bi_opf & REQ_NOWAIT) && @@ -6145,6 +6148,7 @@ static bool raid5_make_request(struct mddev *mddev, s= truct bio * bi) bio_wouldblock_error(bi); if (rw =3D=3D WRITE) md_write_end(mddev); + mempool_free(ctx, conf->ctx_pool); return true; } md_account_bio(mddev, &bi); @@ -6163,10 +6167,10 @@ static bool raid5_make_request(struct mddev *mddev,= struct bio * bi) add_wait_queue(&conf->wait_for_reshape, &wait); on_wq =3D true; } - s =3D (logical_sector - ctx.first_sector) >> RAID5_STRIPE_SHIFT(conf); + s =3D (logical_sector - ctx->first_sector) >> RAID5_STRIPE_SHIFT(conf); =20 while (1) { - res =3D make_stripe_request(mddev, conf, &ctx, logical_sector, + res =3D make_stripe_request(mddev, conf, ctx, logical_sector, bi); if (res =3D=3D STRIPE_FAIL || res =3D=3D STRIPE_WAIT_RESHAPE) break; @@ -6183,9 +6187,9 @@ static bool raid5_make_request(struct mddev *mddev, s= truct bio * bi) * raid5_activate_delayed() from making progress * and thus deadlocking. 
*/ - if (ctx.batch_last) { - raid5_release_stripe(ctx.batch_last); - ctx.batch_last =3D NULL; + if (ctx->batch_last) { + raid5_release_stripe(ctx->batch_last); + ctx->batch_last =3D NULL; } =20 wait_woken(&wait, TASK_UNINTERRUPTIBLE, @@ -6193,21 +6197,23 @@ static bool raid5_make_request(struct mddev *mddev,= struct bio * bi) continue; } =20 - s =3D find_next_bit_wrap(ctx.sectors_to_do, stripe_cnt, s); + s =3D find_next_bit_wrap(ctx->sectors_to_do, stripe_cnt, s); if (s =3D=3D stripe_cnt) break; =20 - logical_sector =3D ctx.first_sector + + logical_sector =3D ctx->first_sector + (s << RAID5_STRIPE_SHIFT(conf)); } if (unlikely(on_wq)) remove_wait_queue(&conf->wait_for_reshape, &wait); =20 - if (ctx.batch_last) - raid5_release_stripe(ctx.batch_last); + if (ctx->batch_last) + raid5_release_stripe(ctx->batch_last); =20 if (rw =3D=3D WRITE) md_write_end(mddev); + + mempool_free(ctx, conf->ctx_pool); if (res =3D=3D STRIPE_WAIT_RESHAPE) { md_free_cloned_bio(bi); return false; @@ -7374,6 +7380,10 @@ static void free_conf(struct r5conf *conf) bioset_exit(&conf->bio_split); kfree(conf->stripe_hashtbl); kfree(conf->pending_data); + + if (conf->ctx_pool) + mempool_destroy(conf->ctx_pool); + kfree(conf); } =20 @@ -8057,6 +8067,13 @@ static int raid5_run(struct mddev *mddev) goto abort; } =20 + conf->ctx_pool =3D mempool_create_kmalloc_pool(NR_RAID_BIOS, + sizeof(struct stripe_request_ctx)); + if (!conf->ctx_pool) { + ret =3D -ENOMEM; + goto abort; + } + if (log_init(conf, journal_dev, raid5_has_ppl(conf))) goto abort; =20 diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index eafc6e9ed6ee..6e3f07119fa4 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -690,6 +690,8 @@ struct r5conf { struct list_head pending_list; int pending_data_cnt; struct r5pending_data *next_pending_data; + + mempool_t *ctx_pool; }; =20 #if PAGE_SIZE =3D=3D DEFAULT_STRIPE_SIZE --=20 2.51.0