[v2] folio support for sync I/O in RAID

[PATCH v2 14/14] md/raid1,raid10: fall back to smaller order if sync folio alloc fails

Posted by linan666@huaweicloud.com 1 week, 5 days ago

From: Li Nan <linan122@huawei.com>

RESYNC_BLOCK_SIZE (64K) has higher allocation failure chance than 4k,
so retry with lower orders to improve allocation reliability.

An r1/10_bio may end up with rf->folio entries of different orders. Set
r1/10_bio sectors from the minimum order so that adding a folio to the IO
later never exceeds the size of the smallest folio.

Signed-off-by: Li Nan <linan122@huawei.com>
---
 drivers/md/raid1-10.c | 14 +++++++++++---
 drivers/md/raid1.c    | 13 +++++++++----
 drivers/md/raid10.c   | 28 ++++++++++++++++++++++++++--
 3 files changed, 46 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index ffbd7bd0f6e8..e966d11a81e7 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -41,12 +41,20 @@ static void rbio_pool_free(void *rbio, void *data)
 }
 
 static inline int resync_alloc_folio(struct resync_folio *rf,
-				     gfp_t gfp_flags)
+				     gfp_t gfp_flags, int *order)
 {
-	rf->folio = folio_alloc(gfp_flags, get_order(RESYNC_BLOCK_SIZE));
-	if (!rf->folio)
+	struct folio *folio;
+
+	do {
+		folio = folio_alloc(gfp_flags, *order);
+		if (folio)
+			break;
+	} while (--(*order) > 0);
+
+	if (!folio)
 		return -ENOMEM;
 
+	rf->folio = folio;
 	return 0;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2253e65c5f03..5bee846f1534 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -149,6 +149,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int need_folio;
 	int j;
 	struct resync_folio *rfs;
+	int order = get_order(RESYNC_BLOCK_SIZE);
 
 	r1_bio = r1bio_pool_alloc(gfp_flags, conf);
 	if (!r1_bio)
@@ -182,7 +183,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 		struct resync_folio *rf = &rfs[j];
 
 		if (j < need_folio) {
-			if (resync_alloc_folio(rf, gfp_flags))
+			if (resync_alloc_folio(rf, gfp_flags, &order))
 				goto out_free_folio;
 		} else {
 			memcpy(rf, &rfs[0], sizeof(*rf));
@@ -193,6 +194,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 		r1_bio->bios[j]->bi_private = rf;
 	}
 
+	r1_bio->sectors = 1 << (order + PAGE_SECTORS_SHIFT);
 	r1_bio->master_bio = NULL;
 
 	return r1_bio;
@@ -2776,7 +2778,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	int write_targets = 0, read_targets = 0;
 	sector_t sync_blocks;
 	bool still_degraded = false;
-	int good_sectors = RESYNC_SECTORS;
+	int good_sectors;
 	int min_bad = 0; /* number of sectors that are bad in all devices */
 	int idx = sector_to_idx(sector_nr);
 
@@ -2858,8 +2860,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
 	r1_bio->sector = sector_nr;
 	r1_bio->state = 0;
 	set_bit(R1BIO_IsSync, &r1_bio->state);
-	/* make sure good_sectors won't go across barrier unit boundary */
-	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
+	/*
+	 * make sure good_sectors won't go across barrier unit boundary.
+	 * r1_bio->sectors <= RESYNC_SECTORS.
+	 */
+	good_sectors = align_to_barrier_unit_end(sector_nr, r1_bio->sectors);
 
 	for (i = 0; i < conf->raid_disks * 2; i++) {
 		struct md_rdev *rdev;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 030812f908ac..72c77db9957c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -135,6 +135,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 	int j;
 	int nalloc, nalloc_rf;
 	struct resync_folio *rfs;
+	int order = get_order(RESYNC_BLOCK_SIZE);
 
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 	if (!r10_bio)
@@ -185,7 +186,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 
 		if (!j || test_bit(MD_RECOVERY_SYNC,
 				   &conf->mddev->recovery)) {
-			if (resync_alloc_folio(rf, gfp_flags))
+			if (resync_alloc_folio(rf, gfp_flags, &order))
 				goto out_free_folio;
 		} else {
 			memcpy(rf, &rfs[0], sizeof(*rf));
@@ -200,6 +201,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 		}
 	}
 
+	r10_bio->sectors = 1 << (order + PAGE_SECTORS_SHIFT);
 	return r10_bio;
 
 out_free_folio:
@@ -3374,6 +3376,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 						continue;
 					}
 				}
+
+				/*
+				 * RESYNC_BLOCK_SIZE folio might alloc failed in
+				 * resync_alloc_folio(). Fall back to smaller sync
+				 * size if needed.
+				 */
+				if (max_sync > r10_bio->sectors)
+					max_sync = r10_bio->sectors;
+
 				any_working = 1;
 				bio = r10_bio->devs[0].bio;
 				bio->bi_next = biolist;
@@ -3525,7 +3536,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
 		}
 		if (sync_blocks < max_sync)
 			max_sync = sync_blocks;
+
 		r10_bio = raid10_alloc_init_r10buf(conf);
+		/*
+		 * RESYNC_BLOCK_SIZE folio might alloc failed in resync_alloc_folio().
+		 * Fall back to smaller sync size if needed.
+		 */
+		if (max_sync > r10_bio->sectors)
+			max_sync = r10_bio->sectors;
+
 		r10_bio->state = 0;
 
 		r10_bio->mddev = mddev;
@@ -4702,7 +4721,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	r10_bio->mddev = mddev;
 	r10_bio->sector = sector_nr;
 	set_bit(R10BIO_IsReshape, &r10_bio->state);
-	r10_bio->sectors = last - sector_nr + 1;
+	/*
+	 * RESYNC_BLOCK_SIZE folio might alloc failed in
+	 * resync_alloc_folio(). Fall back to smaller sync
+	 * size if needed.
+	 */
+	r10_bio->sectors = min_t(int, r10_bio->sectors, last - sector_nr + 1);
 	rdev = read_balance(conf, r10_bio, &max_sectors);
 	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
 
-- 
2.39.2

Re: [PATCH v2 14/14] md/raid1,raid10: fall back to smaller order if sync folio alloc fails

Posted by Yu Kuai 5 days, 7 hours ago

Hi,

在 2026/1/28 15:57, linan666@huaweicloud.com 写道:
> From: Li Nan <linan122@huawei.com>
>
> RESYNC_BLOCK_SIZE (64K) has higher allocation failure chance than 4k,
> so retry with lower orders to improve allocation reliability.
>
> A r1/10_bio may have different rf->folio orders. Use minimum order as
> r1/10_bio sectors to prevent exceeding size when adding folio to IO later.
>
> Signed-off-by: Li Nan <linan122@huawei.com>
> ---
>   drivers/md/raid1-10.c | 14 +++++++++++---
>   drivers/md/raid1.c    | 13 +++++++++----
>   drivers/md/raid10.c   | 28 ++++++++++++++++++++++++++--
>   3 files changed, 46 insertions(+), 9 deletions(-)

It looks like this patch should be merged into patch 5; there is no need to
introduce the higher allocation failure rate there and then fix it here.

>
> diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
> index ffbd7bd0f6e8..e966d11a81e7 100644
> --- a/drivers/md/raid1-10.c
> +++ b/drivers/md/raid1-10.c
> @@ -41,12 +41,20 @@ static void rbio_pool_free(void *rbio, void *data)
>   }
>   
>   static inline int resync_alloc_folio(struct resync_folio *rf,
> -				     gfp_t gfp_flags)
> +				     gfp_t gfp_flags, int *order)
>   {
> -	rf->folio = folio_alloc(gfp_flags, get_order(RESYNC_BLOCK_SIZE));
> -	if (!rf->folio)
> +	struct folio *folio;
> +
> +	do {
> +		folio = folio_alloc(gfp_flags, *order);
> +		if (folio)
> +			break;
> +	} while (--(*order) > 0);
> +
> +	if (!folio)
>   		return -ENOMEM;
>   
> +	rf->folio = folio;
>   	return 0;
>   }
>   
> diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
> index 2253e65c5f03..5bee846f1534 100644
> --- a/drivers/md/raid1.c
> +++ b/drivers/md/raid1.c
> @@ -149,6 +149,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
>   	int need_folio;
>   	int j;
>   	struct resync_folio *rfs;
> +	int order = get_order(RESYNC_BLOCK_SIZE);
>   
>   	r1_bio = r1bio_pool_alloc(gfp_flags, conf);
>   	if (!r1_bio)
> @@ -182,7 +183,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
>   		struct resync_folio *rf = &rfs[j];
>   
>   		if (j < need_folio) {
> -			if (resync_alloc_folio(rf, gfp_flags))
> +			if (resync_alloc_folio(rf, gfp_flags, &order))
>   				goto out_free_folio;
>   		} else {
>   			memcpy(rf, &rfs[0], sizeof(*rf));
> @@ -193,6 +194,7 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
>   		r1_bio->bios[j]->bi_private = rf;
>   	}
>   
> +	r1_bio->sectors = 1 << (order + PAGE_SECTORS_SHIFT);
>   	r1_bio->master_bio = NULL;
>   
>   	return r1_bio;
> @@ -2776,7 +2778,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
>   	int write_targets = 0, read_targets = 0;
>   	sector_t sync_blocks;
>   	bool still_degraded = false;
> -	int good_sectors = RESYNC_SECTORS;
> +	int good_sectors;
>   	int min_bad = 0; /* number of sectors that are bad in all devices */
>   	int idx = sector_to_idx(sector_nr);
>   
> @@ -2858,8 +2860,11 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
>   	r1_bio->sector = sector_nr;
>   	r1_bio->state = 0;
>   	set_bit(R1BIO_IsSync, &r1_bio->state);
> -	/* make sure good_sectors won't go across barrier unit boundary */
> -	good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
> +	/*
> +	 * make sure good_sectors won't go across barrier unit boundary.
> +	 * r1_bio->sectors <= RESYNC_SECTORS.
> +	 */
> +	good_sectors = align_to_barrier_unit_end(sector_nr, r1_bio->sectors);
>   
>   	for (i = 0; i < conf->raid_disks * 2; i++) {
>   		struct md_rdev *rdev;
> diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
> index 030812f908ac..72c77db9957c 100644
> --- a/drivers/md/raid10.c
> +++ b/drivers/md/raid10.c
> @@ -135,6 +135,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>   	int j;
>   	int nalloc, nalloc_rf;
>   	struct resync_folio *rfs;
> +	int order = get_order(RESYNC_BLOCK_SIZE);
>   
>   	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
>   	if (!r10_bio)
> @@ -185,7 +186,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>   
>   		if (!j || test_bit(MD_RECOVERY_SYNC,
>   				   &conf->mddev->recovery)) {
> -			if (resync_alloc_folio(rf, gfp_flags))
> +			if (resync_alloc_folio(rf, gfp_flags, &order))
>   				goto out_free_folio;
>   		} else {
>   			memcpy(rf, &rfs[0], sizeof(*rf));
> @@ -200,6 +201,7 @@ static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
>   		}
>   	}
>   
> +	r10_bio->sectors = 1 << (order + PAGE_SECTORS_SHIFT);
>   	return r10_bio;
>   
>   out_free_folio:
> @@ -3374,6 +3376,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>   						continue;
>   					}
>   				}
> +
> +				/*
> +				 * RESYNC_BLOCK_SIZE folio might alloc failed in
> +				 * resync_alloc_folio(). Fall back to smaller sync
> +				 * size if needed.
> +				 */
> +				if (max_sync > r10_bio->sectors)
> +					max_sync = r10_bio->sectors;
> +
>   				any_working = 1;
>   				bio = r10_bio->devs[0].bio;
>   				bio->bi_next = biolist;
> @@ -3525,7 +3536,15 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
>   		}
>   		if (sync_blocks < max_sync)
>   			max_sync = sync_blocks;
> +
>   		r10_bio = raid10_alloc_init_r10buf(conf);
> +		/*
> +		 * RESYNC_BLOCK_SIZE folio might alloc failed in resync_alloc_folio().
> +		 * Fall back to smaller sync size if needed.
> +		 */
> +		if (max_sync > r10_bio->sectors)
> +			max_sync = r10_bio->sectors;
> +
>   		r10_bio->state = 0;
>   
>   		r10_bio->mddev = mddev;
> @@ -4702,7 +4721,12 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
>   	r10_bio->mddev = mddev;
>   	r10_bio->sector = sector_nr;
>   	set_bit(R10BIO_IsReshape, &r10_bio->state);
> -	r10_bio->sectors = last - sector_nr + 1;
> +	/*
> +	 * RESYNC_BLOCK_SIZE folio might alloc failed in
> +	 * resync_alloc_folio(). Fall back to smaller sync
> +	 * size if needed.
> +	 */
> +	r10_bio->sectors = min_t(int, r10_bio->sectors, last - sector_nr + 1);
>   	rdev = read_balance(conf, r10_bio, &max_sectors);
>   	BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
>   

-- 
Thanks,
Kuai

Re: [PATCH v2 14/14] md/raid1,raid10: fall back to smaller order if sync folio alloc fails

Posted by Li Nan 4 days, 17 hours ago


在 2026/2/5 0:48, Yu Kuai 写道:
> Hi,
> 
> 在 2026/1/28 15:57, linan666@huaweicloud.com 写道:
>> From: Li Nan <linan122@huawei.com>
>>
>> RESYNC_BLOCK_SIZE (64K) has higher allocation failure chance than 4k,
>> so retry with lower orders to improve allocation reliability.
>>
>> A r1/10_bio may have different rf->folio orders. Use minimum order as
>> r1/10_bio sectors to prevent exceeding size when adding folio to IO later.
>>
>> Signed-off-by: Li Nan <linan122@huawei.com>
>> ---
>>    drivers/md/raid1-10.c | 14 +++++++++++---
>>    drivers/md/raid1.c    | 13 +++++++++----
>>    drivers/md/raid10.c   | 28 ++++++++++++++++++++++++++--
>>    3 files changed, 46 insertions(+), 9 deletions(-)
> 
> Looks like this patch should be merged into patch 5, there is no need to introduce
> that higher allocation failure and then fix it here.
> 

OK, I will merge the above patches.

-- 
Thanks,
Nan