For failover handling, we must resubmit each bio.
However, unlike NVMe, for SCSI there is no guarantee that any bio submitted
is either all or none completed.
As such, for SCSI, for failover handling we will take the approach to
just re-submit the original bio. For this clone and submit each bio.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
drivers/scsi/scsi_multipath.c | 51 ++++++++++++++++++++++++++++++++++-
include/scsi/scsi_multipath.h | 1 +
2 files changed, 51 insertions(+), 1 deletion(-)
diff --git a/drivers/scsi/scsi_multipath.c b/drivers/scsi/scsi_multipath.c
index 4b7984e7e74ba..d79a92ec0cf6c 100644
--- a/drivers/scsi/scsi_multipath.c
+++ b/drivers/scsi/scsi_multipath.c
@@ -89,6 +89,14 @@ module_param_call(iopolicy, scsi_set_iopolicy, scsi_get_iopolicy,
MODULE_PARM_DESC(iopolicy,
"Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
+struct scsi_mpath_clone_bio {
+ struct bio *master_bio;
+ struct bio clone;
+};
+
+#define scsi_mpath_to_master_bio(clone) \
+ container_of(clone, struct scsi_mpath_clone_bio, clone)
+
static int scsi_mpath_unique_lun_id(struct scsi_device *sdev)
{
struct scsi_mpath_device *scsi_mpath_dev = sdev->scsi_mpath_dev;
@@ -116,6 +124,7 @@ static void scsi_mpath_head_release(struct device *dev)
struct mpath_head *mpath_head = scsi_mpath_head->mpath_head;
scsi_mpath_delete_head(scsi_mpath_head);
+ bioset_exit(&scsi_mpath_head->bio_pool);
ida_free(&scsi_multipath_dev_ida, scsi_mpath_head->index);
mpath_put_head(mpath_head);
kfree(scsi_mpath_head);
@@ -260,6 +269,39 @@ static int scsi_multipath_sdev_init(struct scsi_device *sdev)
return 0;
}
+static void scsi_mpath_clone_end_io(struct bio *clone)
+{
+ struct scsi_mpath_clone_bio *scsi_mpath_clone_bio =
+ scsi_mpath_to_master_bio(clone);
+ struct bio *master_bio = scsi_mpath_clone_bio->master_bio;
+
+ master_bio->bi_status = clone->bi_status;
+ bio_put(clone);
+ bio_endio(master_bio);
+}
+
+static struct bio *scsi_mpath_clone_bio(struct bio *bio)
+{
+ struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct scsi_mpath_clone_bio *scsi_mpath_clone_bio;
+ struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
+ struct bio *clone;
+
+ clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT,
+ &scsi_mpath_head->bio_pool);
+ if (!clone)
+ return NULL;
+
+ clone->bi_end_io = scsi_mpath_clone_end_io;
+
+ scsi_mpath_clone_bio = container_of(clone,
+ struct scsi_mpath_clone_bio, clone);
+ scsi_mpath_clone_bio->master_bio = bio;
+
+ return clone;
+}
+
static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_head)
{
struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
@@ -269,6 +311,7 @@ static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_he
struct mpath_head_template smpdt_pr = {
.get_iopolicy = scsi_mpath_get_iopolicy,
+ .clone_bio = scsi_mpath_clone_bio,
};
static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
@@ -283,9 +326,13 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
ida_init(&scsi_mpath_head->ida);
mutex_init(&scsi_mpath_head->lock);
+ if (bioset_init(&scsi_mpath_head->bio_pool, SCSI_MAX_QUEUE_DEPTH,
+ offsetof(struct scsi_mpath_clone_bio, clone),
+ BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE))
+ goto out_free;
scsi_mpath_head->mpath_head = mpath_alloc_head();
if (IS_ERR(scsi_mpath_head->mpath_head))
- goto out_free;
+ goto out_bioset_exit;
scsi_mpath_head->mpath_head->mpdt = &smpdt_pr;
scsi_mpath_head->mpath_head->drvdata = scsi_mpath_head;
@@ -307,6 +354,8 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
ida_free(&scsi_multipath_dev_ida, scsi_mpath_head->index);
out_put_head:
mpath_put_head(scsi_mpath_head->mpath_head);
+out_bioset_exit:
+ bioset_exit(&scsi_mpath_head->bio_pool);
out_free:
kfree(scsi_mpath_head);
return NULL;
diff --git a/include/scsi/scsi_multipath.h b/include/scsi/scsi_multipath.h
index 8dbe1c3784d2c..bd99ea017379d 100644
--- a/include/scsi/scsi_multipath.h
+++ b/include/scsi/scsi_multipath.h
@@ -26,6 +26,7 @@ struct scsi_mpath_head {
struct ida ida;
struct mutex lock;
struct mpath_iopolicy iopolicy;
+ struct bio_set bio_pool;
struct mpath_head *mpath_head;
struct device dev;
int index;
--
2.43.5
On Wed, Feb 25, 2026 at 03:36:10PM +0000, John Garry wrote:
> For failover handling, we must resubmit each bio.
>
> However, unlike NVMe, for SCSI there is no guarantee that any bio submitted
> is either all or none completed.
>
> As such, for SCSI, for failover handling we will take the approach to
> just re-submit the original bio. For this clone and submit each bio.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> drivers/scsi/scsi_multipath.c | 51 ++++++++++++++++++++++++++++++++++-
> include/scsi/scsi_multipath.h | 1 +
> 2 files changed, 51 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/scsi/scsi_multipath.c b/drivers/scsi/scsi_multipath.c
> index 4b7984e7e74ba..d79a92ec0cf6c 100644
> --- a/drivers/scsi/scsi_multipath.c
> +++ b/drivers/scsi/scsi_multipath.c
> @@ -89,6 +89,14 @@ module_param_call(iopolicy, scsi_set_iopolicy, scsi_get_iopolicy,
> MODULE_PARM_DESC(iopolicy,
> "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
>
> +struct scsi_mpath_clone_bio {
> + struct bio *master_bio;
> + struct bio clone;
> +};
If the only extra information you need for your clone bios is a pointer
to the original bio, I think you can just store that in bi_private. So
you shouldn't actually need to allocate any front pad for your bioset.
> +
> +#define scsi_mpath_to_master_bio(clone) \
> + container_of(clone, struct scsi_mpath_clone_bio, clone)
> +
> static int scsi_mpath_unique_lun_id(struct scsi_device *sdev)
> {
> struct scsi_mpath_device *scsi_mpath_dev = sdev->scsi_mpath_dev;
> @@ -260,6 +269,39 @@ static int scsi_multipath_sdev_init(struct scsi_device *sdev)
> return 0;
> }
>
> +static void scsi_mpath_clone_end_io(struct bio *clone)
> +{
> + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio =
> + scsi_mpath_to_master_bio(clone);
> + struct bio *master_bio = scsi_mpath_clone_bio->master_bio;
> +
> + master_bio->bi_status = clone->bi_status;
> + bio_put(clone);
> + bio_endio(master_bio);
> +}
> +
> +static struct bio *scsi_mpath_clone_bio(struct bio *bio)
> +{
> + struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
> + struct mpath_head *mpath_head = mpath_disk->mpath_head;
> + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio;
> + struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
> + struct bio *clone;
> +
> + clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT,
> + &scsi_mpath_head->bio_pool);
Why use GFP_NOWAIT? It's more likely to fail than GFP_NOIO. If the bio
has REQ_NOWAIT set, I can see where you would need this, but otherwise,
I don't see why GFP_NOIO wouldn't be better here.
> + if (!clone)
> + return NULL;
> +
> + clone->bi_end_io = scsi_mpath_clone_end_io;
> +
> + scsi_mpath_clone_bio = container_of(clone,
> + struct scsi_mpath_clone_bio, clone);
> + scsi_mpath_clone_bio->master_bio = bio;
> +
> + return clone;
> +}
> +
> static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_head)
> {
> struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
> @@ -269,6 +311,7 @@ static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_he
>
> struct mpath_head_template smpdt_pr = {
> .get_iopolicy = scsi_mpath_get_iopolicy,
> + .clone_bio = scsi_mpath_clone_bio,
> };
>
> static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
> @@ -283,9 +326,13 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
> ida_init(&scsi_mpath_head->ida);
> mutex_init(&scsi_mpath_head->lock);
>
> + if (bioset_init(&scsi_mpath_head->bio_pool, SCSI_MAX_QUEUE_DEPTH,
> + offsetof(struct scsi_mpath_clone_bio, clone),
> + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE))
You don't need 4096 cached bios to guarantee forward progress. I don't
see why BIO_POOL_SIZE won't work fine here. Also, since you are cloning
bios, they are sharing the original bio's iovecs, so you don't need
BIOSET_NEED_BVECS.
-Ben
On 02/03/2026 03:21, Benjamin Marzinski wrote:
> On Wed, Feb 25, 2026 at 03:36:10PM +0000, John Garry wrote:
>> For failover handling, we must resubmit each bio.
>>
>> However, unlike NVMe, for SCSI there is no guarantee that any bio submitted
>> is either all or none completed.
>>
>> As such, for SCSI, for failover handling we will take the approach to
>> just re-submit the original bio. For this clone and submit each bio.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>> drivers/scsi/scsi_multipath.c | 51 ++++++++++++++++++++++++++++++++++-
>> include/scsi/scsi_multipath.h | 1 +
>> 2 files changed, 51 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/scsi/scsi_multipath.c b/drivers/scsi/scsi_multipath.c
>> index 4b7984e7e74ba..d79a92ec0cf6c 100644
>> --- a/drivers/scsi/scsi_multipath.c
>> +++ b/drivers/scsi/scsi_multipath.c
>> @@ -89,6 +89,14 @@ module_param_call(iopolicy, scsi_set_iopolicy, scsi_get_iopolicy,
>> MODULE_PARM_DESC(iopolicy,
>> "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
>>
>> +struct scsi_mpath_clone_bio {
>> + struct bio *master_bio;
>> + struct bio clone;
>> +};
>
> If the only extra information you need for your clone bios is a pointer
> to the original bio, I think you can just store that in bi_private. So
> you shouldn't actually need to allocate any front pad for your bioset.
Yes, seems a decent idea
>
>> +
>> +#define scsi_mpath_to_master_bio(clone) \
>> + container_of(clone, struct scsi_mpath_clone_bio, clone)
>> +
>> static int scsi_mpath_unique_lun_id(struct scsi_device *sdev)
>> {
>> struct scsi_mpath_device *scsi_mpath_dev = sdev->scsi_mpath_dev;
>
>> @@ -260,6 +269,39 @@ static int scsi_multipath_sdev_init(struct scsi_device *sdev)
>> return 0;
>> }
>>
>> +static void scsi_mpath_clone_end_io(struct bio *clone)
>> +{
>> + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio =
>> + scsi_mpath_to_master_bio(clone);
>> + struct bio *master_bio = scsi_mpath_clone_bio->master_bio;
>> +
>> + master_bio->bi_status = clone->bi_status;
>> + bio_put(clone);
>> + bio_endio(master_bio);
>> +}
>> +
>> +static struct bio *scsi_mpath_clone_bio(struct bio *bio)
>> +{
>> + struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
>> + struct mpath_head *mpath_head = mpath_disk->mpath_head;
>> + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio;
>> + struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
>> + struct bio *clone;
>> +
>> + clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT,
>> + &scsi_mpath_head->bio_pool);
>
> Why use GFP_NOWAIT? It's more likely to fail than GFP_NOIO. If the bio
> has REQ_NOWAIT set, I can see where you would need this, but otherwise,
> I don't see why GFP_NOIO wouldn't be better here.
Seems reasonable to try GFP_NOIO. Furthermore, we really can't tolerate
the clone failing. So, if it does, we should return an error pointer
here and mpath_bdev_submit_bio() should error the original bio.
>
>> + if (!clone)
>> + return NULL;
>> +
>> + clone->bi_end_io = scsi_mpath_clone_end_io;
>> +
>> + scsi_mpath_clone_bio = container_of(clone,
>> + struct scsi_mpath_clone_bio, clone);
>> + scsi_mpath_clone_bio->master_bio = bio;
>> +
>> + return clone;
>> +}
>> +
>> static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_head)
>> {
>> struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
>> @@ -269,6 +311,7 @@ static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_he
>>
>> struct mpath_head_template smpdt_pr = {
>> .get_iopolicy = scsi_mpath_get_iopolicy,
>> + .clone_bio = scsi_mpath_clone_bio,
>> };
>>
>> static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
>> @@ -283,9 +326,13 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
>> ida_init(&scsi_mpath_head->ida);
>> mutex_init(&scsi_mpath_head->lock);
>>
>> + if (bioset_init(&scsi_mpath_head->bio_pool, SCSI_MAX_QUEUE_DEPTH,
>> + offsetof(struct scsi_mpath_clone_bio, clone),
>> + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE))
>
> You don't need 4096 cached bios to guarantee forward progress. I don't
> see why BIO_POOL_SIZE won't work fine here.
Every bio which we are sent is cloned. And SCSI_MAX_QUEUE_DEPTH is used
as the cached bio size - wouldn't it make sense to cache more than 2 bios?
> Also, since you are cloning
> bios, they are sharing the original bio's iovecs, so you don't need
> BIOSET_NEED_BVECS.
>
ok
thanks!
On Mon, Mar 02, 2026 at 12:12:54PM +0000, John Garry wrote:
> On 02/03/2026 03:21, Benjamin Marzinski wrote:
> > On Wed, Feb 25, 2026 at 03:36:10PM +0000, John Garry wrote:
> > > For failover handling, we must resubmit each bio.
> > >
> > > However, unlike NVMe, for SCSI there is no guarantee that any bio submitted
> > > is either all or none completed.
> > >
> > > As such, for SCSI, for failover handling we will take the approach to
> > > just re-submit the original bio. For this clone and submit each bio.
> > >
> > > Signed-off-by: John Garry <john.g.garry@oracle.com>
> > > ---
> > > drivers/scsi/scsi_multipath.c | 51 ++++++++++++++++++++++++++++++++++-
> > > include/scsi/scsi_multipath.h | 1 +
> > > 2 files changed, 51 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/scsi/scsi_multipath.c b/drivers/scsi/scsi_multipath.c
> > > index 4b7984e7e74ba..d79a92ec0cf6c 100644
> > > --- a/drivers/scsi/scsi_multipath.c
> > > +++ b/drivers/scsi/scsi_multipath.c
> > > @@ -89,6 +89,14 @@ module_param_call(iopolicy, scsi_set_iopolicy, scsi_get_iopolicy,
> > > MODULE_PARM_DESC(iopolicy,
> > > "Default multipath I/O policy; 'numa' (default), 'round-robin' or 'queue-depth'");
> > > +struct scsi_mpath_clone_bio {
> > > + struct bio *master_bio;
> > > + struct bio clone;
> > > +};
> >
> > If the only extra information you need for your clone bios is a pointer
> > to the original bio, I think you can just store that in bi_private. So
> > you shouldn't actually need to allocate any front pad for your bioset.
>
> Yes, seems a decent idea
>
> >
> > > +
> > > +#define scsi_mpath_to_master_bio(clone) \
> > > + container_of(clone, struct scsi_mpath_clone_bio, clone)
> > > +
> > > static int scsi_mpath_unique_lun_id(struct scsi_device *sdev)
> > > {
> > > struct scsi_mpath_device *scsi_mpath_dev = sdev->scsi_mpath_dev;
> >
> > > @@ -260,6 +269,39 @@ static int scsi_multipath_sdev_init(struct scsi_device *sdev)
> > > return 0;
> > > }
> > > +static void scsi_mpath_clone_end_io(struct bio *clone)
> > > +{
> > > + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio =
> > > + scsi_mpath_to_master_bio(clone);
> > > + struct bio *master_bio = scsi_mpath_clone_bio->master_bio;
> > > +
> > > + master_bio->bi_status = clone->bi_status;
> > > + bio_put(clone);
> > > + bio_endio(master_bio);
> > > +}
> > > +
> > > +static struct bio *scsi_mpath_clone_bio(struct bio *bio)
> > > +{
> > > + struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
> > > + struct mpath_head *mpath_head = mpath_disk->mpath_head;
> > > + struct scsi_mpath_clone_bio *scsi_mpath_clone_bio;
> > > + struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
> > > + struct bio *clone;
> > > +
> > > + clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOWAIT,
> > > + &scsi_mpath_head->bio_pool);
> >
> > Why use GFP_NOWAIT? It's more likely to fail than GFP_NOIO. If the bio
> > has REQ_NOWAIT set, I can see where you would need this, but otherwise,
> > I don't see why GFP_NOIO wouldn't be better here.
>
> Seems reasonable to try GFP_NOIO. Furthermore, we really can't tolerate the
> clone to fail. So, if it does, we should return an error pointer here and
> mpath_bdev_submit_bio() should error the original bio.
>
> >
> > > + if (!clone)
> > > + return NULL;
> > > +
> > > + clone->bi_end_io = scsi_mpath_clone_end_io;
> > > +
> > > + scsi_mpath_clone_bio = container_of(clone,
> > > + struct scsi_mpath_clone_bio, clone);
> > > + scsi_mpath_clone_bio->master_bio = bio;
> > > +
> > > + return clone;
> > > +}
> > > +
> > > static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_head)
> > > {
> > > struct scsi_mpath_head *scsi_mpath_head = mpath_head->drvdata;
> > > @@ -269,6 +311,7 @@ static enum mpath_iopolicy_e scsi_mpath_get_iopolicy(struct mpath_head *mpath_he
> > > struct mpath_head_template smpdt_pr = {
> > > .get_iopolicy = scsi_mpath_get_iopolicy,
> > > + .clone_bio = scsi_mpath_clone_bio,
> > > };
> > > static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
> > > @@ -283,9 +326,13 @@ static struct scsi_mpath_head *scsi_mpath_alloc_head(void)
> > > ida_init(&scsi_mpath_head->ida);
> > > mutex_init(&scsi_mpath_head->lock);
> > > + if (bioset_init(&scsi_mpath_head->bio_pool, SCSI_MAX_QUEUE_DEPTH,
> > > + offsetof(struct scsi_mpath_clone_bio, clone),
> > > + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE))
> >
> > You don't need 4096 cached bios to guarantee forward progress. I don't
> > see why BIO_POOL_SIZE won't work fine here.
>
> Every bio which we are sent is cloned. And SCSI_MAX_QUEUE_DEPTH is used as
> the cached bio size - wouldn't it make sense to cache more than 2 bios?
IIRC, the reserved pool is there to guarantee forward progress under
memory pressure, so that if the system is short on memory, and it needs
to write out data to this multipath device in order to free up memory,
there will be enough resources to do that.
Under normal conditions, your new bios should be getting pulled from the
per-cpu cache anyways, since you set BIOSET_PERCPU_CACHE. That's going
to be the fastest way to get one.
-Ben
>
> > Also, since you are cloning
> > bios, they are sharing the original bio's iovecs, so you don't need
> > BIOSET_NEED_BVECS.
> >
>
> ok
>
> thanks!
On 02/03/2026 16:27, Benjamin Marzinski wrote:
>> Every bio which we are sent is cloned. And SCSI_MAX_QUEUE_DEPTH is used as
>> the cached bio size - wouldn't it make sense to cache more than 2 bios?
> IIRC, the reserved pool is there to guarantee forward progress under
> memory pressure, so that if the system is short on memory, and it needs
> to write out data to this multipath device in order to free up memory,
> it there will be enough resources to do that.
>
> Under normal conditions, your new bios should be getting pulled from the
> per-cpu cache anyways, since you set BIOSET_PERCPU_CACHE. That's going
> to be the fastest way to get one.

ok, got it

Thanks
© 2016 - 2026 Red Hat, Inc.