Add support to submit a bio per-path. In addition, for failover, add
support to requeue a failed bio.
NVMe has almost like-for-like equivalents here:
- nvme_available_path() -> mpath_available_path()
- nvme_requeue_work() -> mpath_requeue_work()
- nvme_ns_head_submit_bio() -> mpath_bdev_submit_bio()
For failover, a driver may want to re-submit a bio, so add support to
clone a bio prior to submission.
A bio which is submitted to a per-path device has flag REQ_MPATH set,
same as what is done for NVMe with REQ_NVME_MPATH.
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
include/linux/multipath.h | 15 +++++++
lib/multipath.c | 92 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 106 insertions(+), 1 deletion(-)
diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index c964a1aba9c42..d557fb9bab4c9 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -3,6 +3,7 @@
#define _LIBMULTIPATH_H
#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
#include <linux/srcu.h>
extern const struct block_device_operations mpath_ops;
@@ -40,10 +41,12 @@ struct mpath_device {
};
struct mpath_head_template {
+ bool (*available_path)(struct mpath_device *, bool *);
bool (*is_disabled)(struct mpath_device *);
bool (*is_optimized)(struct mpath_device *);
enum mpath_access_state (*get_access_state)(struct mpath_device *);
enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
+ struct bio *(*clone_bio)(struct bio *);
const struct attribute_group **device_groups;
};
@@ -56,12 +59,23 @@ struct mpath_head {
struct kref ref;
+ struct bio_list requeue_list; /* list for requeuing bio */
+ spinlock_t requeue_lock;
+ struct work_struct requeue_work; /* work struct for requeue */
+
unsigned long flags;
struct mpath_device __rcu *current_path[MAX_NUMNODES];
const struct mpath_head_template *mpdt;
void *drvdata;
};
+#define REQ_MPATH REQ_DRV
+
+static inline bool is_mpath_request(struct request *req)
+{
+ return req->cmd_flags & REQ_MPATH;
+}
+
static inline struct mpath_disk *mpath_bd_device_to_disk(struct device *dev)
{
return dev_get_drvdata(dev);
@@ -82,6 +96,7 @@ int mpath_set_iopolicy(const char *val, int *iopolicy);
int mpath_get_iopolicy(char *buf, int iopolicy);
int mpath_get_head(struct mpath_head *mpath_head);
void mpath_put_head(struct mpath_head *mpath_head);
+void mpath_requeue_work(struct work_struct *work);
struct mpath_head *mpath_alloc_head(void);
void mpath_put_disk(struct mpath_disk *mpath_disk);
void mpath_remove_disk(struct mpath_disk *mpath_disk);
diff --git a/lib/multipath.c b/lib/multipath.c
index 65a0d2d2bf524..b494b35e8dccc 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -5,6 +5,7 @@
*/
#include <linux/module.h>
#include <linux/multipath.h>
+#include <trace/events/block.h>
static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
@@ -227,7 +228,6 @@ static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head,
return mpath_device;
}
-__maybe_unused
static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
{
enum mpath_iopolicy_e iopolicy =
@@ -243,6 +243,66 @@ static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
}
}
+static bool mpath_available_path(struct mpath_head *mpath_head)
+{
+ struct mpath_device *mpath_device;
+
+ if (!test_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags))
+ return false;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+ bool available = false;
+
+ if (!mpath_head->mpdt->available_path(mpath_device,
+ &available))
+ continue;
+ if (available)
+ return true;
+ }
+
+ return false;
+}
+
+static void mpath_bdev_submit_bio(struct bio *bio)
+{
+ struct mpath_disk *mpath_disk = bio->bi_bdev->bd_disk->private_data;
+ struct mpath_head *mpath_head = mpath_disk->mpath_head;
+ struct device *dev = mpath_disk->parent;
+ struct mpath_device *mpath_device;
+ int srcu_idx;
+
+ bio = bio_split_to_limits(bio);
+ if (!bio)
+ return;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ mpath_device = mpath_find_path(mpath_head);
+
+ if (likely(mpath_device)) {
+ bio->bi_opf |= REQ_MPATH;
+ if (mpath_head->mpdt->clone_bio)
+ bio = mpath_head->mpdt->clone_bio(bio);
+ trace_block_bio_remap(bio, disk_devt(mpath_device->disk),
+ bio->bi_iter.bi_sector);
+ bio_set_dev(bio, mpath_device->disk->part0);
+
+ submit_bio_noacct(bio);
+ } else if (mpath_available_path(mpath_head)) {
+ dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");
+
+ spin_lock_irq(&mpath_head->requeue_lock);
+ bio_list_add(&mpath_head->requeue_list, bio);
+ spin_unlock_irq(&mpath_head->requeue_lock);
+ } else {
+ dev_warn_ratelimited(dev, "no available path - failing I/O\n");
+
+ bio_io_error(bio);
+ }
+
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+}
+
static void mpath_free_head(struct kref *ref)
{
struct mpath_head *mpath_head =
@@ -310,6 +370,7 @@ const struct block_device_operations mpath_ops = {
.owner = THIS_MODULE,
.open = mpath_bdev_open,
.release = mpath_bdev_release,
+ .submit_bio = mpath_bdev_submit_bio,
};
EXPORT_SYMBOL_GPL(mpath_ops);
@@ -327,6 +388,24 @@ static void multipath_partition_scan_work(struct work_struct *work)
mutex_unlock(&mpath_disk->disk->open_mutex);
}
+void mpath_requeue_work(struct work_struct *work)
+{
+ struct mpath_head *mpath_head =
+ container_of(work, struct mpath_head, requeue_work);
+ struct bio *bio, *next;
+
+ spin_lock_irq(&mpath_head->requeue_lock);
+ next = bio_list_get(&mpath_head->requeue_list);
+ spin_unlock_irq(&mpath_head->requeue_lock);
+
+ while ((bio = next) != NULL) {
+ next = bio->bi_next;
+ bio->bi_next = NULL;
+ submit_bio_noacct(bio);
+ }
+}
+EXPORT_SYMBOL_GPL(mpath_requeue_work);
+
void mpath_remove_disk(struct mpath_disk *mpath_disk)
{
struct mpath_head *mpath_head = mpath_disk->mpath_head;
@@ -334,6 +413,12 @@ void mpath_remove_disk(struct mpath_disk *mpath_disk)
if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) {
struct gendisk *disk = mpath_disk->disk;
+ /*
+ * requeue I/O after MPATH_HEAD_DISK_LIVE has been cleared
+ * to allow multipath to fail all I/O.
+ */
+ kblockd_schedule_work(&mpath_head->requeue_work);
+
mpath_synchronize(mpath_head);
del_gendisk(disk);
}
@@ -409,6 +494,7 @@ void mpath_device_set_live(struct mpath_disk *mpath_disk,
mutex_unlock(&mpath_head->lock);
mpath_synchronize(mpath_head);
+ kblockd_schedule_work(&mpath_head->requeue_work);
}
EXPORT_SYMBOL_GPL(mpath_device_set_live);
@@ -424,6 +510,10 @@ struct mpath_head *mpath_alloc_head(void)
mutex_init(&mpath_head->lock);
kref_init(&mpath_head->ref);
+ INIT_WORK(&mpath_head->requeue_work, mpath_requeue_work);
+ spin_lock_init(&mpath_head->requeue_lock);
+ bio_list_init(&mpath_head->requeue_list);
+
ret = init_srcu_struct(&mpath_head->srcu);
if (ret) {
kfree(mpath_head);
--
2.43.5
On 2/25/26 9:02 PM, John Garry wrote:
> Add support to submit a bio per-path. In addition, for failover, add
> support to requeue a failed bio.
>
> NVMe has almost like-for-like equivalents here:
> - nvme_available_path() -> mpath_available_path()
> - nvme_requeue_work() -> mpath_requeue_work()
> - nvme_ns_head_submit_bio() -> mpath_bdev_submit_bio()
>
> For failover, a driver may want to re-submit a bio, so add support to
> clone a bio prior to submission.
>
> A bio which is submitted to a per-path device has flag REQ_MPATH set,
> same as what is done for NVMe with REQ_NVME_MPATH.
>
> Signed-off-by: John Garry<john.g.garry@oracle.com>
> ---
> include/linux/multipath.h | 15 +++++++
> lib/multipath.c | 92 ++++++++++++++++++++++++++++++++++++++-
> 2 files changed, 106 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/multipath.h b/include/linux/multipath.h
> index c964a1aba9c42..d557fb9bab4c9 100644
> --- a/include/linux/multipath.h
> +++ b/include/linux/multipath.h
> @@ -3,6 +3,7 @@
> #define _LIBMULTIPATH_H
>
> #include <linux/blkdev.h>
> +#include <linux/blk-mq.h>
> #include <linux/srcu.h>
>
> extern const struct block_device_operations mpath_ops;
> @@ -40,10 +41,12 @@ struct mpath_device {
> };
>
> struct mpath_head_template {
> + bool (*available_path)(struct mpath_device *, bool *);
> bool (*is_disabled)(struct mpath_device *);
> bool (*is_optimized)(struct mpath_device *);
> enum mpath_access_state (*get_access_state)(struct mpath_device *);
> enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
> + struct bio *(*clone_bio)(struct bio *);
> const struct attribute_group **device_groups;
> };
>
> @@ -56,12 +59,23 @@ struct mpath_head {
>
> struct kref ref;
>
> + struct bio_list requeue_list; /* list for requeing bio */
> + spinlock_t requeue_lock;
> + struct work_struct requeue_work; /* work struct for requeue */
> +
> unsigned long flags;
> struct mpath_device __rcu *current_path[MAX_NUMNODES];
> const struct mpath_head_template *mpdt;
> void *drvdata;
> };
>
> +#define REQ_MPATH REQ_DRV
> +
> +static inline bool is_mpath_request(struct request *req)
> +{
> + return req->cmd_flags & REQ_MPATH;
> +}
> +
> static inline struct mpath_disk *mpath_bd_device_to_disk(struct device *dev)
> {
> return dev_get_drvdata(dev);
> @@ -82,6 +96,7 @@ int mpath_set_iopolicy(const char *val, int *iopolicy);
> int mpath_get_iopolicy(char *buf, int iopolicy);
> int mpath_get_head(struct mpath_head *mpath_head);
> void mpath_put_head(struct mpath_head *mpath_head);
> +void mpath_requeue_work(struct work_struct *work);
> struct mpath_head *mpath_alloc_head(void);
> void mpath_put_disk(struct mpath_disk *mpath_disk);
> void mpath_remove_disk(struct mpath_disk *mpath_disk);
> diff --git a/lib/multipath.c b/lib/multipath.c
> index 65a0d2d2bf524..b494b35e8dccc 100644
> --- a/lib/multipath.c
> +++ b/lib/multipath.c
> @@ -5,6 +5,7 @@
> */
> #include <linux/module.h>
> #include <linux/multipath.h>
> +#include <trace/events/block.h>
>
> static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
>
> @@ -227,7 +228,6 @@ static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head,
> return mpath_device;
> }
>
> -__maybe_unused
> static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
> {
> enum mpath_iopolicy_e iopolicy =
> @@ -243,6 +243,66 @@ static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
> }
> }
>
> +static bool mpath_available_path(struct mpath_head *mpath_head)
> +{
> + struct mpath_device *mpath_device;
> +
> + if (!test_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags))
> + return false;
> +
> + list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
> + srcu_read_lock_held(&mpath_head->srcu)) {
> + bool available = false;
> +
> + if (!mpath_head->mpdt->available_path(mpath_device,
> + &available))
> + continue;
> + if (available)
> + return true;
> + }
> +
> + return false;
> +}
IMO, we may further simplify the callback ->available_path() to return
true or false instead of passing the result in a separate @available
argument.
Thanks,
--Nilay
On 02/03/2026 12:39, Nilay Shroff wrote:
>> static struct mpath_device *mpath_find_path(struct mpath_head
>> *mpath_head)
>> {
>> enum mpath_iopolicy_e iopolicy =
>> @@ -243,6 +243,66 @@ static struct mpath_device
>> *mpath_find_path(struct mpath_head *mpath_head)
>> }
>> }
>> +static bool mpath_available_path(struct mpath_head *mpath_head)
>> +{
>> + struct mpath_device *mpath_device;
>> +
>> + if (!test_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags))
>> + return false;
>> +
>> + list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list,
>> siblings,
>> + srcu_read_lock_held(&mpath_head->srcu)) {
>> + bool available = false;
>> +
>> + if (!mpath_head->mpdt->available_path(mpath_device,
>> + &available))
>> + continue;
>> + if (available)
>> + return true;
>> + }
>> +
>> + return false;
>> +}
>
> IMO, we may further simplify the callback ->available_path() to return
> true or false instead of passing the result in a separate @available
> argument.
I have to admit that I am not keen on this abstraction at all, as it is
purely generated to fit the current code.
Anyway, from checking mainline nvme_available_path(), we skip checking
the ctrl state if the ctrl failfast flag is set (which means
mpath_head->mpdt->available_path returns false). But I suppose the
callback could check both the ctrl flags and state (and just return a
single boolean), like:
if (failfast flag set)
return false;
if (ctrl live, resetting, connecting)
return true;
return false;
Thanks,
John
On 3/2/26 9:22 PM, John Garry wrote:
> On 02/03/2026 12:39, Nilay Shroff wrote:
>>> static struct mpath_device *mpath_find_path(struct mpath_head
>>> *mpath_head)
>>> {
>>> enum mpath_iopolicy_e iopolicy =
>>> @@ -243,6 +243,66 @@ static struct mpath_device
>>> *mpath_find_path(struct mpath_head *mpath_head)
>>> }
>>> }
>>> +static bool mpath_available_path(struct mpath_head *mpath_head)
>>> +{
>>> + struct mpath_device *mpath_device;
>>> +
>>> + if (!test_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags))
>>> + return false;
>>> +
>>> + list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list,
>>> siblings,
>>> + srcu_read_lock_held(&mpath_head->srcu)) {
>>> + bool available = false;
>>> +
>>> + if (!mpath_head->mpdt->available_path(mpath_device,
>>> + &available))
>>> + continue;
>>> + if (available)
>>> + return true;
>>> + }
>>> +
>>> + return false;
>>> +}
>>
>> IMO, we may further simplify the callback ->available_path() to return
>> true or false instead of passing the result in a separate @available
>> argument.
>
> I have to admit that I am not keen on this abstraction at all, as it is
> purely generated to fit the current code.
>
> Anyway, from checking mainline nvme_available_path(), we skip checking
> the ctrl state if the ctrl failfast flag is set (which means mpath_head-
> >mpdt->available_path returns false). But I suppose the callback could
> check both the ctrl flags and state (and just return a single boolean),
> like:
>
> if (failfast flag set)
> return false;
> if (ctrl live, resetting, connecting)
> return true;
> return false;
>
Yes, I think that now that the ->dev_list (or ns sibling) iterator is handled
within the libmultipath code, the above logic makes sense. We should plan to
simplify nvme_available_path() accordingly, as per the above pseudo code.
Thanks,
--Nilay
© 2016 - 2026 Red Hat, Inc.