Add code for path selection.

NVMe ANA is abstracted into enum mpath_access_state. The motivation is
to allow SCSI ALUA to be mapped onto the same states. Callbacks
.is_disabled, .is_optimized, and .get_access_state are added to query
the path access state.

Path selection modes round-robin, NUMA, and queue-depth are added, the
same modes that NVMe supports.

NVMe has almost like-for-like equivalents here:
- __mpath_find_path() -> __nvme_find_path()
- mpath_find_path() -> nvme_find_path()

and similar for all introduced callee functions.

Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
setting and showing the default iopolicy.

A separate mpath_iopolicy structure is introduced. No iopolicy member is
included in the mpath_head structure, as that may not suit NVMe, where
the iopolicy is per-subsystem rather than per-namespace.
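
For illustration, a driver could embed struct mpath_iopolicy in its own
per-subsystem structure and wire up the callbacks roughly as below
(hypothetical "foo" driver names, not part of this patch;
foo_head_to_subsystem(), foo_path_is_disabled() etc. are placeholders):

  struct foo_subsystem {
          struct mpath_iopolicy iopolicy; /* shared by all heads in the subsystem */
  };

  static enum mpath_iopolicy_e foo_get_iopolicy(struct mpath_head *head)
  {
          struct foo_subsystem *subsys = foo_head_to_subsystem(head);

          return mpath_read_iopolicy(&subsys->iopolicy);
  }

  static const struct mpath_head_template foo_mpdt = {
          .is_disabled       = foo_path_is_disabled,
          .is_optimized      = foo_path_is_optimized,
          .get_access_state  = foo_path_access_state,
          .get_iopolicy      = foo_get_iopolicy,
          .device_groups     = foo_device_groups,
  };
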
Signed-off-by: John Garry <john.g.garry@oracle.com>
---
include/linux/multipath.h | 36 ++++++
lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++
2 files changed, 287 insertions(+)
diff --git a/include/linux/multipath.h b/include/linux/multipath.h
index be9dd9fb83345..c964a1aba9c42 100644
--- a/include/linux/multipath.h
+++ b/include/linux/multipath.h
@@ -7,6 +7,22 @@
extern const struct block_device_operations mpath_ops;
+enum mpath_iopolicy_e {
+ MPATH_IOPOLICY_NUMA,
+ MPATH_IOPOLICY_RR,
+ MPATH_IOPOLICY_QD,
+};
+
+struct mpath_iopolicy {
+ enum mpath_iopolicy_e iopolicy;
+};
+
+enum mpath_access_state {
+ MPATH_STATE_OPTIMIZED,
+ MPATH_STATE_ACTIVE,
+ MPATH_STATE_INVALID = 0xFF
+};
+
struct mpath_disk {
struct gendisk *disk;
struct kref ref;
@@ -18,10 +34,16 @@ struct mpath_disk {
struct mpath_device {
struct list_head siblings;
+ atomic_t nr_active;
struct gendisk *disk;
+ int numa_node;
};
struct mpath_head_template {
+ bool (*is_disabled)(struct mpath_device *);
+ bool (*is_optimized)(struct mpath_device *);
+ enum mpath_access_state (*get_access_state)(struct mpath_device *);
+ enum mpath_iopolicy_e (*get_iopolicy)(struct mpath_head *);
const struct attribute_group **device_groups;
};
@@ -50,6 +72,14 @@ static inline struct mpath_disk *mpath_gendisk_to_disk(struct gendisk *disk)
return mpath_bd_device_to_disk(disk_to_dev(disk));
}
+static inline enum mpath_iopolicy_e mpath_read_iopolicy(
+ struct mpath_iopolicy *mpath_iopolicy)
+{
+ return READ_ONCE(mpath_iopolicy->iopolicy);
+}
+void mpath_synchronize(struct mpath_head *mpath_head);
+int mpath_set_iopolicy(const char *val, int *iopolicy);
+int mpath_get_iopolicy(char *buf, int iopolicy);
int mpath_get_head(struct mpath_head *mpath_head);
void mpath_put_head(struct mpath_head *mpath_head);
struct mpath_head *mpath_alloc_head(void);
@@ -66,4 +96,10 @@ static inline bool is_mpath_head(struct gendisk *disk)
{
return disk->fops == &mpath_ops;
}
+
+static inline bool mpath_qd_iopolicy(struct mpath_iopolicy *mpath_iopolicy)
+{
+ return mpath_read_iopolicy(mpath_iopolicy) == MPATH_IOPOLICY_QD;
+}
+
#endif // _LIBMULTIPATH_H
diff --git a/lib/multipath.c b/lib/multipath.c
index 88efb0ae16acb..65a0d2d2bf524 100644
--- a/lib/multipath.c
+++ b/lib/multipath.c
@@ -6,8 +6,243 @@
#include <linux/module.h>
#include <linux/multipath.h>
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head);
+
static struct workqueue_struct *mpath_wq;
+static const char *mpath_iopolicy_names[] = {
+ [MPATH_IOPOLICY_NUMA] = "numa",
+ [MPATH_IOPOLICY_RR] = "round-robin",
+ [MPATH_IOPOLICY_QD] = "queue-depth",
+};
+
+int mpath_set_iopolicy(const char *val, int *iopolicy)
+{
+ if (!val)
+ return -EINVAL;
+ if (!strncmp(val, "numa", 4))
+ *iopolicy = MPATH_IOPOLICY_NUMA;
+ else if (!strncmp(val, "round-robin", 11))
+ *iopolicy = MPATH_IOPOLICY_RR;
+ else if (!strncmp(val, "queue-depth", 11))
+ *iopolicy = MPATH_IOPOLICY_QD;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(mpath_set_iopolicy);
+
+int mpath_get_iopolicy(char *buf, int iopolicy)
+{
+ return sprintf(buf, "%s\n", mpath_iopolicy_names[iopolicy]);
+}
+EXPORT_SYMBOL_GPL(mpath_get_iopolicy);
+
+
+void mpath_synchronize(struct mpath_head *mpath_head)
+{
+ synchronize_srcu(&mpath_head->srcu);
+}
+EXPORT_SYMBOL_GPL(mpath_synchronize);
+
+static bool mpath_path_is_disabled(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_disabled(mpath_device);
+}
+
+static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy, int node)
+{
+ int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
+ struct mpath_device *mpath_dev_found, *mpath_dev_fallback,
+ *mpath_device;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ if (mpath_device->numa_node != NUMA_NO_NODE &&
+ (iopolicy == MPATH_IOPOLICY_NUMA))
+ distance = node_distance(node, mpath_device->numa_node);
+ else
+ distance = LOCAL_DISTANCE;
+
+ switch(mpath_head->mpdt->get_access_state(mpath_device)) {
+ case MPATH_STATE_OPTIMIZED:
+ if (distance < found_distance) {
+ found_distance = distance;
+ mpath_dev_found = mpath_device;
+ }
+ break;
+ case MPATH_STATE_ACTIVE:
+ if (distance < fallback_distance) {
+ fallback_distance = distance;
+ mpath_dev_fallback = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!mpath_dev_found)
+ mpath_dev_found = mpath_dev_fallback;
+
+ if (mpath_dev_found)
+ rcu_assign_pointer(mpath_head->current_path[node],
+ mpath_dev_found);
+
+ return mpath_dev_found;
+}
+
+static struct mpath_device *mpath_next_dev(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_dev)
+{
+ mpath_dev = list_next_or_null_rcu(&mpath_head->dev_list,
+ &mpath_dev->siblings, struct mpath_device,
+ siblings);
+
+ if (mpath_dev)
+ return mpath_dev;
+ return list_first_or_null_rcu(&mpath_head->dev_list,
+ struct mpath_device, siblings);
+}
+
+static struct mpath_device *mpath_round_robin_path(
+ struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy)
+{
+ struct mpath_device *mpath_device, *found = NULL;
+ int node = numa_node_id();
+ enum mpath_access_state access_state_old;
+ struct mpath_device *old =
+ srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+
+ if (unlikely(!old))
+ return __mpath_find_path(mpath_head, iopolicy, node);
+
+ if (list_is_singular(&mpath_head->dev_list)) {
+ if (mpath_path_is_disabled(mpath_head, old))
+ return NULL;
+ return old;
+ }
+
+ for (mpath_device = mpath_next_dev(mpath_head, old);
+ mpath_device && mpath_device != old;
+ mpath_device = mpath_next_dev(mpath_head, mpath_device)) {
+ enum mpath_access_state access_state;
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+ access_state = mpath_head->mpdt->get_access_state(mpath_device);
+ if (access_state == MPATH_STATE_OPTIMIZED) {
+ found = mpath_device;
+ goto out;
+ }
+ if (access_state == MPATH_STATE_ACTIVE)
+ found = mpath_device;
+ }
+
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
+ access_state_old = mpath_head->mpdt->get_access_state(old);
+ if (!mpath_path_is_disabled(mpath_head, old) &&
+ (access_state_old == MPATH_STATE_OPTIMIZED ||
+ (!found && access_state_old == MPATH_STATE_ACTIVE)))
+ return old;
+
+ if (!found)
+ return NULL;
+out:
+ rcu_assign_pointer(mpath_head->current_path[node], found);
+
+ return found;
+}
+
+static struct mpath_device *mpath_queue_depth_path(struct mpath_head *mpath_head)
+{
+ struct mpath_device *best_opt = NULL, *mpath_device;
+ struct mpath_device *best_nonopt = NULL;
+ unsigned int min_depth_opt = UINT_MAX, min_depth_nonopt = UINT_MAX;
+ unsigned int depth;
+
+ list_for_each_entry_srcu(mpath_device, &mpath_head->dev_list, siblings,
+ srcu_read_lock_held(&mpath_head->srcu)) {
+
+ if (mpath_path_is_disabled(mpath_head, mpath_device))
+ continue;
+
+ depth = atomic_read(&mpath_device->nr_active);
+
+ switch (mpath_head->mpdt->get_access_state(mpath_device)) {
+ case MPATH_STATE_OPTIMIZED:
+ if (depth < min_depth_opt) {
+ min_depth_opt = depth;
+ best_opt = mpath_device;
+ }
+ break;
+ case MPATH_STATE_ACTIVE:
+ if (depth < min_depth_nonopt) {
+ min_depth_nonopt = depth;
+ best_nonopt = mpath_device;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (min_depth_opt == 0)
+ return best_opt;
+ }
+
+ return best_opt ? best_opt : best_nonopt;
+}
+
+static inline bool mpath_path_is_optimized(struct mpath_head *mpath_head,
+ struct mpath_device *mpath_device)
+{
+ return mpath_head->mpdt->is_optimized(mpath_device);
+}
+
+static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head,
+ enum mpath_iopolicy_e iopolicy)
+{
+ int node = numa_node_id();
+ struct mpath_device *mpath_device;
+
+ mpath_device = srcu_dereference(mpath_head->current_path[node],
+ &mpath_head->srcu);
+ if (unlikely(!mpath_device))
+ return __mpath_find_path(mpath_head, iopolicy, node);
+ if (unlikely(!mpath_path_is_optimized(mpath_head, mpath_device)))
+ return __mpath_find_path(mpath_head, iopolicy, node);
+ return mpath_device;
+}
+
+__maybe_unused
+static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
+{
+ enum mpath_iopolicy_e iopolicy =
+ mpath_head->mpdt->get_iopolicy(mpath_head);
+
+ switch (iopolicy) {
+ case MPATH_IOPOLICY_QD:
+ return mpath_queue_depth_path(mpath_head);
+ case MPATH_IOPOLICY_RR:
+ return mpath_round_robin_path(mpath_head, iopolicy);
+ default:
+ return mpath_numa_path(mpath_head, iopolicy);
+ }
+}
+
static void mpath_free_head(struct kref *ref)
{
struct mpath_head *mpath_head =
@@ -99,6 +334,7 @@ void mpath_remove_disk(struct mpath_disk *mpath_disk)
if (test_and_clear_bit(MPATH_HEAD_DISK_LIVE, &mpath_head->flags)) {
struct gendisk *disk = mpath_disk->disk;
+ mpath_synchronize(mpath_head);
del_gendisk(disk);
}
}
@@ -158,6 +394,21 @@ void mpath_device_set_live(struct mpath_disk *mpath_disk,
}
queue_work(mpath_wq, &mpath_disk->partition_scan_work);
}
+
+ mutex_lock(&mpath_head->lock);
+ if (mpath_path_is_optimized(mpath_head, mpath_device)) {
+ int node, srcu_idx;
+
+ srcu_idx = srcu_read_lock(&mpath_head->srcu);
+ for_each_online_node(node)
+ __mpath_find_path(mpath_head,
+ mpath_head->mpdt->get_iopolicy(mpath_head),
+ node);
+ srcu_read_unlock(&mpath_head->srcu, srcu_idx);
+ }
+ mutex_unlock(&mpath_head->lock);
+
+ mpath_synchronize(mpath_head);
}
EXPORT_SYMBOL_GPL(mpath_device_set_live);
--
2.43.5
Hi John,
On 2/25/26 9:02 PM, John Garry wrote:
> +static struct mpath_device *__mpath_find_path(struct mpath_head *mpath_head,
> + enum mpath_iopolicy_e iopolicy, int node)
> +{
> + int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
> + struct mpath_device *mpath_dev_found, *mpath_dev_fallback,
> + *mpath_device;
> +
I think we should initialize mpath_dev_found and mpath_dev_fallback to
NULL. Otherwise this may lead to adding a junk mpath_device pointer
in ->current_path[node] when mpath_head->dev_list is empty. This may
particularly manifest when a controller is being shut down and I/O is
concurrently forwarded to the same controller.
Thanks,
--Nilay
On 04/03/2026 13:10, Nilay Shroff wrote:
> On 2/25/26 9:02 PM, John Garry wrote:
>> +static struct mpath_device *__mpath_find_path(struct mpath_head
>> *mpath_head,
>> + enum mpath_iopolicy_e iopolicy, int node)
>> +{
>> + int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
>> + struct mpath_device *mpath_dev_found, *mpath_dev_fallback,
>> + *mpath_device;
>> +
>
> I think we should initialize mpath_dev_found and mpath_dev_fallback to
> NULL. Otherwise this may lead upto adding a junk mpath_device pointer
> in ->current_path[node] when mpath_head->dev_list is empty. This may
> particularly manifests when a controller is being shutdown and
> concurrently I/O is forwarded to the same controller.
Right, I see that we were doing the equivalent in __nvme_find_path().
Will fix.
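
i.e. something like this at the top of __mpath_find_path():

  int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
  struct mpath_device *mpath_dev_found = NULL, *mpath_dev_fallback = NULL;
  struct mpath_device *mpath_device;
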
Thanks for the notice.
On 2/25/26 9:02 PM, John Garry wrote:
> Add code for path selection.
>
> NVMe ANA is abstracted into enum mpath_access_state. The motivation here is
> so that SCSI ALUA can be used. Callbacks .is_disabled, .is_optimized,
> .get_access_state are added to get the path access state.
>
> Path selection modes round-robin, NUMA, and queue-depth are added, same
> as NVMe supports.
>
> NVMe has almost like-for-like equivalents here:
> - __mpath_find_path() -> __nvme_find_path()
> - mpath_find_path() -> nvme_find_path()
>
> and similar for all introduced callee functions.
>
> Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
> setting default iopolicy.
>
> A separate mpath_iopolicy structure is introduced. There is no iopolicy
> member included in the mpath_head structure as it may not suit NVMe, where
> iopolicy is per-subsystem and not per namespace.
>
> Signed-off-by: John Garry <john.g.garry@oracle.com>
> ---
> include/linux/multipath.h | 36 ++++++
> lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++
> 2 files changed, 287 insertions(+)
>
> diff --git a/include/linux/multipath.h b/include/linux/multipath.h
> index be9dd9fb83345..c964a1aba9c42 100644
> --- a/include/linux/multipath.h
> +++ b/include/linux/multipath.h
> @@ -7,6 +7,22 @@
>
> extern const struct block_device_operations mpath_ops;
>
> +enum mpath_iopolicy_e {
> + MPATH_IOPOLICY_NUMA,
> + MPATH_IOPOLICY_RR,
> + MPATH_IOPOLICY_QD,
> +};
> +
> +struct mpath_iopolicy {
> + enum mpath_iopolicy_e iopolicy;
> +};
> +
> +enum mpath_access_state {
> + MPATH_STATE_OPTIMIZED,
> + MPATH_STATE_ACTIVE,
> + MPATH_STATE_INVALID = 0xFF
> +};
Hmm so here we don't have MPATH_STATE_NONOPTIMIZED.
We are morphing NVME_ANA_NONOPTIMIZED as MPATH_STATE_ACTIVE.
Is it because SCSI doesn't have (NONOPTIMIZED) state?
> +
> struct mpath_disk {
> struct gendisk *disk;
> struct kref ref;
> @@ -18,10 +34,16 @@ struct mpath_disk {
>
> struct mpath_device {
> struct list_head siblings;
> + atomic_t nr_active;
> struct gendisk *disk;
> + int numa_node;
> };
>
I haven't seen any API which helps set nr_active or numa_node.
Do we need to have those under struct mpath_head_template?
Thanks,
--Nilay
On 02/03/2026 12:36, Nilay Shroff wrote:
> On 2/25/26 9:02 PM, John Garry wrote:
>> Add code for path selection.
>>
>> NVMe ANA is abstracted into enum mpath_access_state. The motivation
>> here is
>> so that SCSI ALUA can be used. Callbacks .is_disabled, .is_optimized,
>> .get_access_state are added to get the path access state.
>>
>> Path selection modes round-robin, NUMA, and queue-depth are added, same
>> as NVMe supports.
>>
>> NVMe has almost like-for-like equivalents here:
>> - __mpath_find_path() -> __nvme_find_path()
>> - mpath_find_path() -> nvme_find_path()
>>
>> and similar for all introduced callee functions.
>>
>> Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
>> setting default iopolicy.
>>
>> A separate mpath_iopolicy structure is introduced. There is no iopolicy
>> member included in the mpath_head structure as it may not suit NVMe,
>> where
>> iopolicy is per-subsystem and not per namespace.
>>
>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>> ---
>> include/linux/multipath.h | 36 ++++++
>> lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++
>> 2 files changed, 287 insertions(+)
>>
>> diff --git a/include/linux/multipath.h b/include/linux/multipath.h
>> index be9dd9fb83345..c964a1aba9c42 100644
>> --- a/include/linux/multipath.h
>> +++ b/include/linux/multipath.h
>> @@ -7,6 +7,22 @@
>> extern const struct block_device_operations mpath_ops;
>> +enum mpath_iopolicy_e {
>> + MPATH_IOPOLICY_NUMA,
>> + MPATH_IOPOLICY_RR,
>> + MPATH_IOPOLICY_QD,
>> +};
>> +
>> +struct mpath_iopolicy {
>> + enum mpath_iopolicy_e iopolicy;
>> +};
>> +
>> +enum mpath_access_state {
>> + MPATH_STATE_OPTIMIZED,
>> + MPATH_STATE_ACTIVE,
>> + MPATH_STATE_INVALID = 0xFF
>> +};
> Hmm so here we don't have MPATH_STATE_NONOPTIMIZED.
> We are morphing NVME_ANA_NONOPTIMIZED as MPATH_STATE_ACTIVE.
Yes, well it is treated the same (as NVME_ANA_NONOPTIMIZED) for path
selection.
> Is it because SCSI doesn't have (NONOPTIMIZED) state?
It does have an active (and optimal) state, but I think that keeping
NVMe terminology may be better for now.
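
e.g. a SCSI driver could translate ALUA access states into these roughly
as below (sketch only, using the existing SCSI_ACCESS_STATE_* constants;
the function name is hypothetical):

  static enum mpath_access_state foo_alua_to_mpath_state(unsigned char state)
  {
          switch (state) {
          case SCSI_ACCESS_STATE_OPTIMAL:
                  return MPATH_STATE_OPTIMIZED;
          case SCSI_ACCESS_STATE_ACTIVE:
                  return MPATH_STATE_ACTIVE;
          default:
                  return MPATH_STATE_INVALID;
          }
  }
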
>
>> +
>> struct mpath_disk {
>> struct gendisk *disk;
>> struct kref ref;
>> @@ -18,10 +34,16 @@ struct mpath_disk {
>> struct mpath_device {
>> struct list_head siblings;
>> + atomic_t nr_active;
>> struct gendisk *disk;
>> + int numa_node;
>> };
> I haven't seen any API which help set nr_active or numa_node.
I missed setting numa_node for NVMe. About nr_active, that is set/read
by the NVMe code, like nvme_mpath_start_request(). I did try to abstract
that function into a common helper, but it just becomes a mess.
> Do we need to have those under struct mpath_head_template ?
I think that the drivers can handle these directly.
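
e.g. something like the below in the driver's request start/end paths
(sketch only, foo_rq_to_mpath_device() is a hypothetical helper):

  static void foo_mpath_start_request(struct request *rq)
  {
          struct mpath_device *mpath_dev = foo_rq_to_mpath_device(rq);

          atomic_inc(&mpath_dev->nr_active);
  }

  static void foo_mpath_end_request(struct request *rq)
  {
          struct mpath_device *mpath_dev = foo_rq_to_mpath_device(rq);

          atomic_dec(&mpath_dev->nr_active);
  }
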
Thanks
On 3/2/26 8:41 PM, John Garry wrote:
> On 02/03/2026 12:36, Nilay Shroff wrote:
>> On 2/25/26 9:02 PM, John Garry wrote:
>>> Add code for path selection.
>>>
>>> NVMe ANA is abstracted into enum mpath_access_state. The motivation
>>> here is
>>> so that SCSI ALUA can be used. Callbacks .is_disabled, .is_optimized,
>>> .get_access_state are added to get the path access state.
>>>
>>> Path selection modes round-robin, NUMA, and queue-depth are added, same
>>> as NVMe supports.
>>>
>>> NVMe has almost like-for-like equivalents here:
>>> - __mpath_find_path() -> __nvme_find_path()
>>> - mpath_find_path() -> nvme_find_path()
>>>
>>> and similar for all introduced callee functions.
>>>
>>> Functions mpath_set_iopolicy() and mpath_get_iopolicy() are added for
>>> setting default iopolicy.
>>>
>>> A separate mpath_iopolicy structure is introduced. There is no iopolicy
>>> member included in the mpath_head structure as it may not suit NVMe,
>>> where
>>> iopolicy is per-subsystem and not per namespace.
>>>
>>> Signed-off-by: John Garry <john.g.garry@oracle.com>
>>> ---
>>> include/linux/multipath.h | 36 ++++++
>>> lib/multipath.c | 251 ++++++++++++++++++++++++++++++++++++++
>>> 2 files changed, 287 insertions(+)
>>>
>>> diff --git a/include/linux/multipath.h b/include/linux/multipath.h
>>> index be9dd9fb83345..c964a1aba9c42 100644
>>> --- a/include/linux/multipath.h
>>> +++ b/include/linux/multipath.h
>>> @@ -7,6 +7,22 @@
>>> extern const struct block_device_operations mpath_ops;
>>> +enum mpath_iopolicy_e {
>>> + MPATH_IOPOLICY_NUMA,
>>> + MPATH_IOPOLICY_RR,
>>> + MPATH_IOPOLICY_QD,
>>> +};
>>> +
>>> +struct mpath_iopolicy {
>>> + enum mpath_iopolicy_e iopolicy;
>>> +};
>>> +
>>> +enum mpath_access_state {
>>> + MPATH_STATE_OPTIMIZED,
>>> + MPATH_STATE_ACTIVE,
>>> + MPATH_STATE_INVALID = 0xFF
>>> +};
>> Hmm so here we don't have MPATH_STATE_NONOPTIMIZED.
>> We are morphing NVME_ANA_NONOPTIMIZED as MPATH_STATE_ACTIVE.
>
> Yes, well it is treated the same (as NVME_ANA_NONOPTIMIZED) for path
> selection.
>
>> Is it because SCSI doesn't have (NONOPTIMIZED) state?
>
> It does have an active (and optimal) state, but I think that keeping
> NVMe terminology may be better for now.
>
>>
>>> +
>>> struct mpath_disk {
>>> struct gendisk *disk;
>>> struct kref ref;
>>> @@ -18,10 +34,16 @@ struct mpath_disk {
>>> struct mpath_device {
>>> struct list_head siblings;
>>> + atomic_t nr_active;
>>> struct gendisk *disk;
>>> + int numa_node;
>>> };
>> I haven't seen any API which help set nr_active or numa_node.
>
> I missed setting numa_node for NVMe. About nr_active, that is set/read
> by the NVMe code, like nvme_mpath_start_request(). I did try to abstract
> that function into a common helper, but it just becomes a mess.
>
The nvme_mpath_start_request() increments ns->ctrl->nr_active, and
nvme_mpath_end_request() decrements it. This means that nr_active is
maintained per controller. If multiple NVMe namespaces are created and
attached to the same controller, their I/O activity is accumulated in
the single ctrl->nr_active counter.
In contrast, libmultipath defines nr_active in struct mpath_device,
which is referenced from struct nvme_ns. Even if we add code to update
mpath_device->nr_active, that accounting would effectively be per
namespace, not per controller.
The nr_active value is used by the queue-depth policy. Currently,
mpath_queue_depth_path() accesses mpath_device->nr_active to make
forwarding decisions. However, if mpath_device->nr_active is maintained
per namespace, it does not correctly reflect controller-wide load when
multiple namespaces share the same controller.
Therefore, instead of maintaining a separate nr_active in struct
mpath_device, it may be more appropriate for mpath_queue_depth_path() to
reference ns->ctrl->nr_active directly. In that case, nr_active could be
removed from struct mpath_device entirely.
Thanks,
--Nilay
>>
> The nvme_mpath_start_request() increments ns->ctrl->nr_active, and
> nvme_mpath_end_request() decrements it. This means that nr_active is
> maintained per controller. If multiple NVMe namespaces are created and
> attached to the same controller, their I/O activity is accumulated in
> the single ctrl->nr_active counter.
>
> In contrast, libmultipath defines nr_active in struct mpath_device,
> which is referenced from struct nvme_ns. Even if we add code to update
> mpath_device->nr_active, that accounting would effectively be per
> namespace, not per controller.

Right, I need to change that back to per-controller.

>
> The nr_active value is used by the queue-depth policy. Currently,
> mpath_queue_depth_path() accesses mpath_device->nr_active to make
> forwarding decisions. However, if mpath_device->nr_active is maintained
> per namespace, it does not correctly reflect controller-wide load when
> multiple namespaces share the same controller.

Yes

>
> Therefore, instead of maintaining a separate nr_active in struct
> mpath_device, it may be more appropriate for mpath_queue_depth_path() to
> reference ns->ctrl->nr_active directly. In that case, nr_active could be
> removed from struct mpath_device entirely.
>

I think so, but we will need scsi to maintain such a count internally to
support this policy. And for NVMe we will need some abstraction to
lookup the per-controller QD for a mpath_device.

Thanks for checking!
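
One possible shape for that per-controller lookup (sketch only, with a
hypothetical callback and helper name):

  /* extra callback in struct mpath_head_template */
  unsigned int (*get_queue_depth)(struct mpath_device *);

  /* NVMe implementation returning the per-controller count */
  static unsigned int foo_nvme_get_queue_depth(struct mpath_device *mpath_dev)
  {
          struct nvme_ns *ns = foo_mpath_device_to_ns(mpath_dev);

          return atomic_read(&ns->ctrl->nr_active);
  }

mpath_queue_depth_path() would then call this instead of reading
mpath_device->nr_active directly.
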
On 3/3/26 6:11 PM, John Garry wrote:
>>>
>> The nvme_mpath_start_request() increments ns->ctrl->nr_active, and
>> nvme_mpath_end_request() decrements it. This means that nr_active is
>> maintained per controller. If multiple NVMe namespaces are created and
>> attached to the same controller, their I/O activity is accumulated in
>> the single ctrl->nr_active counter.
>>
>> In contrast, libmultipath defines nr_active in struct mpath_device,
>> which is referenced from struct nvme_ns. Even if we add code to update
>> mpath_device->nr_active, that accounting would effectively be per
>> namespace, not per controller.
>
> Right, I need to change that back to per-controller.
>
>>
>> The nr_active value is used by the queue-depth policy. Currently,
>> mpath_queue_depth_path() accesses mpath_device->nr_active to make
>> forwarding decisions. However, if mpath_device->nr_active is
>> maintained per namespace, it does not correctly reflect controller-
>> wide load when multiple namespaces share the same controller.
>
> Yes
>
>>
>> Therefore, instead of maintaining a separate nr_active in struct
>> mpath_device, it may be more appropriate for mpath_queue_depth_path()
>> to reference ns->ctrl->nr_active directly. In that case, nr_active
>> could be removed from struct mpath_device entirely.
>>
>
> I think so, but we will need scsi to maintain such a count internally to
> support this policy. And for NVMe we will need some abstraction to
> lookup the per-controller QD for a mpath_device.
>

This raises another question regarding the current framework. From what
I can see, all NVMe multipath I/O policies are currently supported for
SCSI as well. Going forward, if we introduce a new I/O policy for NVMe
that does not make sense for SCSI, how can we ensure that the new policy
is supported only for NVMe and not for SCSI? Conversely, we may also
want to introduce a policy that is relevant only for SCSI but not for
NVMe.

With the current framework, it seems difficult to restrict a policy to a
specific transport. It appears that all policies are implicitly shared
between NVMe and SCSI.

Would it make sense to introduce some abstraction for I/O policies in
the framework so that a given policy can be implemented and exposed only
for the relevant transport (e.g., NVMe-only or SCSI-only), rather than
requiring it to be supported by both?

Thanks,
--Nilay
On 04/03/2026 10:26, Nilay Shroff wrote:
>>
>> I think so, but we will need scsi to maintain such a count internally
>> to support this policy. And for NVMe we will need some abstraction to
>> lookup the per-controller QD for a mpath_device.
>>
> This raises another question regarding the current framework. From what
> I can see, all NVMe multipath I/O policies are currently supported for
> SCSI as well. Going forward, if we introduce a new I/O policy for NVMe
> that does not make sense for SCSI, how can we ensure that the new policy
> is supported only for NVMe and not for SCSI? Conversely, we may also
> want to introduce a policy that is relevant only for SCSI but not for NVMe.
>
> With the current framework, it seems difficult to restrict a policy to a
> specific transport. It appears that all policies are implicitly shared
> between NVMe and SCSI.
>
> Would it make sense to introduce some abstraction for I/O policies in
> the framework so that a given policy can be implemented and exposed only
> for the relevant transport (e.g., NVMe-only or SCSI-only), rather than
> requiring it to be supported by both?

I think that we can cross that bridge if it ever happens. It should not
be too difficult to allow a driver to specify which policies are
supported/unsupported and the lib can take care of management of that.

Thanks,
John
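
For instance, that could be as simple as a mask in the template (sketch
only, hypothetical field):

  /* per-driver mask of supported policies, e.g. BIT(MPATH_IOPOLICY_QD) */
  unsigned long supported_iopolicies;

with mpath_set_iopolicy() rejecting anything the driver has not set in
the mask.
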
On Wed, Feb 25, 2026 at 03:32:15PM +0000, John Garry wrote:
> +__maybe_unused
> +static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
> +{
> + enum mpath_iopolicy_e iopolicy =
> + mpath_head->mpdt->get_iopolicy(mpath_head);
> +
> + switch (iopolicy) {
> + case MPATH_IOPOLICY_QD:
> + return mpath_queue_depth_path(mpath_head);
> + case MPATH_IOPOLICY_RR:
> + return mpath_round_robin_path(mpath_head, iopolicy);
> + default:
> + return mpath_numa_path(mpath_head, iopolicy);
When we're in mpath_round_robin_path() and mpath_numa_path(), we know the
iopolicy, so we don't really need to pass it in.
-Ben
> + }
> +}
> +
On 26/02/2026 03:37, Benjamin Marzinski wrote:
> On Wed, Feb 25, 2026 at 03:32:15PM +0000, John Garry wrote:
>> +__maybe_unused
>> +static struct mpath_device *mpath_find_path(struct mpath_head *mpath_head)
>> +{
>> + enum mpath_iopolicy_e iopolicy =
>> + mpath_head->mpdt->get_iopolicy(mpath_head);
>> +
>> + switch (iopolicy) {
>> + case MPATH_IOPOLICY_QD:
>> + return mpath_queue_depth_path(mpath_head);
>> + case MPATH_IOPOLICY_RR:
>> + return mpath_round_robin_path(mpath_head, iopolicy);
>> + default:
>> + return mpath_numa_path(mpath_head, iopolicy);
> When we're in mpath_round_robin_path() and mpath_numa_path(), we know the
> iopolicy, so we don't really need to pass it in.
Sure, I don't need to pass that around.
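
e.g. mpath_numa_path() then simply becomes (untested):

  static struct mpath_device *mpath_numa_path(struct mpath_head *mpath_head)
  {
          int node = numa_node_id();
          struct mpath_device *mpath_device;

          mpath_device = srcu_dereference(mpath_head->current_path[node],
                                          &mpath_head->srcu);
          if (unlikely(!mpath_device) ||
              unlikely(!mpath_path_is_optimized(mpath_head, mpath_device)))
                  return __mpath_find_path(mpath_head, MPATH_IOPOLICY_NUMA,
                                           node);
          return mpath_device;
  }

and mpath_round_robin_path() can likewise pass MPATH_IOPOLICY_RR to
__mpath_find_path() directly.
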
Thanks!