Since device-mapper multipath will no longer be operating on NVMe
devices, there is no longer a need for the "multipath" parameter.
Note that, when compiled with CONFIG_NVME_MULTIPATH off, multi-path
capable controllers and namespaces will continue to present multiple
device entries - one for each controller/namespace discovered. This
could be confusing, as device-mapper multipath relies upon code in
nvme/host/multipath.c, and running device-mapper multipath with a
kernel compiled with CONFIG_NVME_MULTIPATH disabled is not supported.
Closes: https://lore.kernel.org/linux-nvme/20241121220321.40616-1-bgurney@redhat.com/
Tested-by: John Meneghini <jmeneghi@redhat.com>
Reviewed-by: John Meneghini <jmeneghi@redhat.com>
Signed-off-by: Bryan Gurney <bgurney@redhat.com>
---
drivers/nvme/host/core.c | 14 ++++----------
drivers/nvme/host/multipath.c | 14 ++++++--------
drivers/nvme/host/nvme.h | 2 --
3 files changed, 10 insertions(+), 20 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2bcd9f710cb6..b07cd482fbc1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3809,14 +3809,6 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
info->nsid);
goto out_put_ns_head;
}
-
- if (!multipath) {
- dev_warn(ctrl->device,
- "Found shared namespace %d, but multipathing not supported.\n",
- info->nsid);
- dev_warn_once(ctrl->device,
- "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
- }
}
list_add_tail_rcu(&ns->siblings, &head->list);
@@ -3915,12 +3907,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
ctrl->instance, ns->head->instance);
disk->flags |= GENHD_FL_HIDDEN;
- } else if (multipath) {
+ } else {
+#ifdef CONFIG_NVME_MULTIPATH
sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
ns->head->instance);
- } else {
+#else
sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
ns->head->instance);
+#endif
}
if (nvme_update_ns_info(ns, info))
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index a85d190942bd..28ab868182b2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -9,11 +9,6 @@
#include <trace/events/block.h>
#include "nvme.h"
-bool multipath = true;
-module_param(multipath, bool, 0444);
-MODULE_PARM_DESC(multipath,
- "turn on native support for multiple controllers per subsystem");
-
static const char *nvme_iopolicy_names[] = {
[NVME_IOPOLICY_NUMA] = "numa",
[NVME_IOPOLICY_RR] = "round-robin",
@@ -632,9 +627,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
* We also do this for private namespaces as the namespace sharing flag
* could change after a rescan.
*/
+#ifdef CONFIG_NVME_MULTIPATH
if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
- !nvme_is_unique_nsid(ctrl, head) || !multipath)
+ !nvme_is_unique_nsid(ctrl, head))
return 0;
+#endif
blk_set_stacking_limits(&lim);
lim.dma_alignment = 3;
@@ -1038,10 +1035,11 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
size_t ana_log_size;
int error = 0;
+#ifdef CONFIG_NVME_MULTIPATH
/* check if multipath is enabled and we have the capability */
- if (!multipath || !ctrl->subsys ||
- !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
+ if (!ctrl->subsys || !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
return 0;
+#endif
/* initialize this in the identify path to cover controller resets */
atomic_set(&ctrl->nr_active, 0);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 2c76afd00390..6dea04f05b59 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -972,7 +972,6 @@ static inline void nvme_trace_bio_complete(struct request *req)
trace_block_bio_complete(ns->head->disk->queue, req->bio);
}
-extern bool multipath;
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -982,7 +981,6 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
return disk->fops == &nvme_ns_head_ops;
}
#else
-#define multipath false
static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
{
return false;
--
2.47.0
Bryan,

See my comments below.
On 2/4/25 4:11 PM, Bryan Gurney wrote:
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index a85d190942bd..28ab868182b2 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -9,11 +9,6 @@
> #include <trace/events/block.h>
> #include "nvme.h"
>
> -bool multipath = true;
> -module_param(multipath, bool, 0444);
> -MODULE_PARM_DESC(multipath,
> - "turn on native support for multiple controllers per subsystem");
> -
> static const char *nvme_iopolicy_names[] = {
> [NVME_IOPOLICY_NUMA] = "numa",
> [NVME_IOPOLICY_RR] = "round-robin",
> @@ -632,9 +627,11 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
> * We also do this for private namespaces as the namespace sharing flag
> * could change after a rescan.
> */
> +#ifdef CONFIG_NVME_MULTIPATH
> if (!(ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) ||
> - !nvme_is_unique_nsid(ctrl, head) || !multipath)
> + !nvme_is_unique_nsid(ctrl, head))
> return 0;
> +#endif
You don't need to add these #ifdef CONFIG_NVME_MULTIPATH conditionals
to multipath.c because multipath.c is not compiled at all when CONFIG_NVME_MULTIPATH=N.
They won't hurt anything, but they are redundant and we can make the patch smaller by removing them.
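(For reference, multipath.c is only pulled into the build when the option is on; IIRC
drivers/nvme/host/Makefile gates it with something like:

    nvme-core-$(CONFIG_NVME_MULTIPATH)	+= multipath.o

so in a CONFIG_NVME_MULTIPATH=N build none of this code exists in the first place.)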
> blk_set_stacking_limits(&lim);
> lim.dma_alignment = 3;
> @@ -1038,10 +1035,11 @@ int nvme_mpath_init_identify(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
> size_t ana_log_size;
> int error = 0;
>
> +#ifdef CONFIG_NVME_MULTIPATH
> /* check if multipath is enabled and we have the capability */
> - if (!multipath || !ctrl->subsys ||
> - !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
> + if (!ctrl->subsys || !(ctrl->subsys->cmic & NVME_CTRL_CMIC_ANA))
> return 0;
> +#endif
Same here.
> /* initialize this in the identify path to cover controller resets */
> atomic_set(&ctrl->nr_active, 0);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 2c76afd00390..6dea04f05b59 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -972,7 +972,6 @@ static inline void nvme_trace_bio_complete(struct request *req)
> trace_block_bio_complete(ns->head->disk->queue, req->bio);
> }
>
/John
Bryan, since we are re-spinning this patch please make the changes below.
On 2/4/25 4:11 PM, Bryan Gurney wrote:
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 2bcd9f710cb6..b07cd482fbc1 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3809,14 +3809,6 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
> info->nsid);
> goto out_put_ns_head;
> }
> -
> - if (!multipath) {
> - dev_warn(ctrl->device,
> - "Found shared namespace %d, but multipathing not supported.\n",
> - info->nsid);
> - dev_warn_once(ctrl->device,
> - "Support for shared namespaces without CONFIG_NVME_MULTIPATH is deprecated and will be removed in Linux 6.0.\n");
> - }
I think we want to keep a warning similar to the one above, only moved down below.
Let's wait for Sagi's approval of the exact message.
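As a strawman only (the exact wording is TBD, and this just reuses the variables that were
in scope for the warning being removed):

		dev_warn(ctrl->device,
			"Found shared namespace %d, but multipathing is not enabled.\n",
			info->nsid);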
> }
>
> list_add_tail_rcu(&ns->siblings, &head->list);
> @@ -3915,12 +3907,14 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
> sprintf(disk->disk_name, "nvme%dc%dn%d", ctrl->subsys->instance,
> ctrl->instance, ns->head->instance);
> disk->flags |= GENHD_FL_HIDDEN;
> - } else if (multipath) {
> + } else {
> +#ifdef CONFIG_NVME_MULTIPATH
> sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
> ns->head->instance);
> - } else {
> +#else
You don't need this #ifdef conditional statement because the nvme_ns_head_multipath() function checks CONFIG_NVME_MULTIPATH
for you, and the sprintf(disk->disk_name, "nvme%dn%d", ...) statement is kind of redundant.
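(For context, nvme_ns_head_multipath() in nvme.h is roughly:

static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
{
	return IS_ENABLED(CONFIG_NVME_MULTIPATH) && head->disk;
}

so the compile-time check is already folded in on that side.)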
> sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
> ns->head->instance);
> +#endif
> }
>
> if (nvme_update_ns_info(ns, info))
Thanks,
/John
On 2/18/25 9:26 AM, John Meneghini wrote:
>> + } else {
>> +#ifdef CONFIG_NVME_MULTIPATH
>> sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
>> ns->head->instance);
>> - } else {
>> +#else
>
> You don't need this #ifdef conditional statement because the nvme_ns_head_multipath() function checks CONFIG_NVME_MULTIPATH
> for you, and the sprintf(disk->disk_name, "nvme%dn%d", ...) statement is kind of redundant.
>
>
>> sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
>> ns->head->instance);
>> +#endif
Actually, I think I may be wrong about this. To maintain 100% backward compatibility with what the users see in the dmesg log, I think we need this code.
/John
Keith, Christoph and Sagi,
This patch has been fully tested and analyzed by Red Hat's QA group and no
unexpected side effects or regressions have been found. Both NVMe/FC and NVMe/TCP
have been tested. Our QE engineer has asked me to report this upstream.
Tested-by: Marco Patalano <mpatalan@redhat.com>
Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme subsystems")
As discussed in previous email threads, the nvme_core.multipath parameter was included
by Christoph in the original implementation of nvme/host/multipath.c back in 2017
at the request of Red Hat. At that time Red Hat was only supporting DMMP multipath
with NVMe and we needed a way to disable core native nvme multipathing without
reconfiguring the kernel.

The nvme_core.multipath parameter has been used by Red Hat, together with some additional
out-of-tree changes to nvme/host/core.c, to support DMMP multipath with NVMe since RHEL-8.
However, the plan all along has been to deprecate and remove support for DMMP multipath with NVMe
in RHEL and to remove this parameter. This change has been advertised and communicated to our
customers and partners, beginning with RHEL-9, through release notes and other communications.

In RHEL-9 we changed the default setting of this parameter from "N" to "Y" to match the upstream
kernel and to move our customers to native nvme multipath. DMMP multipath w/NVMe was deprecated
in RHEL-9 but still supported. Customers were still able to optionally enable DMMP multipath
with NVMe if they wanted, by changing this parameter, and many of them do.

In the (soon to be released) RHEL-10 release DMMP multipath is no longer supported with NVMe and
the out-of-tree patches needed to support DMMP multipath with NVMe have been removed.

Red Hat is proposing this patch to remove the nvme_core.multipath parameter from the kernel, as we believe
leaving this unsupported parameter in the kernel for future releases will only lead to confusion.

We plan to ship this patch with RHEL-10. So it would be really good if we could get this
change accepted and merged upstream, perhaps into v6.15.
/John
On 2/4/25 4:11 PM, Bryan Gurney wrote:
> [...]
On Thu, Feb 13, 2025 at 03:37:28PM -0500, John Meneghini wrote:
> Keith, Christoph and Sagi,
>
> This patch has been fully tested and analyzed by Red Hat's QA group and no
> unexpected side effects or regressions have been found. Both NVMe/FC and NVMe/TCP
> have been tested. Our QE engineer has asked me to report this upstream.

What's the harm in leaving the parameter? *I* use it so I can test both
ways without needing to compile multiple kernels.
On 2/18/25 10:06 AM, Keith Busch wrote:
> On Thu, Feb 13, 2025 at 03:37:28PM -0500, John Meneghini wrote:
>> [...]
>
> What's the harm in leaving the parameter? *I* use it so I can test both
> ways without needing to compile multiple kernels.

LOL. We've been talking about this since 2017. The goal has always been to remove support for DMMP with NVMe.

We want to remove this parameter because it is causing confusion with users and customers who keep trying to use
DMMP with their multipath NVMe devices.

We want to remove this parameter because:

1) the upstream kernel does not support multipath nvme devices without CONFIG_NVME_MULTIPATH enabled
2) the upstream kernel does not support multipath nvme devices when core.nvme_multipath is set to N
3) Non-multipath nvme devices are supported just fine when core.nvme_multipath is set to Y

You don't need to set core.nvme_multipath to N to test your devices both ways w/o recompiling the kernel.
All of the code paths involved here are controlled by NVME_CTRL_CMIC_MULTI_CTRL and setting core.nvme_multipath
to N doesn't do anything to help your single path NVMe devices. It doesn't remove multipath.c, reduce the code
path length or do anything else to optimize your non-NVME_CTRL_CMIC_MULTI_CTRL devices. All it does is provide
an escape hatch to disable the in-core multipath scheduler and start creating multiple /dev/nvme%dn%d entries so
that DMMP can be used with multipath capable NVMe devices.

Personally, I'd like to remove CONFIG_NVME_MULTIPATH as well. It's just another source of confusion. Most users
are running Linux with the default settings for NVME_MULTIPATH. This is what Red Hat customers use and that's
what's used upstream. So what's the point?

/John
On Tue, Feb 18, 2025 at 11:31:58AM -0500, John Meneghini wrote:
> On 2/18/25 10:06 AM, Keith Busch wrote:
>> What's the harm in leaving the parameter? *I* use it so I can test both
>> ways without needing to compile multiple kernels.
>
> LOL. We've been talking about this since 2017. The goal has always been to remove support for DMMP with NVMe.

I understand that disabling nvme native mp is required for device
mapper, and I get you want to prevent the possibility of anyone using
dm-mp with nvme, but that isn't the only user that wants to see all
namespace paths.

> We want to remove this parameter because it is causing confusion with users and customers who keep trying to use
> DMMP with their multipath NVMe devices.
>
> We want to remove this parameter because:
>
> 1) the upstream kernel does not support multipath nvme devices without CONFIG_NVME_MULTIPATH enabled

What do you mean by "support"? I assume you mean no one upstream will
help you debug your problems if you've set yourself up that way, and
that's probably true. The kernel currently doesn't stop you from doing
this though, so it's supported in that sense. Some people are fine doing
this on their own, they're not seeking upstream help. Changing this
would break userspace because it makes the driver fail to create device
nodes that used to show up.

> 2) the upstream kernel does not support multipath nvme devices when core.nvme_multipath is set to N
> 3) Non-multipath nvme devices are supported just fine when core.nvme_multipath is set to Y
>
> [...]
>
> Personally, I'd like to remove CONFIG_NVME_MULTIPATH as well. It's just another source of confusion. Most users
> are running Linux with the default settings for NVME_MULTIPATH. This is what Red Hat customers use and that's
> what's used upstream. So what's the point?

There are devices that report CMIC and NMIC despite being single path,
perhaps as some vestigial sr-iov feature. That adds an unnecessary layer
for all IO to go through. Having the param makes it easy to test both
possible driver paths. In production though, I'd expect to just disable
the CONFIG option if that's the behavior someone wants, so I think the
config option ought to stay.
On 2/18/25 10:45 PM, Keith Busch wrote:
> On Tue, Feb 18, 2025 at 11:31:58AM -0500, John Meneghini wrote:
>> LOL. We've been talking about this since 2017. The goal has always been to remove support for DMMP with NVMe.
>
> I understand that disabling nvme native mp is required for device
> mapper, and I get you want to prevent the possibility of anyone using
> dm-mp with nvme, but that isn't the only user that wants to see all
> namespace paths.
>
Agreed! I do have nvme multi-controller disks and I toggle the multipath module parameter to
test it both ways: one with each individual path to a shared namespace and another with
a single head node path, and then let the IO policy determine which path to choose for sending
IO to the disk.

In fact, we have a few nvme blktests which only run if the device is configured with a single path,
so testing them on a multi-controller nvme disk requires us to turn off the multipath module
parameter. In that way this parameter is very handy. My typical way of
toggling this parameter is:

1. disable multipath
# rmmod nvme
# rmmod nvme_core
# modprobe nvme_core multipath=0
# modprobe nvme

Now I can run all the nvme blktests which require us to disable multipath.

2. enable multipath:
# rmmod nvme
# rmmod nvme_core
# modprobe nvme_core multipath=1
# modprobe nvme

Now if we do away with the multipath parameter it means we have to re-compile the kernel
disabling the CONFIG_NVME_MULTIPATH option, and that's something we may want to avoid,
if possible.

Having said that, I'm not against this patch; however we certainly have users who would
be using the multipath parameter, and taking away this parameter may break those user
applications. So I'm wondering why it is not possible for Red Hat to let their customers
know that starting with RHEL-10, we don't support DMMP for NVMe devices even
though the multipath module parameter is present?

Thanks,
--Nilay
On 19/02/2025 16:47, Nilay Shroff wrote:
> [...]
>
> Now if we do away with the multipath parameter it means we have to re-compile the kernel
> disabling the CONFIG_NVME_MULTIPATH option, and that's something we may want to avoid,
> if possible.
>
> Having said that, I'm not against this patch; however we certainly have users who would
> be using the multipath parameter, and taking away this parameter may break those user
> applications.

Sounds like the user application is already broken. This, like any modparam, is not
guaranteed to stay, and we have been warning users for years that it is temporary and
will be removed.

> So I'm wondering why it is not possible for Red Hat to let their customers
> know that starting with RHEL-10, we don't support DMMP for NVMe devices even
> though the multipath module parameter is present?

I get the testing argument, but I don't think it's sufficient to keep exposing a user
facing modparam.

This discussion is not specific to RHEL, if there is a real use-case
that we are interested in supporting, we can change our minds and keep
it (and simply remove the log msg), but I haven't heard any real life
use-cases thus far.
On Thu, Feb 20, 2025 at 01:05:04PM +0200, Sagi Grimberg wrote:
> This discussion is not specific to RHEL, if there is a real use-case
> that we are interested in supporting, we can change our minds and keep
> it (and simply remove the log msg), but I haven't heard any real life
> use-cases thus far.

One use case: ublk.

Other use cases are manufacturing and debugging. Linux has been a great
environment for both, which don't want anything hidden behind virtual
devices.

The module parameter makes it possible to do this with your distro's
stock kernel that came with the CONFIG option enabled.

The device mapper multipath needed some layering violations out of the
driver to make failover work correctly/better. That's one reason it's
not supported here, and that's an appropriate place to draw the line on
what kinds of patches should be accepted.
On 2/20/25 17:47, Keith Busch wrote:
> On Thu, Feb 20, 2025 at 01:05:04PM +0200, Sagi Grimberg wrote:
>> This discussion is not specific to RHEL, if there is a real use-case
>> that we are interested in supporting, we can change our minds and keep
>> it (and simply remove the log msg), but I haven't heard any real life
>> use-cases thus far.
>
> One use case: ublk.
>
> Other use cases are manufacturing and debugging. Linux has been a great
> environment for both, which don't want anything hidden behind virtual
> devices.
>
> The module parameter makes it possible to do this with your distro's
> stock kernel that came with the CONFIG option enabled.
>
> The device mapper multipath needed some layering violations out of the
> driver to make failover work correctly/better. That's one reason it's
> not supported here, and that's an appropriate place to draw the line on
> what kinds of patches should be accepted.
>
Plus there are some NVMe devices out there which _despite_ being PCIe do
report NMIC and CMIC set (I won't name names, if you came across them
you'll know). This is causing stacking drivers (most notably MD) to behave
vastly differently on hotplug. Having the module option is an easy way of
debugging (and, in quite some cases, fixing) the issue.

If the module option really causes issues just make it read-only; that
way you can still set it if absolutely required, and at the same time
catch installations which try to modify it.

Cheers,

Hannes
--
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On 2/26/25 4:55 AM, Hannes Reinecke wrote:
> Plus there are some NVMe devices out there which _despite_ being PCIe do
> report NMIC and CMIC set (I won't name names, if you came across them
> you'll know). This is causing stacking drivers (most notably MD) to behave
> vastly differently on hotplug. Having the module option is an easy way of
> debugging (and, in quite some cases, fixing) the issue.

So some MD driver configurations that use PCIe devices that report CMIC and
NMIC need to turn off core.nvme_multipath. Is that what you're saying? Are
these PCIe devices multiported?

> If the module option really causes issues just make it read-only; that
> way you can still set it if absolutely required, and at the same time
> catch installations which try to modify it.

If we do that the parameter can't be used to support use cases like your MD
driver, or Keith's ublk driver.

/John
On Wed, Feb 26, 2025 at 10:55:21AM +0100, Hannes Reinecke wrote:
> Plus there are some NVMe devices out there which _despite_ being PCIe do
> report NMIC and CMIC set (I won't name names, if you came across them
> you'll know)

?????

NMIC and CMIC is perfectly normal and expected for multiported PCIe.
WTF are you talking about?
On Wed, Mar 05, 2025 at 03:15:54PM +0100, Christoph Hellwig wrote:
> On Wed, Feb 26, 2025 at 10:55:21AM +0100, Hannes Reinecke wrote:
>> Plus there are some NVMe devices out there which _despite_ being PCIe do
>> report NMIC and CMIC set (I won't name names, if you came across them
>> you'll know)
>
> ?????
>
> NMIC and CMIC is perfectly normal and expected for multiported PCIe.
> WTF are you talking about?

Obviously he's not talking about multiported PCIe. And he's right, the
behavior of a PCIe hot plug is very different and often undesirable when
it's under native multipath.
On Wed, Mar 05, 2025 at 08:17:59AM -0700, Keith Busch wrote:
>>> Plus there are some NVMe devices out there which _despite_ being PCIe do
>>> report NMIC and CMIC set (I won't name names, if you came across them
>>> you'll know)
>>
>> ?????
>>
>> NMIC and CMIC is perfectly normal and expected for multiported PCIe.
>> WTF are you talking about?
>
> Obviously he's not talking about multiported PCIe.

Why is that obvious? At least based on the stated words he talks about
PCIe and not about multi-port. The only non-multiported devices I've
seen that report NMIC and CMIC are a specific firmware so that the
customer would get multipath behavior, which is a great workaround for
unstable heavily switched fabrics. Note that multiported isn't always
obvious as there are quite a few hacks using lane splitting around that
a normal host can't really see.

> And he's right, the
> behavior of a PCIe hot plug is very different and often undesirable when
> it's under native multipath.

If you do actual hotplug and expect the device to go away it's indeed
not desirable. If you want the same device to come back after switched
fabric issues it is so desirable that people hack the devices to get it.
People talked about adding a queue_if_no_path-like parameter to control
keeping the multipath node alive a lot, but no one has ever invested
work into actually implementing it.
On Thu, Mar 06, 2025 at 12:51:19AM +0100, Christoph Hellwig wrote:
> On Wed, Mar 05, 2025 at 08:17:59AM -0700, Keith Busch wrote:
>> Obviously he's not talking about multiported PCIe.
>
> Why is that obvious?

No one here would think a multiported device *wouldn't* report CMIC. The
fact Hannes thinks that's a questionable feature for his device gives
away that it is single ported.

> At least based on the stated words he talks about
> PCIe and not about multi-port. The only non-multiported devices I've
> seen that report NMIC and CMIC are a specific firmware so that the
> customer would get multipath behavior, which is a great workaround for
> unstable heavily switched fabrics. Note that multiported isn't always
> obvious as there are quite a few hacks using lane splitting around that
> a normal host can't really see.

In my experience, it's left enabled because of SRIOV, which many of
these devices end up shipping without supporting in PCI space anyway.

>> And he's right, the
>> behavior of a PCIe hot plug is very different and often undesirable when
>> it's under native multipath.
>
> If you do actual hotplug and expect the device to go away it's indeed
> not desirable. If you want the same device to come back after switched
> fabric issues it is so desirable that people hack the devices to get it.
> People talked about adding a queue_if_no_path-like parameter to control
> keeping the multipath node alive a lot, but no one has ever invested
> work into actually implementing it.

Not quite the same thing, but kind of related: I proposed this device
missing debounce thing about a year ago:

https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/
On Wed, Mar 05, 2025 at 04:57:44PM -0700, Keith Busch wrote:
>>> Obviously he's not talking about multiported PCIe.
>>
>> Why is that obvious?
>
> No one here would think a multiported device *wouldn't* report CMIC.

I hope so.

> The
> fact Hannes thinks that's a questionable feature for his device gives
> away that it is single ported.

Well, his quote reads like he doesn't know about multiport PCIe devices.
But maybe he just meant to say "despite being single-ported".

>> At least based on the stated words he talks about
>> PCIe and not about multi-port. The only non-multiported devices I've
>> seen that report NMIC and CMIC are a specific firmware so that the
>> customer would get multipath behavior, which is a great workaround for
>> unstable heavily switched fabrics. Note that multiported isn't always
>> obvious as there are quite a few hacks using lane splitting around that
>> a normal host can't really see.
>
> In my experience, it's left enabled because of SRIOV, which many of
> these devices end up shipping without supporting in PCI space anyway.

If a device supports SR-IOV, setting CMIC and NMIC is correct, but I've
actually seen surprisingly few production controllers actually supporting
SR-IOV despite what the datasheets say.

>>> And he's right, the
>>> behavior of a PCIe hot plug is very different and often undesirable when
>>> it's under native multipath.
>>
>> If you do actual hotplug and expect the device to go away it's indeed
>> not desirable. If you want the same device to come back after switched
>> fabric issues it is so desirable that people hack the devices to get it.
>> People talked about adding a queue_if_no_path-like parameter to control
>> keeping the multipath node alive a lot, but no one has ever invested
>> work into actually implementing it.
>
> Not quite the same thing, but kind of related: I proposed this device
> missing debounce thing about a year ago:
>
> https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/

Yes, that somehow fell off the cliff.
On 3/6/25 01:03, Christoph Hellwig wrote:
> On Wed, Mar 05, 2025 at 04:57:44PM -0700, Keith Busch wrote:
>> No one here would think a multiported device *wouldn't* report CMIC.
>
> I hope so.
>
>> The
>> fact Hannes thinks that's a questionable feature for his device gives
>> away that it is single ported.
>
> Well, his quote reads like he doesn't know about multiport PCIe devices.
> But maybe he just meant to say "despite being single-ported".

Single ported.

There is a range of Samsung NVMe where one is a normal, single ported,
NVMe, and one with a nearly identical model number reporting CMIC.

Causing _quite_ a lot of confusion with the customer (and L3) when
used under MD, as for the first hotplug works, for the second ... not so
much.

Cheers,

Hannes
--
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Thu, Mar 06, 2025 at 08:12:03AM +0100, Hannes Reinecke wrote:
> Single ported.
>
> There is a range of Samsung NVMe where one is a normal, single ported,
> NVMe, and one with a nearly identical model number reporting CMIC.
>
> Causing _quite_ a lot of confusion with the customer (and L3) when
> used under MD, as for the first hotplug works, for the second ... not so
> much.

If the device is indeed entirely single ported and Samsung can confirm
the setting is incorrect and not easily fixable by a firmware update
we can quirk it.
On Thu, Mar 06, 2025 at 03:18:37PM +0100, Christoph Hellwig wrote:
> On Thu, Mar 06, 2025 at 08:12:03AM +0100, Hannes Reinecke wrote:
>> Single ported.
>>
>> There is a range of Samsung NVMe where one is a normal, single ported,
>> NVMe, and one with a nearly identical model number reporting CMIC.
>>
>> Causing _quite_ a lot of confusion with the customer (and L3) when
>> used under MD, as for the first hotplug works, for the second ... not so
>> much.
>
> If the device is indeed entirely single ported and Samsung can confirm
> the setting is incorrect and not easily fixable by a firmware update
> we can quirk it.

It's not one vendor or device. Making this a quirk is a good way to get
the mailing list spammed with requests to add yet another device to a
quirk table.

Or consider a true multiport PCIe where each port connects to a
different host. Each host sees a single port so they're not using
multipath capabilities, and the admin wants the MD behavior that removes
a disk on hot plug. Or even if one host sees both paths of a multiport
PCIe, they still might want that hot plug behavior. The module parameter
makes that possible, so some equivalent should be available before
removing it.
On Thu, Mar 06, 2025 at 08:01:19AM -0700, Keith Busch wrote:
>> If the device is indeed entirely single ported and Samsung can confirm
>> the setting is incorrect and not easily fixable by a firmware update
>> we can quirk it.
>
> It's not one vendor or device.

I've not seen it in the wild so it can't be that common.

> Or consider a true multiport PCIe where each port connects to a
> different host. Each host sees a single port so they're not using
> multipath capabilities, and the admin wants the MD behavior that removes
> a disk on hot plug. Or even if one host sees both paths of a multiport
> PCIe, they still might want that hot plug behavior. The module parameter
> makes that possible, so some equivalent should be available before
> removing it.

A module-wide parameter is absolutely the wrong way to configure it.
You'd at best want it per-controller or even per-namespace. One
tradeoff would be to disable the multipath code for private namespaces,
although that would cause problems when rescanning changes the flag.
On Thu, Mar 06, 2025 at 04:16:54PM +0100, Christoph Hellwig wrote:
> On Thu, Mar 06, 2025 at 08:01:19AM -0700, Keith Busch wrote:
>> Or consider a true multiport PCIe where each port connects to a
>> different host. Each host sees a single port so they're not using
>> multipath capabilities, and the admin wants the MD behavior that removes
>> a disk on hot plug. Or even if one host sees both paths of a multiport
>> PCIe, they still might want that hot plug behavior. The module parameter
>> makes that possible, so some equivalent should be available before
>> removing it.
>
> A module-wide parameter is absolutely the wrong way to configure it.
> You'd at best want it per-controller or even per-namespace. One
> tradeoff would be to disable the multipath code for private namespaces,
> although that would cause problems when rescanning changes the flag.

It's not really about private vs. shared namespaces, though. There
really is no programmatic way for the driver to know what behavior the
admin needs out of their system without user input. If you don't want a
module parameter, then the driver will just have to default to
something, then the user will have to do something to change it later.
Not very pleasant compared to a simple one time boot parameter.
On Thu, Mar 06, 2025 at 05:46:46PM -0700, Keith Busch wrote:
> It's not really about private vs. shared namespaces, though.

Part of it is about that. A private namespace can't have another path.

> There
> really is no programmatic way for the driver to know what behavior the
> admin needs out of their system without user input. If you don't want a
> module parameter, then the driver will just have to default to
> something, then the user will have to do something to change it later.
> Not very pleasant compared to a simple one time boot parameter.

The point is that different devices want different behavior. Think of a
fabrics attached array vs a usb4 dongle used by the admin.
On 3/7/25 6:16 AM, Keith Busch wrote:
> On Thu, Mar 06, 2025 at 04:16:54PM +0100, Christoph Hellwig wrote:
>> A module-wide parameter is absolutely the wrong way to configure it.
>> You'd at best want it per-controller or even per-namespace. One
>> tradeoff would be to disable the multipath code for private namespaces,
>> although that would cause problems when rescanning changes the flag.
>
> It's not really about private vs. shared namespaces, though. There
> really is no programmatic way for the driver to know what behavior the
> admin needs out of their system without user input. If you don't want a
> module parameter, then the driver will just have to default to
> something, then the user will have to do something to change it later.
> Not very pleasant compared to a simple one time boot parameter.

I think always creating the multipath head node, even for a disk which doesn't
have the CMIC/NMIC capability, should be useful. That way, we may then be able
to remove the multipath module parameter? In fact, you already mentioned this
in one of your previous messages. I see two approaches (one of them you
proposed and another one Christoph proposed:
https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/).

Maybe in a first cut we should always create the multipath head disk node for
single/multi ported NVMe disks. Later we may enhance it and allow pinning the
head node for hotplug events so that the head node dev name remains consistent
across disk add/remove hotplug events.

Thanks,
--Nilay
On Fri, Mar 07, 2025 at 08:49:09PM +0530, Nilay Shroff wrote:
> I think always creating the multipath head node, even for a disk which doesn't
> have the CMIC/NMIC capability, should be useful. That way, we may then be able
> to remove the multipath module parameter? In fact, you already mentioned this
> in one of your previous messages. I see two approaches (one of them you
> proposed and another one Christoph proposed:
> https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/).
>
> Maybe in a first cut we should always create the multipath head disk node for
> single/multi ported NVMe disks. Later we may enhance it and allow pinning the
> head node for hotplug events so that the head node dev name remains consistent
> across disk add/remove hotplug events.

It honestly has potential to solve some real problems, like
re-enumeration triggered by a link reset on an in-use drive. You'd
currently need to close the old handle and open a new one, even though
it's the same device. It may not even be possible to do that if that
device contains your root partition, and then you can only power cycle.

The downside is we wouldn't get the short cut to blk_mq_submit_bio. We'd
instead stack that atop an indirect call, so it's not free.
On 3/7/25 9:13 PM, Keith Busch wrote:
> It honestly has potential to solve some real problems, like
> re-enumeration triggered by a link reset on an in-use drive. You'd
> currently need to close the old handle and open a new one, even though
> it's the same device. It may not even be possible to do that if that
> device contains your root partition, and then you can only power cycle.
>
> The downside is we wouldn't get the short cut to blk_mq_submit_bio. We'd
> instead stack that atop an indirect call, so it's not free.

Yes agreed, however it seems the advantages of using an indirect call outweigh
using the short cut to blk_mq_submit_bio. Moreover it seems the cost of the
indirect call is trivial because we already cache the nexthop.

I integrated your proposed patch (with a few trivial additional changes on top)
and I see that it's coming out nicely. I ran a few tests and confirmed it's
working well. However, in the proposed patch we *always* delay (~10 sec) the
removal of the multipath head node. That means that even while removing the
nvme module (rmmod nvme), or if the user deletes/detaches the namespace, we delay
the removal of the head node, but that may not be what we want. So I'd suggest
instead that delayed removal of the multipath head node shall be configurable
using a sysfs attribute. With this attribute we shall let the user opt for pinning
the head node (with an optional delay time as well?). And it's only when the user
shows the intent to pin the node that we should delay its removal. This is exactly
what (pinning of the head node) Christoph's proposed patch implements. So I'd
suggest a bit of an amalgamation of your patch as well as Christoph's to implement
this change.

If you and Christoph are busy with other work then in that case I'll be
glad to pursue this further if you agree.

Thanks,
--Nilay
On 3/9/25 1:23 PM, Nilay Shroff wrote:
>> It honestly has potential to solve some real problems, like
>> re-enumeration triggered by a link reset on an in-use drive. You'd
>> currently need to close the old handle and open a new one, even though
>> it's the same device. It may not even be possible to do that if that
>> device contains your root partition, and then you can only power cycle.
>>
>> The downside is we wouldn't get the short cut to blk_mq_submit_bio. We'd
>> instead stack that atop an indirect call, so it's not free.
>
> Yes agreed, however it seems the advantages of using an indirect call outweigh
> using the short cut to blk_mq_submit_bio. Moreover it seems the cost of the
> indirect call is trivial because we already cache the nexthop.
>
> I integrated your proposed patch (with a few trivial additional changes on top)
> and I see that it's coming out nicely. I ran a few tests and confirmed it's
> working well. However, in the proposed patch we *always* delay (~10 sec) the
> removal of the multipath head node.

Have you tested this with an NVMe-oF controller... yet?

Where did the number 10 seconds come from?

> That means that even while removing the
> nvme module (rmmod nvme), or if the user deletes/detaches the namespace, we delay
> the removal of the head node, but that may not be what we want. So I'd suggest
> instead that delayed removal of the multipath head node shall be configurable
> using a sysfs attribute. With this attribute we shall let the user opt for pinning
> the head node (with an optional delay time as well?).

So be aware that TP-4129 is adding a CQT parameter which does almost exactly this.

> And it's only when the user
> shows the intent to pin the node that we should delay its removal. This is exactly
> what (pinning of the head node) Christoph's proposed patch implements. So I'd
> suggest a bit of an amalgamation of your patch as well as Christoph's to implement
> this change.

Please cc: me on your patches Nilay, I'd like to test them with my NVMe-oF testbed.

/John
On 3/12/25 9:17 AM, John Meneghini wrote:
> On 3/9/25 1:23 PM, Nilay Shroff wrote:
>> I integrated your proposed patch (with a few trivial additional changes on top)
>> and I see that it's coming out nicely. I ran a few tests and confirmed it's
>> working well. However, in the proposed patch we *always* delay (~10 sec) the
>> removal of the multipath head node.
>
> Have you tested this with an NVMe-oF controller... yet?

Not on a real target. But I tested it against blktests, which has a few NVMe-oF
test cases, though it uses the loopback interface.

> Where did the number 10 seconds come from?

That was probably used as a hard coded value for the POC. However, we shall be
able to configure that.

> So be aware that TP-4129 is adding a CQT parameter which does almost exactly this.
>
> Please cc: me on your patches Nilay, I'd like to test them with my NVMe-oF testbed.

Sure, I will keep you in Cc when I send the patch.

Thanks,
--Nilay
On Sun, Mar 09, 2025 at 10:53:19PM +0530, Nilay Shroff wrote:
> Yes agreed, however it seems the advantages of using an indirect call outweigh
> using the short cut to blk_mq_submit_bio. Moreover it seems the cost of the
> indirect call is trivial because we already cache the nexthop.

Indirect calls are never cheap, unfortunately.

> If you and Christoph are busy with other work then in that case I'll be
> glad to pursue this further if you agree.

I'm not working in this area at the moment, so from my POV feel free to go ahead.
On Thu, Mar 06, 2025 at 01:03:48AM +0100, Christoph Hellwig wrote:
> On Wed, Mar 05, 2025 at 04:57:44PM -0700, Keith Busch wrote:
> > In my experience, it's left enabled because of SRIOV, which many of
> > these devices end up shipping without supporting in PCI space anyway.
>
> If a device supports SR-IO setting CMIC and NMIC is corret, but I've
> actually seen surprisingly few production controllers actually supporting
> SR-IOV despite what the datasheets say.
And I bet at least some of those devices left their {C,N}MIC bits on
despite not even supporting the virtual functions in their shipping
product.
> > Not quite the same thing, but kind of related: I proposed this device
> > missing debounce thing about a year ago:
> >
> > https://lore.kernel.org/linux-nvme/Y+1aKcQgbskA2tra@kbusch-mbp.dhcp.thefacebook.com/
>
> Yes, that somehow fell off the cliff.
I'll be happy to revive it, but I don't know how much time I can
dedicate to seeing it across the line.
On 2/18/25 12:15 PM, Keith Busch wrote:
> On Tue, Feb 18, 2025 at 11:31:58AM -0500, John Meneghini wrote:
>> On 2/18/25 10:06 AM, Keith Busch wrote:
>>> On Thu, Feb 13, 2025 at 03:37:28PM -0500, John Meneghini wrote:
>>>> Keith, Christoph and Sagi,
>>>>
>>>> This patch has been fully tested and analyzed by Red Hat's QA group and no
>>>> unexpected side effects or regressions have been found. Both NVMe/FC and NVMe/TCP
>>>> have been tested. Our QE engineer has asked me to report this upstream.
>>>
>>> What's the harm in leaving the parameter? *I* use it so I can test both
>>> ways without needing to compile multiple kernels.
>>
>> LOL. We've been talking about this since 2017. The goal has always been to remove support for DMMP with NVMe.
>
> I understand that disabling nvme native mp is required for device
> mapper, and I get you want to prevent the possibility of anyone using
> dm-mp with nvme, but that isn't the only user that wants to see all
> namespace paths.
OK, maybe you have a use case in mind. I'll assume you do: some
applications want to disable native nvme multipathing and to see
all namespace paths in user space, and they are using this parameter
to do it. These are ostensibly user space MP applications.
So what happens when one of these user space MP applications needs
a change or a bug fix in the kernel? Are those patches being
merged into the kernel under different auspices... is it just
DMMP that we don't want to enable support for?
>> We want to remove this parameter because it is causing confusion with users and customers who keep trying to use
>> DMMP with their multipath NVMe devices.
>>
>> We want to remove this parameter because:
>>
>> 1) the upstream kernel does not support multipath nvme devices without CONFIG_NVME_MULTIPATH enabled
>
> What do you mean by "support"? I assume you mean no one upstream will
> help you debug your problems if you've set yourself up that way, and
> that's probably true. The kernel currently doesn't stop you from doing
> this though, so it's supported in that sense. Some people are fine doing
> this on their own, they're not seeking upstream help. Changing this
What I mean by "no support" is: Red Hat has proposed multiple patches that were needed
to support nvme_core.multipath=N with DMMP. Those patches have all been NAKed because
"we don't support anything but native nvme multipath with NVMe".
And we are perfectly capable of debugging and supporting all of this w/out any upstream
help. The fact is we had to patch nvme/host/core.c and multipath.c to support DMMP with
nvme_core.multipath=N, and we've carried those out-of-tree patches in RHEL for almost 6
years because they were NAKed upstream.
> would break userspace because it makes the driver fail to create device
> nodes that used to show up.
User space has already been broken. I think we crossed that bridge long ago. Long before
multipath.c was implemented people were using user space MP applications (like DMMP) to support
nvme devices with multiple paths. We took that away as a supported feature a long time ago.
The reason I am saying this is because Red Hat wants to support DMMP with NVMe. We have many customers
who are very upset and unhappy that we are taking this away. Red Hat would much rather use DMMP
with NVMe than not, and we have been working diligently to close the feature gap between native
nvme multipath and DMMP multipath (e.g. the queue-depth patches).
>> 2) the upstream kernel does not support multipath nvme devices when nvme_core.multipath is set to N
>> 3) Non-multipath nvme devices are supported just fine when nvme_core.multipath is set to Y
>>
>> You don't need to set nvme_core.multipath to N to test your devices both ways w/o recompiling the kernel.
>> All of the code paths involved here are controlled by NVME_CTRL_CMIC_MULTI_CTRL, and setting nvme_core.multipath
>> to N doesn't do anything to help your single path NVMe devices. It doesn't remove multipath.c, reduce the code
>> path length or do anything else to optimize your non-NVME_CTRL_CMIC_MULTI_CTRL devices. All it does is provide
>> an escape hatch that disables the in-core multipath scheduler and starts creating multiple /dev/nvme%dn%d entries so
>> that DMMP can be used with multipath capable NVMe devices.
>>
>> Personally, I'd like to remove CONFIG_NVME_MULTIPATH as well. It's just another source of confusion. Most users
>> are running Linux with the default settings for NVME_MULTIPATH. This is what Red Hat customers use and that's
>> what's used upstream. So what's the point?
>
> There are devices that report CMIC and NMIC despite being single path,
> perhaps as some vestigial sr-iov feature. That adds an unnecessary layer
> for all IO to go through. Having the param makes it easy to test both
> possible driver paths. In production though, I'd expect to just disable
> the CONFIG option if that's the behavior someone wants, so I think the
> config option ought to stay.
I get it... that's fine... I have no problem if we want to keep support for CONFIG_NVME_MULTIPATH=N.
That's what we originally agreed to, and it was my mistake to send a patch that tried to remove it.
As it turns out, user space MP may be supportable with CONFIG_NVME_MULTIPATH either on or off.
Maybe we should just remove all of these dev_warnings from the code and let the users do what
they want. That's what they are going to do anyways. Just don't tell me we won't take patches
to support my user space MP application and not yours.
I am not trying to be a jerk here. Seriously. It's just that our duplicity about this subject
upstream is a little too much. We have these configurable options and parameters... if we don't
want to support them all then let's take them out. People who want continued support for user
space MP with nvme_core.multipath=N can revert *this patch* in their own out-of-tree repository.
This all comes down to who gets to carry the out-of-tree patch. If you don't want to take this patch,
then please take our DMMP multipath support patches. They are not big or invasive, they haven't
changed in years, and they are all controlled by nvme_core.multipath=N. My customers will thank
you if you do.
I just need the upstream maintainers to make up their mind. This is a change we've talked about
for years. I thought you'd want this change. I thought this would be a final end of the
"only native nvme multipath is supported" debate.
/John
On Tue, Feb 18, 2025 at 06:06:05PM -0500, John Meneghini wrote:
> OK, maybe you have a use case in mind. I'll assume you do: some
> applications want to disable native nvme multipathing and to see
> all namespace paths in user space, and they are using this parameter
> to do it. These are ostensibly user space MP applications.

You can have a ublk frontend device with individual namespace paths on
the backend, and let ublkserver do whatever it wants with them.

> So what happens when one of these user space MP applications needs
> a change or a bug fix in the kernel? Are those patches being
> merged into the kernel under different auspices... is it just
> DMMP that we don't want to enable support for?

I think patches exporting driver private information are a pretty clear
indicator it's for stacking drivers.
On 13/02/2025 22:37, John Meneghini wrote:
> Keith, Christoph and Sagi,
>
> This patch has been fully tested and analyzed by Red Hat's QA group
> and no
> unexpected side effects or regressions have been found. Both NVMe/FC
> and NVMe/TCP
> have been tested. Our QE engineer has asked me to report this upstream.
>
> Tested-by: Marco Patalano <mpatalan@redhat.com>
>
> Fixes: 32acab3181c7 ("nvme: implement multipath access to nvme
> subsystems")
>
> As discussed in previous email threads, the nvme_core.multipath
> parameter was included
> by Christoph in the original implementation of nvme/host/multipath.c
> back in 2017
> at the request of Red Hat. At that time Red Hat was only supporting
> DMMP multipath
> with NVMe and we needed a way to disable core native nvme multipathing
> without
> reconfiguring the kernel.
>
> The nvme_core.multipath parameter has been used by Red Hat, together
> with some additional
> out-of-tree changes to nvme/host/core.c, to support DMMP multipath
> with NVMe since RHEL-8.
> However, the plan all along has been to deprecate and remove support
> for DMMP multipath with NVMe
> in RHEL and to remove this parameter. This change has been advertised
> and communicated to our
> customers and partners, beginning with RHEL-9, through release notes
> and other communications.
>
> In RHEL-9 we changed the default setting of this parameter from "N" to
> "Y" to match the upstream
> kernel and to move our customers to native nvme multipath. DMMP
> multipath w/NVMe was deprecated
> in RHEL-9 but still supported. Customers were still able to optionally
> enable DMMP multipath
> with NVMe if they wanted, by changing this parameter, and many of them
> do.
>
> In the (soon to be released) RHEL-10 release DMMP multipath is no longer
> supported with NVMe and
> the out-of-tree patches needed to support DMMP multipath with NVMe
> have been removed.
>
> Red Hat is proposing this patch to remove the nvme_core.multipath
> parameter from the kernel as we believe
> leaving this non-supported parameter in the kernel for future releases
> will only lead to confusion.
>
> We plan to ship this patch with RHEL-10. So it would be really good if
> we could get this
> change accepted and merged upstream, perhaps into v6.15.
Hey John,
This looks fine to me, I'm assuming this was also tested with
CONFIG_NVME_MULTIPATH=n ?
On 2/17/25 3:08 AM, Sagi Grimberg wrote:
>>
>> We plan to ship this patch with RHEL-10. So it would be really good if we could get this
>> change accepted and merged upstream, perhaps into v6.15.
>
> Hey John,
>
> This looks fine to me, I'm assuming this was also tested with CONFIG_NVME_MULTIPATH=n ?

Yes, everything has been tested with CONFIG_NVME_MULTIPATH both enabled (Y) and disabled (N).

As we discussed in the previous email thread[1], there is an anomaly seen when you build a kernel
with CONFIG_NVME_MULTIPATH=n and the host discovers a multipath capable nvme device (CMIC/NMIC=1).
You will see exactly the same thing that you do with CONFIG_NVME_MULTIPATH=y when the
nvme_core.multipath parameter is N: a separate /dev/nvmeNN entry for every namespace/controller path.

We can send a separate patch to address that problem, but this patch, which simply removes the
nvme_core.multipath parameter, has been fully tested and is ready to go.

[1] https://lore.kernel.org/linux-nvme/58519c4e-5801-4481-9087-be4f19b218f7@redhat.com/

Please approve/merge this patch.

/John
On 17/02/2025 18:14, John Meneghini wrote:
>
> On 2/17/25 3:08 AM, Sagi Grimberg wrote:
>>>
>>> We plan to ship this patch with RHEL-10. So it would be really good
>>> if we could get this
>>> change accepted and merged upstream, perhaps into v6.15.
>>
>> Hey John,
>>
>> This looks fine to me, I'm assuming this was also tested with
>> CONFIG_NVME_MULTIPATH=n ?
>
> Yes, everything has been tested with CONFIG_NVME_MULTIPATH both
> enabled (Y) and disabled (N).
>
> As we discussed in the previous email thread[1], there is an anomaly
> seen when you build a kernel with CONFIG_NVME_MULTIPATH=n
> and the host discovers a multipath capable nvme device (CMIC/NMIC=1).
> You will see exactly the same thing that you do with
> CONFIG_NVME_MULTIPATH=y when the nvme_core.multipath parameter is N:
> a separate /dev/nvmeNN entry for
> every namespace/controller path.
>
> We can send a separate patch to address that problem, but this
> patch, which simply removes the nvme_core.multipath parameter,
> has been fully tested and is ready to go.

I think that we want to print a warning in this case though. Can you
resubmit with logging a warning in this case?
On 2/18/25 3:19 AM, Sagi Grimberg wrote:
>> We can send send a separate patch to address that problem, but this patch, which simply removes the nvme_core.multipath parameter
>> has beeen fully tested and is ready to go.
>
> I think that we want to print a warning in this case though. Can you resubmit with logging a warning in this case?
Agreed, but I was thinking that warning should go into the second patch. The second patch should disable all secondary paths when
CONFIG_NVME_MULTIPATH=n and a namespace is discovered with NMIC enabled. Basically, we don't want to instantiate more than one
device node per namespace when CONFIG_NVME_MULTIPATH=n.
But if you want to add a warning to this patch, we could do something like this:
@@ -3909,16 +3909,23 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 		disk->flags |= GENHD_FL_HIDDEN;
 	} else {
 #ifdef CONFIG_NVME_MULTIPATH
 		sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
 			ns->head->instance);
 #else
 		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
 			ns->head->instance);
+		if (info->is_shared) {
+			dev_warn(ctrl->device,
+				"Found shared namespace %d, but multipathing not supported.\n",
+				info->nsid);
+			dev_warn_once(ctrl->device,
+				"Shared namespaces without CONFIG_NVME_MULTIPATH are not supported.\n");
+		}
 #endif
 	}

 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
On 2/18/25 9:05 AM, John Meneghini wrote:
> + if (info->is_shared) {
> + dev_warn(ctrl->device,
> + "Found shared namespace %d but multipathing not supported.\n",
> + info->nsid);
Or maybe this should be
+		if ((ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) && info->is_shared) {
+			dev_warn(ctrl->device,
+				"Found shared namespace %d, but multipathing not supported.\n",
+				info->nsid);
/John
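[Editor's note: folding the two suggestions above together, the !CONFIG_NVME_MULTIPATH branch of nvme_alloc_ns() might end up looking roughly like the sketch below (with the dev_warn_once spelling and the missing semicolon from the earlier hunk fixed). This is only an illustration of the combined proposal, not a tested or posted patch.]

	} else {
#ifdef CONFIG_NVME_MULTIPATH
		sprintf(disk->disk_name, "nvme%dn%d", ctrl->subsys->instance,
			ns->head->instance);
#else
		sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance,
			ns->head->instance);
		/* warn only when the controller really claims multi-path support */
		if ((ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) &&
		    info->is_shared) {
			dev_warn(ctrl->device,
				"Found shared namespace %d, but multipathing not supported.\n",
				info->nsid);
			dev_warn_once(ctrl->device,
				"Shared namespaces without CONFIG_NVME_MULTIPATH are not supported.\n");
		}
#endif
	}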