hw/nvme: Issue with multiple controllers behind a subsystem

Klaus Jensen posted 9 patches 3 months, 3 weeks ago
hw/nvme: Issue with multiple controllers behind a subsystem
Posted by alan.adamson@oracle.com 4 days, 7 hours ago
I'm running into an issue with the latest qemu-nvme with v10.0.0-rc2 with 
regards to multiple controllers behind a subsystem.  When I setup a 
subsystem with 2 controllers, each with a private/non-shared namespace, 
the two private/non-shared namespaces all get attached to one of the 
controllers.

I'm sending out diffs that resolve the problem but would like to get 
some feedback before sending a formal patch.

See below.

Thanks,

Alan Adamson




[root@localhost qemu-subsys]# git describe
v10.0.0-rc2
[root@localhost qemu-subsys]#


QEMU NVMe Config
================
-device nvme-subsys,id=subsys0 \
-device 
nvme,serial=deadbeef,id=nvme0,subsys=subsys0,atomic.dn=off,atomic.awun=31,atomic.awupf=15 
\
-device 
nvme,serial=deadbeef,id=nvme1,subsys=subsys0,atomic.dn=off,atomic.awun=127,atomic.awupf=63 
\
-drive id=ns1,file=/dev/nullb3,if=none \
-drive id=ns2,file=/dev/nullb2,if=none \
-drive id=ns3,file=/dev/nullb1,if=none \
-device nvme-ns,drive=ns1,bus=nvme0,nsid=1,shared=false \
-device nvme-ns,drive=ns2,bus=nvme1,nsid=2,shared=false \
-device nvme-ns,drive=ns3,bus=nvme1,nsid=3,shared=true  \

1 Subsystem (subsys0)
     2 Controllers (nvme0/nvme1)

nvme0
     private namespace (nsid=1)
     shared namespace (nsid=3)
nvme1
     private namespace (nsid=2)
     shared namespace (nsid=3)


What Linux is seeing
====================
[root@localhost ~]# nvme list -v
Subsystem Subsystem-NQN Controllers
---------------- 
------------------------------------------------------------------------------------------------ 
----------------
nvme-subsys1 nqn.2019-08.org.qemu:subsys0 nvme0, nvme1

Device           Cntlid SN MN                                       
FR       TxPort Address        Slot   Subsystem    Namespaces
---------------- ------ -------------------- 
---------------------------------------- -------- ------ -------------- 
------ ------------ ----------------
nvme0    0      deadbeef             QEMU NVMe 
Ctrl                           9.2.92   pcie   0000:00:04.0   4 
nvme-subsys1 nvme1n1, nvme1n2, nvme1n3
nvme1    1      deadbeef             QEMU NVMe 
Ctrl                           9.2.92   pcie   0000:00:05.0   5 
nvme-subsys1 nvme1n1

Device            Generic           NSID Usage                      
Format           Controllers
----------------- ----------------- ---------- 
-------------------------- ---------------- ----------------
/dev/nvme1n1 /dev/ng1n1   0x3        268.44  GB / 268.44  GB 512   B +  
0 B   nvme0, nvme1
/dev/nvme1n2 /dev/ng1n2   0x2        268.44  GB / 268.44  GB 512   B +  
0 B   nvme0
/dev/nvme1n3 /dev/ng1n3   0x1        268.44  GB / 268.44  GB 512   B +  
0 B   nvme0

[root@localhost ~]# nvme id-ctrl /dev/nvme1n2 |  grep awupf
awupf     : 15
[root@localhost ~]# nvme id-ctrl /dev/nvme1n3 |  grep awupf
awupf     : 15
[root@localhost ~]#

===================
Both private namespaces are being attached to the same controller (nvme0)


Fix
===
Need to keep track of the controller that registers a namespace with the 
subsystem
and only allow other controllers to attach to the namespace if it is shared.
Non-shared namespaces can only be attached to the controller that 
registers them.

Proposal
========
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 518d02dc6670..883d8d4722fd 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5063,7 +5063,8 @@ static uint16_t nvme_endgrp_info(NvmeCtrl *n, 
uint8_t rae, uint32_t buf_len,
      }

      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
+        NvmeNamespace *ns = nvme_subsys_ns(n, i);
+
          if (!ns) {
              continue;
          }
@@ -5700,7 +5701,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
      ns = nvme_ns(n, nsid);
      if (unlikely(!ns)) {
          if (!active) {
-            ns = nvme_subsys_ns(n->subsys, nsid);
+            ns = nvme_subsys_ns(n, nsid);
              if (!ns) {
[root@localhost qemu-subsys]# git log -p 
f4b9f10a80c30f1d6b9850d476eb8226bda3f553 > /tmp/p
^C
[root@localhost qemu-subsys]# vi /tmp/p
[root@localhost qemu-subsys]# cat /tmp/p
commit f4b9f10a80c30f1d6b9850d476eb8226bda3f553
Author: Alan Adamson <alan.adamson@oracle.com>
Date:   2025-04-03 14:10:30 -0400

     subsys multi controller shared fix

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index 518d02dc6670..883d8d4722fd 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -5063,7 +5063,8 @@ static uint16_t nvme_endgrp_info(NvmeCtrl *n, 
uint8_t rae, uint32_t buf_len,
      }

      for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
+        NvmeNamespace *ns = nvme_subsys_ns(n, i);
+
          if (!ns) {
              continue;
          }
@@ -5700,7 +5701,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
NvmeRequest *req, bool active)
      ns = nvme_ns(n, nsid);
      if (unlikely(!ns)) {
          if (!active) {
-            ns = nvme_subsys_ns(n->subsys, nsid);
+            ns = nvme_subsys_ns(n, nsid);
              if (!ns) {
                  return nvme_rpt_empty_id_struct(n, req);
              }
@@ -5739,7 +5740,7 @@ static uint16_t nvme_identify_ctrl_list(NvmeCtrl 
*n, NvmeRequest *req,
              return NVME_INVALID_FIELD | NVME_DNR;
          }

-        ns = nvme_subsys_ns(n->subsys, nsid);
+        ns = nvme_subsys_ns(n, nsid);
          if (!ns) {
              return NVME_INVALID_FIELD | NVME_DNR;
          }
@@ -5809,7 +5810,7 @@ static uint16_t nvme_identify_ns_ind(NvmeCtrl *n, 
NvmeRequest *req, bool alloc)
      ns = nvme_ns(n, nsid);
      if (unlikely(!ns)) {
          if (alloc) {
-            ns = nvme_subsys_ns(n->subsys, nsid);
+            ns = nvme_subsys_ns(n, nsid);
              if (!ns) {
                  return nvme_rpt_empty_id_struct(n, req);
              }
@@ -5837,7 +5838,7 @@ static uint16_t nvme_identify_ns_csi(NvmeCtrl *n, 
NvmeRequest *req,
      ns = nvme_ns(n, nsid);
      if (unlikely(!ns)) {
          if (!active) {
-            ns = nvme_subsys_ns(n->subsys, nsid);
+            ns = nvme_subsys_ns(n, nsid);
              if (!ns) {
                  return nvme_rpt_empty_id_struct(n, req);
              }
@@ -5884,7 +5885,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req,
          ns = nvme_ns(n, i);
          if (!ns) {
              if (!active) {
-                ns = nvme_subsys_ns(n->subsys, i);
+                ns = nvme_subsys_ns(n, i);
                  if (!ns) {
                      continue;
                  }
@@ -5932,7 +5933,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl 
*n, NvmeRequest *req,
          ns = nvme_ns(n, i);
          if (!ns) {
              if (!active) {
-                ns = nvme_subsys_ns(n->subsys, i);
+                ns = nvme_subsys_ns(n, i);
                  if (!ns) {
                      continue;
                  }
@@ -6793,7 +6794,7 @@ static uint16_t nvme_ns_attachment(NvmeCtrl *n, 
NvmeRequest *req)
          return NVME_INVALID_NSID | NVME_DNR;
      }

-    ns = nvme_subsys_ns(n->subsys, nsid);
+    ns = nvme_subsys_ns(n, nsid);
      if (!ns) {
          return NVME_INVALID_FIELD | NVME_DNR;
      }
@@ -7751,17 +7752,23 @@ static int nvme_start_ctrl(NvmeCtrl *n)

      nvme_set_timestamp(n, 0ULL);

-    /* verify that the command sets of attached namespaces are supported */
-    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
+    if (n->subsys) {
+        for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
+        NvmeNamespace *ns = n->subsys->namespaces[i].namespace;

-        if (ns && nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
-            if (!ns->attached || ns->params.shared) {
-                nvme_attach_ns(n, ns);
+        if (!ns) {
+                continue;
              }
+        if (!(n->subsys->namespaces[i].ctrl == n) && !ns->params.shared) {
+                continue;
+            }
+        if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
+                if (!ns->attached || ns->params.shared) {
+            nvme_attach_ns(n, ns);
+        }
+            }
          }
      }
-
      nvme_update_dsm_limits(n, NULL);

      return 0;
@@ -8993,7 +9000,8 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
**errp)
              return;
          }

-        n->subsys->namespaces[ns->params.nsid] = ns;
+        n->subsys->namespaces[ns->params.nsid].namespace = ns;
+        n->subsys->namespaces[ns->params.nsid].ctrl = n;
      }
  }

diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index 98c1e75a5d29..9cca699b354f 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -742,7 +742,7 @@ static void nvme_ns_realize(DeviceState *dev, Error 
**errp)

      if (!nsid) {
          for (i = 1; i <= NVME_MAX_NAMESPACES; i++) {
-            if (nvme_subsys_ns(subsys, i)) {
+            if (nvme_subsys_ns(n, i)) {
                  continue;
              }

@@ -754,12 +754,13 @@ static void nvme_ns_realize(DeviceState *dev, 
Error **errp)
              error_setg(errp, "no free namespace id");
              return;
          }
-    } else if (nvme_subsys_ns(subsys, nsid)) {
+    } else if (nvme_subsys_ns(n, nsid)) {
          error_setg(errp, "namespace id '%d' already allocated", nsid);
          return;
      }

-    subsys->namespaces[nsid] = ns;
+    subsys->namespaces[nsid].namespace = ns;
+    subsys->namespaces[nsid].ctrl = n;

      ns->id_ns.endgid = cpu_to_le16(0x1);
      ns->id_ns_ind.endgrpid = cpu_to_le16(0x1);
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 6f782ba18826..bea3b96a6dfa 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -97,6 +97,11 @@ typedef struct NvmeEnduranceGroup {
      } fdp;
  } NvmeEnduranceGroup;

+typedef struct Namespaces {
+    NvmeCtrl           *ctrl;
+    NvmeNamespace      *namespace;
+} Namespaces;
+
  typedef struct NvmeSubsystem {
      DeviceState parent_obj;
      NvmeBus     bus;
@@ -104,7 +109,7 @@ typedef struct NvmeSubsystem {
      char        *serial;

      NvmeCtrl           *ctrls[NVME_MAX_CONTROLLERS];
-    NvmeNamespace      *namespaces[NVME_MAX_NAMESPACES + 1];
+    Namespaces         namespaces[NVME_MAX_NAMESPACES + 1];
      NvmeEnduranceGroup endgrp;

      struct {
@@ -136,16 +141,6 @@ static inline NvmeCtrl 
*nvme_subsys_ctrl(NvmeSubsystem *subsys,
      return subsys->ctrls[cntlid];
  }

-static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
-                                            uint32_t nsid)
-{
-    if (!subsys || !nsid || nsid > NVME_MAX_NAMESPACES) {
-        return NULL;
-    }
-
-    return subsys->namespaces[nsid];
-}
-
  #define TYPE_NVME_NS "nvme-ns"
  #define NVME_NS(obj) \
      OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
@@ -711,6 +706,14 @@ static inline NvmeSecCtrlEntry 
*nvme_sctrl_for_cntlid(NvmeCtrl *n,
      return NULL;
  }

+static inline NvmeNamespace *nvme_subsys_ns(NvmeCtrl *n, uint32_t nsid)
+{
+    if (!n->subsys || !nsid || nsid > NVME_MAX_NAMESPACES) {
+        return NULL;
+    }
+    return n->subsys->namespaces[nsid].namespace;
+}
+
  void nvme_attach_ns(NvmeCtrl *n, NvmeNamespace *ns);
  uint16_t nvme_bounce_data(NvmeCtrl *n, void *ptr, uint32_t len,
                            NvmeTxDirection dir, NvmeRequest *req);
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 2ae56f12a596..d5751564c05c 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -92,13 +92,19 @@ int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)

      subsys->ctrls[cntlid] = n;

-    for (nsid = 1; nsid < ARRAY_SIZE(subsys->namespaces); nsid++) {
-        NvmeNamespace *ns = subsys->namespaces[nsid];
-        if (ns && ns->params.shared && !ns->params.detached) {
-            nvme_attach_ns(n, ns);
+    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
+        NvmeNamespace *ns = subsys->namespaces[nsid].namespace;
+
+    if (!ns) {
+            continue;
+        }
+        if (!(subsys->namespaces[nsid].ctrl == n) && !ns->params.shared) {
+            continue;
          }
+    if (ns->params.shared && !ns->params.detached) {
+            nvme_attach_ns(n, ns);
+    }
      }
-
      return cntlid;
  }


With this fix:
==============
[root@localhost ~]# nvme list -v
Subsystem Subsystem-NQN Controllers
---------------- 
------------------------------------------------------------------------------------------------ 
----------------
nvme-subsys1 nqn.2019-08.org.qemu:subsys0 nvme0, nvme1

Device           Cntlid SN MN                                       
FR       TxPort Address        Slot   Subsystem    Namespaces
---------------- ------ -------------------- 
---------------------------------------- -------- ------ -------------- 
------ ------------ ----------------
nvme0    0      deadbeef             QEMU NVMe 
Ctrl                           9.2.92   pcie   0000:00:04.0   4 
nvme-subsys1 nvme1n2, nvme1n3
nvme1    1      deadbeef             QEMU NVMe 
Ctrl                           9.2.92   pcie   0000:00:05.0   5 
nvme-subsys1 nvme1n1, nvme1n3

Device            Generic           NSID Usage                      
Format           Controllers
----------------- ----------------- ---------- 
-------------------------- ---------------- ----------------
/dev/nvme1n1 /dev/ng1n1   0x2        268.44  GB / 268.44  GB 512   B +  
0 B   nvme1
/dev/nvme1n2 /dev/ng1n2   0x1        268.44  GB / 268.44  GB 512   B +  
0 B   nvme0
/dev/nvme1n3 /dev/ng1n3   0x3        268.44  GB / 268.44  GB 512   B +  
0 B   nvme0, nvme1

[root@localhost ~]# nvme id-ctrl /dev/nvme1n1 |  grep awupf
awupf     : 63
[root@localhost ~]# nvme id-ctrl /dev/nvme1n2 |  grep awupf
awupf     : 15
[root@localhost ~]#

Each private namespace is attached to its own controller and that is 
verified with the AWUPF values.




Re: hw/nvme: Issue with multiple controllers behind a subsystem
Posted by Klaus Jensen 14 hours ago
On Apr  4 10:52, alan.adamson@oracle.com wrote:
> I'm running into a issue with the latest qemu-nvme with v10.0.0-rc2 with
> regards to multiple controllers behind a subsystem.  When I setup a
> subsystem with 2 controllers, each with a private/non-shared namespace, the
> two private/non-shared namespaces all get attached to one of the
> controllers.
> 
> I'm sending out diffs that resolve the problem but would like to get some
> feedback before sending a formal patch.
> 

Hi Alan,

Thanks for reporting this! This is definitely a regression caused by the
csi refactoring I did.

Few comments below, but I'd like to try to get this into 10.0. Timeline
is tight, so I'll send out a patch with my suggestions below.

> @@ -7751,17 +7752,23 @@ static int nvme_start_ctrl(NvmeCtrl *n)
> 
>      nvme_set_timestamp(n, 0ULL);
> 
> -    /* verify that the command sets of attached namespaces are supported */
> -    for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> -        NvmeNamespace *ns = nvme_subsys_ns(n->subsys, i);
> +    if (n->subsys) {
> +        for (int i = 1; i <= NVME_MAX_NAMESPACES; i++) {
> +        NvmeNamespace *ns = n->subsys->namespaces[i].namespace;
> 
> -        if (ns && nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
> -            if (!ns->attached || ns->params.shared) {
> -                nvme_attach_ns(n, ns);
> +        if (!ns) {
> +                continue;
>              }
> +        if (!(n->subsys->namespaces[i].ctrl == n) && !ns->params.shared) {
> +                continue;
> +            }
> +        if (nvme_csi_supported(n, ns->csi) && !ns->params.detached) {
> +                if (!ns->attached || ns->params.shared) {
> +            nvme_attach_ns(n, ns);
> +        }
> +            }
>          }
>      }
> -
>      nvme_update_dsm_limits(n, NULL);
> 
>      return 0;

Yeah, this is where things went wrong.

> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 6f782ba18826..bea3b96a6dfa 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -97,6 +97,11 @@ typedef struct NvmeEnduranceGroup {
>      } fdp;
>  } NvmeEnduranceGroup;
> 
> +typedef struct Namespaces {
> +    NvmeCtrl           *ctrl;
> +    NvmeNamespace      *namespace;
> +} Namespaces;

Let's just add an NvmeCtrl* in struct NvmeNamespace. If set, the
namespace is private.

> diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
> index 2ae56f12a596..d5751564c05c 100644
> --- a/hw/nvme/subsys.c
> +++ b/hw/nvme/subsys.c
> @@ -92,13 +92,19 @@ int nvme_subsys_register_ctrl(NvmeCtrl *n, Error **errp)
> 
>      subsys->ctrls[cntlid] = n;
> 
> -    for (nsid = 1; nsid < ARRAY_SIZE(subsys->namespaces); nsid++) {
> -        NvmeNamespace *ns = subsys->namespaces[nsid];
> -        if (ns && ns->params.shared && !ns->params.detached) {
> -            nvme_attach_ns(n, ns);
> +    for (nsid = 1; nsid <= NVME_MAX_NAMESPACES; nsid++) {
> +        NvmeNamespace *ns = subsys->namespaces[nsid].namespace;
> +
> +    if (!ns) {
> +            continue;
> +        }
> +        if (!(subsys->namespaces[nsid].ctrl == n) && !ns->params.shared) {
> +            continue;
>          }
> +    if (ns->params.shared && !ns->params.detached) {
> +            nvme_attach_ns(n, ns);
> +    }
>      }
> -
>      return cntlid;
>  }

The code here that attaches namespaces when the controller is registered
actually needs to go away. This is a leftover.