[PATCH v2] hw/nvme: add namespace hotplug support

mr-083 posted 2 patches 2 days, 11 hours ago
Maintainers: Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>, "Dr. David Alan Gilbert" <dave@treblig.org>, Keith Busch <kbusch@kernel.org>, Klaus Jensen <its@irrelevant.dk>, Jesper Devantier <foss@defmacro.it>
There is a newer version of this series
[PATCH v2] hw/nvme: add namespace hotplug support
Posted by mr-083 1 day, 19 hours ago
Add hotplug support for nvme-ns devices on the NvmeBus. This enables
namespace-level hot-swap without removing the NVMe controller, matching
the behavior of physical NVMe drives hot-swapped in the same PCIe slot.

Mark nvme-ns devices as hotpluggable and register the NvmeBus as a
hotplug handler with proper plug and unplug callbacks:

- plug: attach namespace to all started controllers and send an
  Asynchronous Event Notification (AEN) with NS_ATTR_CHANGED so
  the guest kernel rescans namespaces and adds the block device
- unplug: detach from all controllers, send AEN, remove from
  subsystem, then unrealize the device. The guest kernel rescans
  and removes the block device.

The plug handler skips controllers that haven't started yet
(qs_created == false) to avoid interfering with boot-time namespace
attachment in nvme_start_ctrl().

Both the controller bus and subsystem bus are configured as hotplug
handlers via qbus_set_bus_hotplug_handler() since nvme-ns devices
may reparent to the subsystem bus during realize.

Example hot-swap sequence using the NVMe subsystem model:

  # Boot with: -device nvme-subsys,id=subsys0
  #            -device nvme,id=ctrl0,subsys=subsys0
  #            -device nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1

  device_del ns0             # guest receives AEN, removes /dev/nvme0n1
  drive_del drv0
  drive_add 0 file=disk.qcow2,format=qcow2,id=drv0,if=none
  device_add nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1
                              # guest receives AEN, adds /dev/nvme0n1

Tested with Linux 6.1 guest (NVMe driver processes AEN and rescans
namespace list automatically).

Signed-off-by: Matthieu Receveur <matthieu@min.io>
---
 hw/nvme/ctrl.c   | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
 hw/nvme/ns.c     |  1 +
 hw/nvme/subsys.c |  2 ++
 3 files changed, 88 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index be6c7028cb..5502e4ea2b 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -206,6 +206,7 @@
 #include "system/hostmem.h"
 #include "hw/pci/msix.h"
 #include "hw/pci/pcie_sriov.h"
+#include "hw/core/qdev.h"
 #include "system/spdm-socket.h"
 #include "migration/vmstate.h"
 
@@ -9293,6 +9294,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
     }
 
     qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
+    qbus_set_bus_hotplug_handler(BUS(&n->bus));
 
     if (nvme_init_subsys(n, errp)) {
         return;
@@ -9553,10 +9555,93 @@ static const TypeInfo nvme_info = {
     },
 };
 
+static void nvme_ns_hot_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
+                              Error **errp)
+{
+    NvmeNamespace *ns = NVME_NS(dev);
+    NvmeSubsystem *subsys = ns->subsys;
+    uint32_t nsid = ns->params.nsid;
+    int i;
+
+    /*
+     * Attach to all started controllers and notify via AEN.
+     * Skip controllers that haven't started yet (boot-time realize) —
+     * nvme_start_ctrl() will attach namespaces during controller init.
+     */
+    for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
+        NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i);
+        if (!ctrl || !ctrl->qs_created) {
+            continue;
+        }
+
+        if (nvme_csi_supported(ctrl, ns->csi) && !ns->params.detached) {
+            nvme_attach_ns(ctrl, ns);
+            nvme_update_dsm_limits(ctrl, ns);
+
+            if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
+                nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
+                                   NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
+                                   NVME_LOG_CHANGED_NSLIST);
+            }
+        }
+    }
+}
+
+static void nvme_ns_hot_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
+                               Error **errp)
+{
+    NvmeNamespace *ns = NVME_NS(dev);
+    NvmeSubsystem *subsys = ns->subsys;
+    uint32_t nsid = ns->params.nsid;
+    int i;
+
+    /*
+     * Detach from all controllers and notify the guest via AEN.
+     * Must happen before unrealize to avoid use-after-free when the
+     * guest sends I/O to a freed namespace.
+     */
+    for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
+        NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i);
+        if (!ctrl || !nvme_ns(ctrl, nsid)) {
+            continue;
+        }
+
+        nvme_detach_ns(ctrl, ns);
+        nvme_update_dsm_limits(ctrl, NULL);
+
+        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
+            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
+                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
+                               NVME_LOG_CHANGED_NSLIST);
+        }
+    }
+
+    /* Remove from subsystem namespace list. */
+    subsys->namespaces[nsid] = NULL;
+
+    /*
+     * Unrealize: drain I/O, flush, cleanup structures, remove from QOM.
+     * nvme_ns_unrealize() handles drain/shutdown/cleanup internally.
+     */
+    qdev_unrealize(dev);
+}
+
+static void nvme_bus_class_init(ObjectClass *klass, const void *data)
+{
+    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass);
+    hc->plug = nvme_ns_hot_plug;
+    hc->unplug = nvme_ns_hot_unplug;
+}
+
 static const TypeInfo nvme_bus_info = {
     .name = TYPE_NVME_BUS,
     .parent = TYPE_BUS,
     .instance_size = sizeof(NvmeBus),
+    .class_init = nvme_bus_class_init,
+    .interfaces = (const InterfaceInfo[]) {
+        { TYPE_HOTPLUG_HANDLER },
+        { }
+    },
 };
 
 static void nvme_register_types(void)
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index b0106eaa5c..eb628c0734 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -937,6 +937,7 @@ static void nvme_ns_class_init(ObjectClass *oc, const void *data)
     dc->bus_type = TYPE_NVME_BUS;
     dc->realize = nvme_ns_realize;
     dc->unrealize = nvme_ns_unrealize;
+    dc->hotpluggable = true;
     device_class_set_props(dc, nvme_ns_props);
     dc->desc = "Virtual NVMe namespace";
 }
diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
index 777e1c620f..fa35055d3c 100644
--- a/hw/nvme/subsys.c
+++ b/hw/nvme/subsys.c
@@ -9,6 +9,7 @@
 #include "qemu/osdep.h"
 #include "qemu/units.h"
 #include "qapi/error.h"
+#include "hw/core/qdev.h"
 
 #include "nvme.h"
 
@@ -205,6 +206,7 @@ static void nvme_subsys_realize(DeviceState *dev, Error **errp)
     NvmeSubsystem *subsys = NVME_SUBSYS(dev);
 
     qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
+    qbus_set_bus_hotplug_handler(BUS(&subsys->bus));
 
     nvme_subsys_setup(subsys, errp);
 }
-- 
2.53.0


Re: [PATCH v2] hw/nvme: add namespace hotplug support
Posted by Stefan Hajnoczi 1 day, 4 hours ago
On Thu, Apr 09, 2026 at 11:34:51PM +0200, mr-083 wrote:
> Add hotplug support for nvme-ns devices on the NvmeBus. This enables
> namespace-level hot-swap without removing the NVMe controller, matching
> the behavior of physical NVMe drives hot-swapped in the same PCIe slot.

If we rely purely on NVMe's AEN then this is not equivalent to swapping
physical drives in the same PCIe slot. Maybe adjust the wording to
reflect that this is NVMe-level Namespace hotplug?

> Mark nvme-ns devices as hotpluggable and register the NvmeBus as a
> hotplug handler with proper plug and unplug callbacks:
> 
> - plug: attach namespace to all started controllers and send an
>   Asynchronous Event Notification (AEN) with NS_ATTR_CHANGED so
>   the guest kernel rescans namespaces and adds the block device
> - unplug: detach from all controllers, send AEN, remove from
>   subsystem, then unrealize the device. The guest kernel rescans
>   and removes the block device.
> 
> The plug handler skips controllers that haven't started yet
> (qs_created == false) to avoid interfering with boot-time namespace
> attachment in nvme_start_ctrl().
> 
> Both the controller bus and subsystem bus are configured as hotplug
> handlers via qbus_set_bus_hotplug_handler() since nvme-ns devices
> may reparent to the subsystem bus during realize.
> 
> Example hot-swap sequence using the NVMe subsystem model:
> 
>   # Boot with: -device nvme-subsys,id=subsys0
>   #            -device nvme,id=ctrl0,subsys=subsys0
>   #            -device nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1
> 
>   device_del ns0             # guest receives AEN, removes /dev/nvme0n1
>   drive_del drv0
>   drive_add 0 file=disk.qcow2,format=qcow2,id=drv0,if=none
>   device_add nvme-ns,id=ns0,drive=drv0,bus=ctrl0,nsid=1
>                               # guest receives AEN, adds /dev/nvme0n1
> 
> Tested with Linux 6.1 guest (NVMe driver processes AEN and rescans
> namespace list automatically).

Did you test a Windows Server guest? If not, I can try that next week in
case there are any surprises.

> 
> Signed-off-by: Matthieu Receveur <matthieu@min.io>
> ---
>  hw/nvme/ctrl.c   | 85 ++++++++++++++++++++++++++++++++++++++++++++++++
>  hw/nvme/ns.c     |  1 +
>  hw/nvme/subsys.c |  2 ++
>  3 files changed, 88 insertions(+)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index be6c7028cb..5502e4ea2b 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -206,6 +206,7 @@
>  #include "system/hostmem.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pcie_sriov.h"
> +#include "hw/core/qdev.h"
>  #include "system/spdm-socket.h"
>  #include "migration/vmstate.h"
>  
> @@ -9293,6 +9294,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error **errp)
>      }
>  
>      qbus_init(&n->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
> +    qbus_set_bus_hotplug_handler(BUS(&n->bus));
>  
>      if (nvme_init_subsys(n, errp)) {
>          return;
> @@ -9553,10 +9555,93 @@ static const TypeInfo nvme_info = {
>      },
>  };
>  
> +static void nvme_ns_hot_plug(HotplugHandler *hotplug_dev, DeviceState *dev,
> +                              Error **errp)
> +{
> +    NvmeNamespace *ns = NVME_NS(dev);
> +    NvmeSubsystem *subsys = ns->subsys;
> +    uint32_t nsid = ns->params.nsid;
> +    int i;
> +
> +    /*
> +     * Attach to all started controllers and notify via AEN.
> +     * Skip controllers that haven't started yet (boot-time realize) —
> +     * nvme_start_ctrl() will attach namespaces during controller init.
> +     */
> +    for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
> +        NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i);
> +        if (!ctrl || !ctrl->qs_created) {
> +            continue;
> +        }
> +
> +        if (nvme_csi_supported(ctrl, ns->csi) && !ns->params.detached) {
> +            nvme_attach_ns(ctrl, ns);
> +            nvme_update_dsm_limits(ctrl, ns);
> +
> +            if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
> +                nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
> +                                   NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
> +                                   NVME_LOG_CHANGED_NSLIST);
> +            }
> +        }
> +    }
> +}
> +
> +static void nvme_ns_hot_unplug(HotplugHandler *hotplug_dev, DeviceState *dev,
> +                               Error **errp)
> +{
> +    NvmeNamespace *ns = NVME_NS(dev);
> +    NvmeSubsystem *subsys = ns->subsys;
> +    uint32_t nsid = ns->params.nsid;
> +    int i;

While there is qdev_unrealize -> nvme_ns_unrealize -> nvme_ns_drain ->
blk_drain to quiesce I/O requests at the end of this function, I wonder
whether it's safe to start removing the namespace before I/O has been
drained.

Did you test hot unplug while the Namespace is under heavy I/O (e.g. fio
job running inside the guest with lots of queued I/O requests)?

It might be necessary to stop I/O first before tearing down the
namespace.

> +
> +    /*
> +     * Detach from all controllers and notify the guest via AEN.
> +     * Must happen before unrealize to avoid use-after-free when the
> +     * guest sends I/O to a freed namespace.
> +     */
> +    for (i = 0; i < NVME_MAX_CONTROLLERS; i++) {
> +        NvmeCtrl *ctrl = nvme_subsys_ctrl(subsys, i);
> +        if (!ctrl || !nvme_ns(ctrl, nsid)) {
> +            continue;
> +        }
> +
> +        nvme_detach_ns(ctrl, ns);
> +        nvme_update_dsm_limits(ctrl, NULL);
> +
> +        if (!test_and_set_bit(nsid, ctrl->changed_nsids)) {
> +            nvme_enqueue_event(ctrl, NVME_AER_TYPE_NOTICE,
> +                               NVME_AER_INFO_NOTICE_NS_ATTR_CHANGED,
> +                               NVME_LOG_CHANGED_NSLIST);
> +        }
> +    }
> +
> +    /* Remove from subsystem namespace list. */
> +    subsys->namespaces[nsid] = NULL;

The dual of this operation is done in nvme_ns_realize():

  subsys->namespaces[nsid] = ns;

Maybe nvme_ns_unrealize() should remove the namespace from the
subsystem for consistency? I guess the lack of removal was never an
issue before hot unplug, but now it would be nice to implement the
lifecycle.

> +
> +    /*
> +     * Unrealize: drain I/O, flush, cleanup structures, remove from QOM.
> +     * nvme_ns_unrealize() handles drain/shutdown/cleanup internally.
> +     */
> +    qdev_unrealize(dev);
> +}
> +
> +static void nvme_bus_class_init(ObjectClass *klass, const void *data)
> +{
> +    HotplugHandlerClass *hc = HOTPLUG_HANDLER_CLASS(klass);
> +    hc->plug = nvme_ns_hot_plug;
> +    hc->unplug = nvme_ns_hot_unplug;
> +}
> +
>  static const TypeInfo nvme_bus_info = {
>      .name = TYPE_NVME_BUS,
>      .parent = TYPE_BUS,
>      .instance_size = sizeof(NvmeBus),
> +    .class_init = nvme_bus_class_init,
> +    .interfaces = (const InterfaceInfo[]) {
> +        { TYPE_HOTPLUG_HANDLER },
> +        { }
> +    },
>  };
>  
>  static void nvme_register_types(void)
> diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
> index b0106eaa5c..eb628c0734 100644
> --- a/hw/nvme/ns.c
> +++ b/hw/nvme/ns.c
> @@ -937,6 +937,7 @@ static void nvme_ns_class_init(ObjectClass *oc, const void *data)
>      dc->bus_type = TYPE_NVME_BUS;
>      dc->realize = nvme_ns_realize;
>      dc->unrealize = nvme_ns_unrealize;
> +    dc->hotpluggable = true;
>      device_class_set_props(dc, nvme_ns_props);
>      dc->desc = "Virtual NVMe namespace";
>  }
> diff --git a/hw/nvme/subsys.c b/hw/nvme/subsys.c
> index 777e1c620f..fa35055d3c 100644
> --- a/hw/nvme/subsys.c
> +++ b/hw/nvme/subsys.c
> @@ -9,6 +9,7 @@
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
>  #include "qapi/error.h"
> +#include "hw/core/qdev.h"
>  
>  #include "nvme.h"
>  
> @@ -205,6 +206,7 @@ static void nvme_subsys_realize(DeviceState *dev, Error **errp)
>      NvmeSubsystem *subsys = NVME_SUBSYS(dev);
>  
>      qbus_init(&subsys->bus, sizeof(NvmeBus), TYPE_NVME_BUS, dev, dev->id);
> +    qbus_set_bus_hotplug_handler(BUS(&subsys->bus));
>  
>      nvme_subsys_setup(subsys, errp);
>  }
> -- 
> 2.53.0
>