[PATCH 2/4] hw/block/nvme: add zone descriptor changed log page

clay.mayers@kioxia.com posted 4 patches 3 years, 3 months ago
Maintainers: Keith Busch <kbusch@kernel.org>, Klaus Jensen <its@irrelevant.dk>, Stefan Hajnoczi <stefanha@redhat.com>, Fam Zheng <fam@euphon.net>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Kevin Wolf <kwolf@redhat.com>, Hanna Reitz <hreitz@redhat.com>
There is a newer version of this series
[PATCH 2/4] hw/block/nvme: add zone descriptor changed log page
Posted by clay.mayers@kioxia.com 3 years, 3 months ago
From: Clay Mayers <clay.mayers@kioxia.com>

Zones marked with ZONE_FINISH_RECOMMENDED are added to the zone
descriptor changed log page.  Once read with RAE cleared, they are
removed from the list.

Zones stay in the list regardless of what other states the zones may
go through so applications must be aware of ABA issues where finish
may be recommended, the zone freed and re-opened and now the attribute
is now clear.

Signed-off-by: Clay Mayers <clay.mayers@kioxia.com>
---
 hw/nvme/ctrl.c       | 50 ++++++++++++++++++++++++++++++++++++++++++++
 hw/nvme/ns.c         |  6 ++++++
 hw/nvme/nvme.h       |  8 +++++++
 hw/nvme/trace-events |  1 +
 include/block/nvme.h |  8 +++++++
 5 files changed, 73 insertions(+)

diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index d7e9fae0b0..3ffd0fb469 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1516,15 +1516,42 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
     }
 }
 
+static void nvme_zdc_list(NvmeNamespace *ns, NvmeZoneIdList *zlist, bool reset)
+{
+    NvmeZdc *zdc;
+    NvmeZdc *next;
+    int index = 0;
+
+    QTAILQ_FOREACH_SAFE(zdc, &ns->zdc_list, entry, next) {
+        if (index >= ARRAY_SIZE(zlist->zids)) {
+            break;
+        }
+        zlist->zids[index++] = zdc->zone->d.zslba;
+        if (reset) {
+            QTAILQ_REMOVE(&ns->zdc_list, zdc, entry);
+            zdc->zone->zdc_entry = NULL;
+            g_free(zdc);
+        }
+    }
+    zlist->nzid = cpu_to_le16(index);
+}
+
 static void nvme_check_finish(NvmeNamespace *ns, NvmeZoneListHead *list)
 {
     int64_t now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
     NvmeZone *zone;
+    NvmeZdc  *zdc;
 
     QTAILQ_FOREACH(zone, list, entry) {
         if (zone->finish_ms <= now) {
             zone->finish_ms = INT64_MAX;
             zone->d.za |= NVME_ZA_FINISH_RECOMMENDED;
+            if (!zone->zdc_entry) {
+                zdc = g_malloc0(sizeof(*zdc));
+                zdc->zone = zone;
+                zone->zdc_entry = zdc;
+                QTAILQ_INSERT_TAIL(&ns->zdc_list, zdc, entry);
+            }
         } else if (zone->finish_ms != INT64_MAX) {
             timer_mod_anticipate(ns->active_timer, zone->finish_ms);
         }
@@ -4675,6 +4702,27 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
     return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
 }
 
+static uint16_t nvme_changed_zones(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
+                                    uint64_t off, NvmeRequest *req)
+{
+    NvmeNamespace *ns;
+    NvmeCmd *cmd = &req->cmd;
+    uint32_t nsid = le32_to_cpu(cmd->nsid);
+    NvmeZoneIdList zlist = { };
+    uint32_t trans_len = MIN(sizeof(zlist) - off, buf_len);
+
+    nsid = le32_to_cpu(cmd->nsid);
+    trace_pci_nvme_changed_zones(nsid);
+
+    ns = nvme_ns(n, nsid);
+    if (!ns) {
+        return NVME_INVALID_NSID | NVME_DNR;
+    }
+    nvme_zdc_list(ns, &zlist, !rae);
+
+    return nvme_c2h(n, ((uint8_t *)&zlist) + off, trans_len, req);
+}
+
 static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
 {
     NvmeCmd *cmd = &req->cmd;
@@ -4722,6 +4770,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
         return nvme_changed_nslist(n, rae, len, off, req);
     case NVME_LOG_CMD_EFFECTS:
         return nvme_cmd_effects(n, csi, len, off, req);
+    case NVME_LOG_CHANGED_ZONE:
+        return nvme_changed_zones(n, rae, len, off, req);
     default:
         trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
         return NVME_INVALID_FIELD | NVME_DNR;
diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
index b577f2d8e0..25cd490c99 100644
--- a/hw/nvme/ns.c
+++ b/hw/nvme/ns.c
@@ -240,6 +240,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
     QTAILQ_INIT(&ns->imp_open_zones);
     QTAILQ_INIT(&ns->closed_zones);
     QTAILQ_INIT(&ns->full_zones);
+    QTAILQ_INIT(&ns->zdc_list);
 
     zone = ns->zone_array;
     for (i = 0; i < ns->num_zones; i++, zone++) {
@@ -526,8 +527,13 @@ void nvme_ns_shutdown(NvmeNamespace *ns)
 
 void nvme_ns_cleanup(NvmeNamespace *ns)
 {
+    NvmeZdc *zdc;
+
     if (ns->params.zoned) {
         timer_free(ns->active_timer);
+        while ((zdc = QTAILQ_FIRST(&ns->zdc_list))) {
+            g_free(zdc);
+        }
         g_free(ns->id_ns_zoned);
         g_free(ns->zone_array);
         g_free(ns->zd_extensions);
diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
index 9a54dcdb32..ae65226150 100644
--- a/hw/nvme/nvme.h
+++ b/hw/nvme/nvme.h
@@ -32,6 +32,7 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
 
 typedef struct NvmeCtrl NvmeCtrl;
 typedef struct NvmeNamespace NvmeNamespace;
+typedef struct NvmeZone NvmeZone;
 
 #define TYPE_NVME_BUS "nvme-bus"
 OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
@@ -90,10 +91,16 @@ static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
 #define NVME_NS(obj) \
     OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
 
+typedef struct NvmeZdc {
+    QTAILQ_ENTRY(NvmeZdc) entry;
+    NvmeZone *zone;
+} NvmeZdc;
+
 typedef struct NvmeZone {
     NvmeZoneDescr   d;
     uint64_t        w_ptr;
     int64_t         finish_ms;
+    NvmeZdc         *zdc_entry;
     QTAILQ_ENTRY(NvmeZone) entry;
 } NvmeZone;
 
@@ -172,6 +179,7 @@ typedef struct NvmeNamespace {
 
     int64_t         fto_ms;
     QEMUTimer       *active_timer;
+    QTAILQ_HEAD(, NvmeZdc) zdc_list;
 
     NvmeNamespaceParams params;
 
diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
index fccb79f489..337927e607 100644
--- a/hw/nvme/trace-events
+++ b/hw/nvme/trace-events
@@ -64,6 +64,7 @@ pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8""
 pci_nvme_identify_cmd_set(void) "identify i/o command set"
 pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
+pci_nvme_changed_zones(uint32_t ns) "nsid %"PRIu32""
 pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
 pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32""
 pci_nvme_setfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t save, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32""
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 8027b7126b..c747cc4948 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1010,6 +1010,7 @@ enum NvmeLogIdentifier {
     NVME_LOG_FW_SLOT_INFO   = 0x03,
     NVME_LOG_CHANGED_NSLIST = 0x04,
     NVME_LOG_CMD_EFFECTS    = 0x05,
+    NVME_LOG_CHANGED_ZONE   = 0xbf,
 };
 
 typedef struct QEMU_PACKED NvmePSD {
@@ -1617,6 +1618,12 @@ typedef enum NvmeVirtualResourceType {
     NVME_VIRT_RES_INTERRUPT     = 0x01,
 } NvmeVirtualResourceType;
 
+typedef struct QEMU_PACKED NvmeZoneIdList {
+    uint16_t nzid;
+    uint16_t rsvd2[3];
+    uint64_t zids[511];
+} NvmeZoneIdList;
+
 static inline void _nvme_check_size(void)
 {
     QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
@@ -1655,5 +1662,6 @@ static inline void _nvme_check_size(void)
     QEMU_BUILD_BUG_ON(sizeof(NvmePriCtrlCap) != 4096);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlEntry) != 32);
     QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlList) != 4096);
+    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneIdList) != 4096);
 }
 #endif
-- 
2.27.0
Re: [PATCH 2/4] hw/block/nvme: add zone descriptor changed log page
Posted by Klaus Jensen 3 years, 3 months ago
On Okt 20 17:18, clay.mayers@kioxia.com wrote:
> From: Clay Mayers <clay.mayers@kioxia.com>
> 
> Zones marked with ZONE_FINISH_RECOMMENDED are added to the zone
> descriptor changed log page.  Once read with RAE cleared, they are
> removed from the list.
> 
> Zones stay in the list regardless of what other states the zones may
> go through so applications must be aware of ABA issues where finish
> may be recommended, the zone freed and re-opened and now the attribute
> is now clear.
> 
> Signed-off-by: Clay Mayers <clay.mayers@kioxia.com>
> ---
>  hw/nvme/ctrl.c       | 50 ++++++++++++++++++++++++++++++++++++++++++++
>  hw/nvme/ns.c         |  6 ++++++
>  hw/nvme/nvme.h       |  8 +++++++
>  hw/nvme/trace-events |  1 +
>  include/block/nvme.h |  8 +++++++
>  5 files changed, 73 insertions(+)
> 
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index d7e9fae0b0..3ffd0fb469 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -1516,15 +1516,42 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
>      }
>  }
>  
> +static void nvme_zdc_list(NvmeNamespace *ns, NvmeZoneIdList *zlist, bool reset)
> +{
> +    NvmeZdc *zdc;
> +    NvmeZdc *next;
> +    int index = 0;
> +
> +    QTAILQ_FOREACH_SAFE(zdc, &ns->zdc_list, entry, next) {
> +        if (index >= ARRAY_SIZE(zlist->zids)) {
> +            break;
> +        }

I could've sweared that a previous revision of the spec stated that this
required `nzid` to be set to 0xffff and the log page be zeroed in this
case.

However, upon reading it again, that doesnt seem to be the case, so this
seems to be just fine.

> +        zlist->zids[index++] = zdc->zone->d.zslba;
> +        if (reset) {
> +            QTAILQ_REMOVE(&ns->zdc_list, zdc, entry);
> +            zdc->zone->zdc_entry = NULL;
> +            g_free(zdc);
> +        }
> +    }
> +    zlist->nzid = cpu_to_le16(index);
> +}
> +
>  static void nvme_check_finish(NvmeNamespace *ns, NvmeZoneListHead *list)
>  {
>      int64_t now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
>      NvmeZone *zone;
> +    NvmeZdc  *zdc;
>  
>      QTAILQ_FOREACH(zone, list, entry) {
>          if (zone->finish_ms <= now) {
>              zone->finish_ms = INT64_MAX;
>              zone->d.za |= NVME_ZA_FINISH_RECOMMENDED;
> +            if (!zone->zdc_entry) {
> +                zdc = g_malloc0(sizeof(*zdc));
> +                zdc->zone = zone;
> +                zone->zdc_entry = zdc;
> +                QTAILQ_INSERT_TAIL(&ns->zdc_list, zdc, entry);
> +            }
>          } else if (zone->finish_ms != INT64_MAX) {
>              timer_mod_anticipate(ns->active_timer, zone->finish_ms);
>          }
> @@ -4675,6 +4702,27 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n, uint8_t csi, uint32_t buf_len,
>      return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
>  }
>  
> +static uint16_t nvme_changed_zones(NvmeCtrl *n, uint8_t rae, uint32_t buf_len,
> +                                    uint64_t off, NvmeRequest *req)
> +{
> +    NvmeNamespace *ns;
> +    NvmeCmd *cmd = &req->cmd;
> +    uint32_t nsid = le32_to_cpu(cmd->nsid);
> +    NvmeZoneIdList zlist = { };
> +    uint32_t trans_len = MIN(sizeof(zlist) - off, buf_len);

I believe this is unsafe if off >= sizeof(zlist).

> +
> +    nsid = le32_to_cpu(cmd->nsid);
> +    trace_pci_nvme_changed_zones(nsid);
> +
> +    ns = nvme_ns(n, nsid);
> +    if (!ns) {
> +        return NVME_INVALID_NSID | NVME_DNR;
> +    }
> +    nvme_zdc_list(ns, &zlist, !rae);
> +
> +    return nvme_c2h(n, ((uint8_t *)&zlist) + off, trans_len, req);
> +}
> +
>  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>  {
>      NvmeCmd *cmd = &req->cmd;
> @@ -4722,6 +4770,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
>          return nvme_changed_nslist(n, rae, len, off, req);
>      case NVME_LOG_CMD_EFFECTS:
>          return nvme_cmd_effects(n, csi, len, off, req);
> +    case NVME_LOG_CHANGED_ZONE:
> +        return nvme_changed_zones(n, rae, len, off, req);
>      default:
>          trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
>          return NVME_INVALID_FIELD | NVME_DNR;
> diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
> index b577f2d8e0..25cd490c99 100644
> --- a/hw/nvme/ns.c
> +++ b/hw/nvme/ns.c
> @@ -240,6 +240,7 @@ static void nvme_ns_zoned_init_state(NvmeNamespace *ns)
>      QTAILQ_INIT(&ns->imp_open_zones);
>      QTAILQ_INIT(&ns->closed_zones);
>      QTAILQ_INIT(&ns->full_zones);
> +    QTAILQ_INIT(&ns->zdc_list);
>  
>      zone = ns->zone_array;
>      for (i = 0; i < ns->num_zones; i++, zone++) {
> @@ -526,8 +527,13 @@ void nvme_ns_shutdown(NvmeNamespace *ns)
>  
>  void nvme_ns_cleanup(NvmeNamespace *ns)
>  {
> +    NvmeZdc *zdc;
> +
>      if (ns->params.zoned) {
>          timer_free(ns->active_timer);
> +        while ((zdc = QTAILQ_FIRST(&ns->zdc_list))) {
> +            g_free(zdc);
> +        }
>          g_free(ns->id_ns_zoned);
>          g_free(ns->zone_array);
>          g_free(ns->zd_extensions);
> diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> index 9a54dcdb32..ae65226150 100644
> --- a/hw/nvme/nvme.h
> +++ b/hw/nvme/nvme.h
> @@ -32,6 +32,7 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES > NVME_NSID_BROADCAST - 1);
>  
>  typedef struct NvmeCtrl NvmeCtrl;
>  typedef struct NvmeNamespace NvmeNamespace;
> +typedef struct NvmeZone NvmeZone;
>  
>  #define TYPE_NVME_BUS "nvme-bus"
>  OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
> @@ -90,10 +91,16 @@ static inline NvmeNamespace *nvme_subsys_ns(NvmeSubsystem *subsys,
>  #define NVME_NS(obj) \
>      OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
>  
> +typedef struct NvmeZdc {
> +    QTAILQ_ENTRY(NvmeZdc) entry;
> +    NvmeZone *zone;
> +} NvmeZdc;
> +
>  typedef struct NvmeZone {
>      NvmeZoneDescr   d;
>      uint64_t        w_ptr;
>      int64_t         finish_ms;
> +    NvmeZdc         *zdc_entry;
>      QTAILQ_ENTRY(NvmeZone) entry;
>  } NvmeZone;
>  
> @@ -172,6 +179,7 @@ typedef struct NvmeNamespace {
>  
>      int64_t         fto_ms;
>      QEMUTimer       *active_timer;
> +    QTAILQ_HEAD(, NvmeZdc) zdc_list;
>  
>      NvmeNamespaceParams params;
>  
> diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
> index fccb79f489..337927e607 100644
> --- a/hw/nvme/trace-events
> +++ b/hw/nvme/trace-events
> @@ -64,6 +64,7 @@ pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
>  pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16", csi=0x%"PRIx8""
>  pci_nvme_identify_cmd_set(void) "identify i/o command set"
>  pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
> +pci_nvme_changed_zones(uint32_t ns) "nsid %"PRIu32""
>  pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
>  pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel 0x%"PRIx8" cdw11 0x%"PRIx32""
>  pci_nvme_setfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t save, uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" save 0x%"PRIx8" cdw11 0x%"PRIx32""
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8027b7126b..c747cc4948 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -1010,6 +1010,7 @@ enum NvmeLogIdentifier {
>      NVME_LOG_FW_SLOT_INFO   = 0x03,
>      NVME_LOG_CHANGED_NSLIST = 0x04,
>      NVME_LOG_CMD_EFFECTS    = 0x05,
> +    NVME_LOG_CHANGED_ZONE   = 0xbf,
>  };
>  
>  typedef struct QEMU_PACKED NvmePSD {
> @@ -1617,6 +1618,12 @@ typedef enum NvmeVirtualResourceType {
>      NVME_VIRT_RES_INTERRUPT     = 0x01,
>  } NvmeVirtualResourceType;
>  
> +typedef struct QEMU_PACKED NvmeZoneIdList {
> +    uint16_t nzid;
> +    uint16_t rsvd2[3];
> +    uint64_t zids[511];
> +} NvmeZoneIdList;
> +
>  static inline void _nvme_check_size(void)
>  {
>      QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
> @@ -1655,5 +1662,6 @@ static inline void _nvme_check_size(void)
>      QEMU_BUILD_BUG_ON(sizeof(NvmePriCtrlCap) != 4096);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlEntry) != 32);
>      QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlList) != 4096);
> +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneIdList) != 4096);
>  }
>  #endif
> -- 
> 2.27.0
> 

-- 
One of us - No more doubt, silence or taboo about mental illness.
RE: [PATCH 2/4] hw/block/nvme: add zone descriptor changed log page
Posted by Clay Mayers 3 years, 3 months ago
> From: Klaus Jensen <its@irrelevant.dk>
> Sent: Thursday, October 20, 2022 11:26 PM
> 
> On Okt 20 17:18, clay.mayers@kioxia.com wrote:
> > From: Clay Mayers <clay.mayers@kioxia.com>
> >
> > Zones marked with ZONE_FINISH_RECOMMENDED are added to the zone
> > descriptor changed log page.  Once read with RAE cleared, they are
> > removed from the list.
> >
> > Zones stay in the list regardless of what other states the zones may
> > go through so applications must be aware of ABA issues where finish
> > may be recommended, the zone freed and re-opened and now the attribute
> > is now clear.
> >
> > Signed-off-by: Clay Mayers <clay.mayers@kioxia.com>
> > ---
> >  hw/nvme/ctrl.c       | 50 ++++++++++++++++++++++++++++++++++++++++++++
> >  hw/nvme/ns.c         |  6 ++++++
> >  hw/nvme/nvme.h       |  8 +++++++
> >  hw/nvme/trace-events |  1 +
> >  include/block/nvme.h |  8 +++++++
> >  5 files changed, 73 insertions(+)
> >
> > diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> > index d7e9fae0b0..3ffd0fb469 100644
> > --- a/hw/nvme/ctrl.c
> > +++ b/hw/nvme/ctrl.c
> > @@ -1516,15 +1516,42 @@ static void nvme_clear_events(NvmeCtrl *n,
> uint8_t event_type)
> >      }
> >  }
> >
> > +static void nvme_zdc_list(NvmeNamespace *ns, NvmeZoneIdList *zlist, bool
> reset)
> > +{
> > +    NvmeZdc *zdc;
> > +    NvmeZdc *next;
> > +    int index = 0;
> > +
> > +    QTAILQ_FOREACH_SAFE(zdc, &ns->zdc_list, entry, next) {
> > +        if (index >= ARRAY_SIZE(zlist->zids)) {
> > +            break;
> > +        }
> 
> I could've sweared that a previous revision of the spec stated that this
> required `nzid` to be set to 0xffff and the log page be zeroed in this
> case.
> 
> However, upon reading it again, that doesnt seem to be the case, so this
> seems to be just fine.

My reading was it only does that if the list can't be returned.

> 
> > +        zlist->zids[index++] = zdc->zone->d.zslba;
> > +        if (reset) {
> > +            QTAILQ_REMOVE(&ns->zdc_list, zdc, entry);
> > +            zdc->zone->zdc_entry = NULL;
> > +            g_free(zdc);
> > +        }
> > +    }
> > +    zlist->nzid = cpu_to_le16(index);
> > +}
> > +
> >  static void nvme_check_finish(NvmeNamespace *ns, NvmeZoneListHead
> *list)
> >  {
> >      int64_t now = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> >      NvmeZone *zone;
> > +    NvmeZdc  *zdc;
> >
> >      QTAILQ_FOREACH(zone, list, entry) {
> >          if (zone->finish_ms <= now) {
> >              zone->finish_ms = INT64_MAX;
> >              zone->d.za |= NVME_ZA_FINISH_RECOMMENDED;
> > +            if (!zone->zdc_entry) {
> > +                zdc = g_malloc0(sizeof(*zdc));
> > +                zdc->zone = zone;
> > +                zone->zdc_entry = zdc;
> > +                QTAILQ_INSERT_TAIL(&ns->zdc_list, zdc, entry);
> > +            }
> >          } else if (zone->finish_ms != INT64_MAX) {
> >              timer_mod_anticipate(ns->active_timer, zone->finish_ms);
> >          }
> > @@ -4675,6 +4702,27 @@ static uint16_t nvme_cmd_effects(NvmeCtrl *n,
> uint8_t csi, uint32_t buf_len,
> >      return nvme_c2h(n, ((uint8_t *)&log) + off, trans_len, req);
> >  }
> >
> > +static uint16_t nvme_changed_zones(NvmeCtrl *n, uint8_t rae, uint32_t
> buf_len,
> > +                                    uint64_t off, NvmeRequest *req)
> > +{
> > +    NvmeNamespace *ns;
> > +    NvmeCmd *cmd = &req->cmd;
> > +    uint32_t nsid = le32_to_cpu(cmd->nsid);
> > +    NvmeZoneIdList zlist = { };
> > +    uint32_t trans_len = MIN(sizeof(zlist) - off, buf_len);
> 
> I believe this is unsafe if off >= sizeof(zlist).

Yep, I copied that from how most of the other log pages are
handled, but missed the check for off > sizeof(zlist).

> 
> > +
> > +    nsid = le32_to_cpu(cmd->nsid);
> > +    trace_pci_nvme_changed_zones(nsid);
> > +
> > +    ns = nvme_ns(n, nsid);
> > +    if (!ns) {
> > +        return NVME_INVALID_NSID | NVME_DNR;
> > +    }
> > +    nvme_zdc_list(ns, &zlist, !rae);
> > +
> > +    return nvme_c2h(n, ((uint8_t *)&zlist) + off, trans_len, req);
> > +}
> > +
> >  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeRequest *req)
> >  {
> >      NvmeCmd *cmd = &req->cmd;
> > @@ -4722,6 +4770,8 @@ static uint16_t nvme_get_log(NvmeCtrl *n,
> NvmeRequest *req)
> >          return nvme_changed_nslist(n, rae, len, off, req);
> >      case NVME_LOG_CMD_EFFECTS:
> >          return nvme_cmd_effects(n, csi, len, off, req);
> > +    case NVME_LOG_CHANGED_ZONE:
> > +        return nvme_changed_zones(n, rae, len, off, req);
> >      default:
> >          trace_pci_nvme_err_invalid_log_page(nvme_cid(req), lid);
> >          return NVME_INVALID_FIELD | NVME_DNR;
> > diff --git a/hw/nvme/ns.c b/hw/nvme/ns.c
> > index b577f2d8e0..25cd490c99 100644
> > --- a/hw/nvme/ns.c
> > +++ b/hw/nvme/ns.c
> > @@ -240,6 +240,7 @@ static void
> nvme_ns_zoned_init_state(NvmeNamespace *ns)
> >      QTAILQ_INIT(&ns->imp_open_zones);
> >      QTAILQ_INIT(&ns->closed_zones);
> >      QTAILQ_INIT(&ns->full_zones);
> > +    QTAILQ_INIT(&ns->zdc_list);
> >
> >      zone = ns->zone_array;
> >      for (i = 0; i < ns->num_zones; i++, zone++) {
> > @@ -526,8 +527,13 @@ void nvme_ns_shutdown(NvmeNamespace *ns)
> >
> >  void nvme_ns_cleanup(NvmeNamespace *ns)
> >  {
> > +    NvmeZdc *zdc;
> > +
> >      if (ns->params.zoned) {
> >          timer_free(ns->active_timer);
> > +        while ((zdc = QTAILQ_FIRST(&ns->zdc_list))) {
> > +            g_free(zdc);
> > +        }
> >          g_free(ns->id_ns_zoned);
> >          g_free(ns->zone_array);
> >          g_free(ns->zd_extensions);
> > diff --git a/hw/nvme/nvme.h b/hw/nvme/nvme.h
> > index 9a54dcdb32..ae65226150 100644
> > --- a/hw/nvme/nvme.h
> > +++ b/hw/nvme/nvme.h
> > @@ -32,6 +32,7 @@ QEMU_BUILD_BUG_ON(NVME_MAX_NAMESPACES >
> NVME_NSID_BROADCAST - 1);
> >
> >  typedef struct NvmeCtrl NvmeCtrl;
> >  typedef struct NvmeNamespace NvmeNamespace;
> > +typedef struct NvmeZone NvmeZone;
> >
> >  #define TYPE_NVME_BUS "nvme-bus"
> >  OBJECT_DECLARE_SIMPLE_TYPE(NvmeBus, NVME_BUS)
> > @@ -90,10 +91,16 @@ static inline NvmeNamespace
> *nvme_subsys_ns(NvmeSubsystem *subsys,
> >  #define NVME_NS(obj) \
> >      OBJECT_CHECK(NvmeNamespace, (obj), TYPE_NVME_NS)
> >
> > +typedef struct NvmeZdc {
> > +    QTAILQ_ENTRY(NvmeZdc) entry;
> > +    NvmeZone *zone;
> > +} NvmeZdc;
> > +
> >  typedef struct NvmeZone {
> >      NvmeZoneDescr   d;
> >      uint64_t        w_ptr;
> >      int64_t         finish_ms;
> > +    NvmeZdc         *zdc_entry;
> >      QTAILQ_ENTRY(NvmeZone) entry;
> >  } NvmeZone;
> >
> > @@ -172,6 +179,7 @@ typedef struct NvmeNamespace {
> >
> >      int64_t         fto_ms;
> >      QEMUTimer       *active_timer;
> > +    QTAILQ_HEAD(, NvmeZdc) zdc_list;
> >
> >      NvmeNamespaceParams params;
> >
> > diff --git a/hw/nvme/trace-events b/hw/nvme/trace-events
> > index fccb79f489..337927e607 100644
> > --- a/hw/nvme/trace-events
> > +++ b/hw/nvme/trace-events
> > @@ -64,6 +64,7 @@ pci_nvme_identify_nslist(uint32_t ns) "nsid %"PRIu32""
> >  pci_nvme_identify_nslist_csi(uint16_t ns, uint8_t csi) "nsid=%"PRIu16",
> csi=0x%"PRIx8""
> >  pci_nvme_identify_cmd_set(void) "identify i/o command set"
> >  pci_nvme_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
> > +pci_nvme_changed_zones(uint32_t ns) "nsid %"PRIu32""
> >  pci_nvme_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, uint32_t
> len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 0x%"PRIx8"
> len %"PRIu32" off %"PRIu64""
> >  pci_nvme_getfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t sel,
> uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" sel
> 0x%"PRIx8" cdw11 0x%"PRIx32""
> >  pci_nvme_setfeat(uint16_t cid, uint32_t nsid, uint8_t fid, uint8_t save,
> uint32_t cdw11) "cid %"PRIu16" nsid 0x%"PRIx32" fid 0x%"PRIx8" save
> 0x%"PRIx8" cdw11 0x%"PRIx32""
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 8027b7126b..c747cc4948 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -1010,6 +1010,7 @@ enum NvmeLogIdentifier {
> >      NVME_LOG_FW_SLOT_INFO   = 0x03,
> >      NVME_LOG_CHANGED_NSLIST = 0x04,
> >      NVME_LOG_CMD_EFFECTS    = 0x05,
> > +    NVME_LOG_CHANGED_ZONE   = 0xbf,
> >  };
> >
> >  typedef struct QEMU_PACKED NvmePSD {
> > @@ -1617,6 +1618,12 @@ typedef enum NvmeVirtualResourceType {
> >      NVME_VIRT_RES_INTERRUPT     = 0x01,
> >  } NvmeVirtualResourceType;
> >
> > +typedef struct QEMU_PACKED NvmeZoneIdList {
> > +    uint16_t nzid;
> > +    uint16_t rsvd2[3];
> > +    uint64_t zids[511];
> > +} NvmeZoneIdList;
> > +
> >  static inline void _nvme_check_size(void)
> >  {
> >      QEMU_BUILD_BUG_ON(sizeof(NvmeBar) != 4096);
> > @@ -1655,5 +1662,6 @@ static inline void _nvme_check_size(void)
> >      QEMU_BUILD_BUG_ON(sizeof(NvmePriCtrlCap) != 4096);
> >      QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlEntry) != 32);
> >      QEMU_BUILD_BUG_ON(sizeof(NvmeSecCtrlList) != 4096);
> > +    QEMU_BUILD_BUG_ON(sizeof(NvmeZoneIdList) != 4096);
> >  }
> >  #endif
> > --
> > 2.27.0
> >
> 
> --
> One of us - No more doubt, silence or taboo about mental illness.