[RFC v2 6/7] hw/nvme: refactor zone append write using block layer APIs

Sam Li posted 7 patches 1 year ago
There is a newer version of this series
[RFC v2 6/7] hw/nvme: refactor zone append write using block layer APIs
Posted by Sam Li 1 year ago
Signed-off-by: Sam Li <faithilikerun@gmail.com>
---
 block/qcow2.c        |   2 +-
 hw/nvme/ctrl.c       | 190 ++++++++++++++++++++++++++++++++-----------
 include/sysemu/dma.h |   3 +
 system/dma-helpers.c |  17 ++++
 4 files changed, 162 insertions(+), 50 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index dfaf5566e2..74d2e2bf39 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2290,7 +2290,7 @@ static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
     bs->bl.max_open_zones = s->zoned_header.max_open_zones;
     bs->bl.zone_size = s->zoned_header.zone_size;
     bs->bl.zone_capacity = s->zoned_header.zone_capacity;
-    bs->bl.write_granularity = BDRV_SECTOR_SIZE;
+    bs->bl.write_granularity = BDRV_SECTOR_SIZE; /* physical block size */
     bs->bl.zd_extension_size = s->zoned_header.zd_extension_size;
 }
 
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index b9ed3495e1..f65a87646e 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -1735,6 +1735,95 @@ static void nvme_misc_cb(void *opaque, int ret)
     nvme_enqueue_req_completion(nvme_cq(req), req);
 }
 
+typedef struct NvmeZoneCmdAIOCB {
+    NvmeRequest *req;
+    NvmeCmd *cmd;
+    NvmeCtrl *n;
+
+    union {
+        struct {
+          uint32_t partial;
+          unsigned int nr_zones;
+          BlockZoneDescriptor *zones;
+        } zone_report_data;
+        struct {
+          int64_t offset;
+        } zone_append_data;
+    };
+} NvmeZoneCmdAIOCB;
+
+static void nvme_blk_zone_append_complete_cb(void *opaque, int ret)
+{
+    NvmeZoneCmdAIOCB *cb = opaque;
+    NvmeRequest *req = cb->req;
+    int64_t *offset = (int64_t *)&req->cqe;
+
+    if (ret) {
+        nvme_aio_err(req, ret);
+    }
+
+    *offset = nvme_b2l(req->ns, cb->zone_append_data.offset);
+    nvme_enqueue_req_completion(nvme_cq(req), req);
+    g_free(cb);
+}
+
+static inline void nvme_blk_zone_append(BlockBackend *blk, int64_t *offset,
+                                  uint32_t align,
+                                  BlockCompletionFunc *cb,
+                                  NvmeZoneCmdAIOCB *aiocb)
+{
+    NvmeRequest *req = aiocb->req;
+    assert(req->sg.flags & NVME_SG_ALLOC);
+
+    if (req->sg.flags & NVME_SG_DMA) {
+        req->aiocb = dma_blk_zone_append(blk, &req->sg.qsg, (int64_t)offset,
+                                         align, cb, aiocb);
+    } else {
+        req->aiocb = blk_aio_zone_append(blk, offset, &req->sg.iov, 0,
+                                         cb, aiocb);
+    }
+}
+
+static void nvme_zone_append_cb(void *opaque, int ret)
+{
+    NvmeZoneCmdAIOCB *aiocb = opaque;
+    NvmeRequest *req = aiocb->req;
+    NvmeNamespace *ns = req->ns;
+
+    BlockBackend *blk = ns->blkconf.blk;
+
+    trace_pci_nvme_rw_cb(nvme_cid(req), blk_name(blk));
+
+    if (ret) {
+        goto out;
+    }
+
+    if (ns->lbaf.ms) {
+        NvmeRwCmd *rw = (NvmeRwCmd *)&req->cmd;
+        uint32_t nlb = (uint32_t)le16_to_cpu(rw->nlb) + 1;
+        int64_t offset = aiocb->zone_append_data.offset;
+
+        if (nvme_ns_ext(ns) || req->cmd.mptr) {
+            uint16_t status;
+
+            nvme_sg_unmap(&req->sg);
+            status = nvme_map_mdata(nvme_ctrl(req), nlb, req);
+            if (status) {
+                ret = -EFAULT;
+                goto out;
+            }
+
+            return nvme_blk_zone_append(blk, &offset, 1,
+                                        nvme_blk_zone_append_complete_cb,
+                                        aiocb);
+        }
+    }
+
+out:
+    nvme_blk_zone_append_complete_cb(aiocb, ret);
+}
+
+
 void nvme_rw_complete_cb(void *opaque, int ret)
 {
     NvmeRequest *req = opaque;
@@ -3061,6 +3150,9 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     uint64_t mapped_size = data_size;
     uint64_t data_offset;
     BlockBackend *blk = ns->blkconf.blk;
+    BlockZoneWps *wps = blk_get_zone_wps(blk);
+    uint32_t zone_size = blk_get_zone_size(blk);
+    uint32_t zone_idx;
     uint16_t status;
 
     if (nvme_ns_ext(ns)) {
@@ -3091,42 +3183,47 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
     }
 
     if (blk_get_zone_model(blk)) {
-        uint32_t zone_size = blk_get_zone_size(blk);
-        uint32_t zone_idx = slba / zone_size;
-        int64_t zone_start = zone_idx * zone_size;
+        assert(wps);
+        if (zone_size) {
+            zone_idx = slba / zone_size;
+            int64_t zone_start = zone_idx * zone_size;
+
+            if (append) {
+                bool piremap = !!(ctrl & NVME_RW_PIREMAP);
+
+                if (n->params.zasl &&
+                    data_size > (uint64_t)
+                    n->page_size << n->params.zasl) {
+                    trace_pci_nvme_err_zasl(data_size);
+                    return NVME_INVALID_FIELD | NVME_DNR;
+                }
 
-        if (append) {
-            bool piremap = !!(ctrl & NVME_RW_PIREMAP);
+                rw->slba = cpu_to_le64(slba);
 
-            if (n->params.zasl &&
-                data_size > (uint64_t)n->page_size << n->params.zasl) {
-                trace_pci_nvme_err_zasl(data_size);
-                return NVME_INVALID_FIELD | NVME_DNR;
-            }
+                switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
+                case NVME_ID_NS_DPS_TYPE_1:
+                    if (!piremap) {
+                        return NVME_INVALID_PROT_INFO | NVME_DNR;
+                    }
 
-            rw->slba = cpu_to_le64(slba);
-            switch (NVME_ID_NS_DPS_TYPE(ns->id_ns.dps)) {
-            case NVME_ID_NS_DPS_TYPE_1:
-                if (!piremap) {
-                    return NVME_INVALID_PROT_INFO | NVME_DNR;
-                }
+                    /* fallthrough */
 
-                /* fallthrough */
+                case NVME_ID_NS_DPS_TYPE_2:
+                    if (piremap) {
+                        uint32_t reftag = le32_to_cpu(rw->reftag);
+                        rw->reftag =
+                            cpu_to_le32(reftag + (slba - zone_start));
+                    }
 
-            case NVME_ID_NS_DPS_TYPE_2:
-                if (piremap) {
-                    uint32_t reftag = le32_to_cpu(rw->reftag);
-                    rw->reftag = cpu_to_le32(reftag + (slba - zone_start));
-                }
+                    break;
 
-                break;
+                case NVME_ID_NS_DPS_TYPE_3:
+                    if (piremap) {
+                        return NVME_INVALID_PROT_INFO | NVME_DNR;
+                    }
 
-            case NVME_ID_NS_DPS_TYPE_3:
-                if (piremap) {
-                    return NVME_INVALID_PROT_INFO | NVME_DNR;
+                    break;
                 }
-
-                break;
             }
         }
 
@@ -3146,9 +3243,21 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest *req, bool append,
             goto invalid;
         }
 
-        block_acct_start(blk_get_stats(blk), &req->acct, data_size,
-                         BLOCK_ACCT_WRITE);
-        nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+        if (append) {
+            NvmeZoneCmdAIOCB *cb = g_malloc(sizeof(NvmeZoneCmdAIOCB));
+            cb->req = req;
+            cb->zone_append_data.offset = data_offset;
+
+            block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+                             BLOCK_ACCT_ZONE_APPEND);
+            nvme_blk_zone_append(blk, &cb->zone_append_data.offset,
+                                 blk_get_write_granularity(blk),
+                                 nvme_zone_append_cb, cb);
+        } else {
+            block_acct_start(blk_get_stats(blk), &req->acct, data_size,
+                             BLOCK_ACCT_WRITE);
+            nvme_blk_write(blk, data_offset, BDRV_SECTOR_SIZE, nvme_rw_cb, req);
+        }
     } else {
         req->aiocb = blk_aio_pwrite_zeroes(blk, data_offset, data_size,
                                            BDRV_REQ_MAY_UNMAP, nvme_rw_cb,
@@ -3172,24 +3281,7 @@ static inline uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeRequest *req)
     return nvme_do_write(n, req, false, true);
 }
 
-typedef struct NvmeZoneCmdAIOCB {
-    NvmeRequest *req;
-    NvmeCmd *cmd;
-    NvmeCtrl *n;
-
-    union {
-        struct {
-          uint32_t partial;
-          unsigned int nr_zones;
-          BlockZoneDescriptor *zones;
-        } zone_report_data;
-        struct {
-          int64_t offset;
-        } zone_append_data;
-    };
-} NvmeZoneCmdAIOCB;
-
-static inline uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
+static uint16_t nvme_zone_append(NvmeCtrl *n, NvmeRequest *req)
 {
     return nvme_do_write(n, req, true, false);
 }
diff --git a/include/sysemu/dma.h b/include/sysemu/dma.h
index a1ac5bc1b5..680e0b5477 100644
--- a/include/sysemu/dma.h
+++ b/include/sysemu/dma.h
@@ -301,6 +301,9 @@ BlockAIOCB *dma_blk_read(BlockBackend *blk,
 BlockAIOCB *dma_blk_write(BlockBackend *blk,
                           QEMUSGList *sg, uint64_t offset, uint32_t align,
                           BlockCompletionFunc *cb, void *opaque);
+BlockAIOCB *dma_blk_zone_append(BlockBackend *blk,
+                          QEMUSGList *sg, int64_t offset, uint32_t align,
+                          void (*cb)(void *opaque, int ret), void *opaque);
 MemTxResult dma_buf_read(void *ptr, dma_addr_t len, dma_addr_t *residual,
                          QEMUSGList *sg, MemTxAttrs attrs);
 MemTxResult dma_buf_write(void *ptr, dma_addr_t len, dma_addr_t *residual,
diff --git a/system/dma-helpers.c b/system/dma-helpers.c
index 36211acc7e..98c97a165d 100644
--- a/system/dma-helpers.c
+++ b/system/dma-helpers.c
@@ -274,6 +274,23 @@ BlockAIOCB *dma_blk_write(BlockBackend *blk,
                       DMA_DIRECTION_TO_DEVICE);
 }
 
+static
+BlockAIOCB *dma_blk_zone_append_io_func(int64_t offset, QEMUIOVector *iov,
+                                  BlockCompletionFunc *cb, void *cb_opaque,
+                                  void *opaque)
+{
+    BlockBackend *blk = opaque;
+    return blk_aio_zone_append(blk, (int64_t *)offset, iov, 0, cb, cb_opaque);
+}
+
+BlockAIOCB *dma_blk_zone_append(BlockBackend *blk,
+                          QEMUSGList *sg, int64_t offset, uint32_t align,
+                          void (*cb)(void *opaque, int ret), void *opaque)
+{
+    return dma_blk_io(blk_get_aio_context(blk), sg, offset, align,
+                      dma_blk_zone_append_io_func, blk, cb, opaque,
+                      DMA_DIRECTION_TO_DEVICE);
+}
 
 static MemTxResult dma_buf_rw(void *buf, dma_addr_t len, dma_addr_t *residual,
                               QEMUSGList *sg, DMADirection dir,
-- 
2.40.1