From: Klaus Jensen <k.jensen@samsung.com>
This refactors how the device issues asynchronous block backend
requests. The NvmeRequest now holds a queue of NvmeAIOs that are
associated with the command. This allows multiple aios to be issued for
a command. Only when all requests have been completed will the device
post a completion queue entry.
Because the device is currently guaranteed to only issue a single aio
request per command, the benefit is not immediately obvious. But this
functionality is required to support metadata, the dataset management
command and other features.
Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
Acked-by: Keith Busch <kbusch@kernel.org>
---
hw/block/nvme.c | 377 +++++++++++++++++++++++++++++++-----------
hw/block/nvme.h | 129 +++++++++++++--
hw/block/trace-events | 6 +
3 files changed, 407 insertions(+), 105 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 0d2b5b45b0c5..817384e3b1a9 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -59,6 +59,7 @@
} while (0)
static void nvme_process_sq(void *opaque);
+static void nvme_aio_cb(void *opaque, int ret);
static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
{
@@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
}
+static void nvme_aio_destroy(NvmeAIO *aio)
+{
+ g_free(aio);
+}
+
+static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
+ NvmeAIOOp opc)
+{
+ aio->opc = opc;
+
+ trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
+ aio->offset, aio->len,
+ nvme_aio_opc_str(aio), req);
+
+ if (req) {
+ QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
+ }
+}
+
+static void nvme_submit_aio(NvmeAIO *aio)
+{
+ BlockBackend *blk = aio->blk;
+ BlockAcctCookie *acct = &aio->acct;
+ BlockAcctStats *stats = blk_get_stats(blk);
+
+ bool is_write;
+
+ switch (aio->opc) {
+ case NVME_AIO_OPC_NONE:
+ break;
+
+ case NVME_AIO_OPC_FLUSH:
+ block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
+ aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
+ break;
+
+ case NVME_AIO_OPC_WRITE_ZEROES:
+ block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
+ aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
+ BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
+ aio);
+ break;
+
+ case NVME_AIO_OPC_READ:
+ case NVME_AIO_OPC_WRITE:
+ is_write = (aio->opc == NVME_AIO_OPC_WRITE);
+
+ block_acct_start(stats, acct, aio->len,
+ is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+
+ if (aio->qsg) {
+ if (is_write) {
+ aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
+ BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+ } else {
+ aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
+ BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
+ }
+ } else {
+ if (is_write) {
+ aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
+ nvme_aio_cb, aio);
+ } else {
+ aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
+ nvme_aio_cb, aio);
+ }
+ }
+
+ break;
+ }
+}
+
+static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
+{
+ NvmeAIO *aio;
+ size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
+
+ aio = g_new0(NvmeAIO, 1);
+
+ *aio = (NvmeAIO) {
+ .blk = blk,
+ .offset = offset,
+ .len = len,
+ .req = req,
+ .qsg = req->qsg.sg ? &req->qsg : NULL,
+ .iov = req->iov.iov ? &req->iov : NULL,
+ };
+
+ nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
+ NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
+ nvme_submit_aio(aio);
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -396,6 +490,7 @@ static void nvme_post_cqes(void *opaque)
nvme_inc_cq_tail(cq);
pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
sizeof(req->cqe));
+ nvme_req_clear(req);
QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
}
if (cq->tail != cq->head) {
@@ -406,8 +501,8 @@ static void nvme_post_cqes(void *opaque)
static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
{
assert(cq->cqid == req->sq->cqid);
- trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
- req->status);
+ trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid, req->status);
+
QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
@@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
return NVME_SUCCESS;
}
-static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
- uint16_t ctrl, NvmeRequest *req)
+static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
+ NvmeRequest *req)
{
+ NvmeNamespace *ns = req->ns;
+
if ((ctrl & NVME_RW_PRINFO_PRACT) && !(ns->id_ns.dps & DPS_TYPE_MASK)) {
trace_nvme_dev_err_prinfo(nvme_cid(req), ctrl);
return NVME_INVALID_FIELD | NVME_DNR;
@@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
return NVME_SUCCESS;
}
-static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
- uint64_t slba, uint32_t nlb,
- NvmeRequest *req)
+static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
+ uint32_t nlb, NvmeRequest *req)
{
+ NvmeNamespace *ns = req->ns;
uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
@@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
return NVME_SUCCESS;
}
-static void nvme_rw_cb(void *opaque, int ret)
+static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
+{
+ NvmeNamespace *ns = req->ns;
+ NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
+ uint16_t ctrl = le16_to_cpu(rw->control);
+ size_t len = req->nlb << nvme_ns_lbads(ns);
+ uint16_t status;
+
+ status = nvme_check_mdts(n, len, req);
+ if (status) {
+ return status;
+ }
+
+ status = nvme_check_prinfo(n, ctrl, req);
+ if (status) {
+ return status;
+ }
+
+ status = nvme_check_bounds(n, req->slba, req->nlb, req);
+ if (status) {
+ return status;
+ }
+
+ return NVME_SUCCESS;
+}
+
+static void nvme_rw_cb(NvmeRequest *req, void *opaque)
{
- NvmeRequest *req = opaque;
NvmeSQueue *sq = req->sq;
NvmeCtrl *n = sq->ctrl;
NvmeCQueue *cq = n->cq[sq->cqid];
- if (!ret) {
- block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
- req->status = NVME_SUCCESS;
- } else {
- block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
- req->status = NVME_INTERNAL_DEV_ERROR;
- }
-
- if (req->qsg.nalloc) {
- qemu_sglist_destroy(&req->qsg);
- }
- if (req->iov.nalloc) {
- qemu_iovec_destroy(&req->iov);
- }
+ trace_nvme_dev_rw_cb(nvme_cid(req), req->cmd.nsid);
nvme_enqueue_req_completion(cq, req);
}
-static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
- NvmeRequest *req)
+static void nvme_aio_cb(void *opaque, int ret)
{
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
- BLOCK_ACCT_FLUSH);
- req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
+ NvmeAIO *aio = opaque;
+ NvmeRequest *req = aio->req;
+
+ BlockBackend *blk = aio->blk;
+ BlockAcctCookie *acct = &aio->acct;
+ BlockAcctStats *stats = blk_get_stats(blk);
+
+ Error *local_err = NULL;
+
+ trace_nvme_dev_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset,
+ nvme_aio_opc_str(aio), req);
+
+ if (req) {
+ QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);
+ }
+
+ if (!ret) {
+ block_acct_done(stats, acct);
+ } else {
+ block_acct_failed(stats, acct);
+
+ if (req) {
+ uint16_t status;
+
+ switch (aio->opc) {
+ case NVME_AIO_OPC_READ:
+ status = NVME_UNRECOVERED_READ;
+ break;
+ case NVME_AIO_OPC_WRITE:
+ case NVME_AIO_OPC_WRITE_ZEROES:
+ status = NVME_WRITE_FAULT;
+ break;
+ default:
+ status = NVME_INTERNAL_DEV_ERROR;
+ break;
+ }
+
+ trace_nvme_dev_err_aio(nvme_cid(req), aio, blk_name(blk),
+ aio->offset, nvme_aio_opc_str(aio), req,
+ status);
+
+ error_setg_errno(&local_err, -ret, "aio failed");
+ error_report_err(local_err);
+
+ /*
+ * An Internal Error trumps all other errors. For other errors,
+ * only set the first error encountered. Any additional errors will
+ * be recorded in the error information log page.
+ */
+ if (!req->status ||
+ nvme_status_is_error(status, NVME_INTERNAL_DEV_ERROR)) {
+ req->status = status;
+ }
+ }
+ }
+
+ if (aio->cb) {
+ aio->cb(aio, aio->cb_arg, ret);
+ }
+
+ if (req && QTAILQ_EMPTY(&req->aio_tailq)) {
+ if (req->cb) {
+ req->cb(req, req->cb_arg);
+ } else {
+ NvmeSQueue *sq = req->sq;
+ NvmeCtrl *n = sq->ctrl;
+ NvmeCQueue *cq = n->cq[sq->cqid];
+
+ nvme_enqueue_req_completion(cq, req);
+ }
+ }
+
+ nvme_aio_destroy(aio);
+}
+
+static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
+{
+ NvmeAIO *aio = g_new0(NvmeAIO, 1);
+
+ *aio = (NvmeAIO) {
+ .blk = n->conf.blk,
+ .req = req,
+ };
+
+ nvme_req_register_aio(req, aio, NVME_AIO_OPC_FLUSH);
+ nvme_submit_aio(aio);
return NVME_NO_COMPLETE;
}
-static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
- NvmeRequest *req)
+static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
- NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
- const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
- const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
- uint64_t slba = le64_to_cpu(rw->slba);
- uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
- uint64_t offset = slba << data_shift;
- uint32_t count = nlb << data_shift;
+ NvmeAIO *aio;
+
+ NvmeNamespace *ns = req->ns;
+ NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
uint16_t ctrl = le16_to_cpu(rw->control);
+
+ int64_t offset;
+ size_t count;
uint16_t status;
- status = nvme_check_prinfo(n, ns, ctrl, req);
+ req->slba = le64_to_cpu(rw->slba);
+ req->nlb = le16_to_cpu(rw->nlb) + 1;
+
+ trace_nvme_dev_write_zeroes(nvme_cid(req), le32_to_cpu(cmd->nsid),
+ req->slba, req->nlb);
+
+ status = nvme_check_prinfo(n, ctrl, req);
if (status) {
goto invalid;
}
@@ -588,15 +784,26 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
goto invalid;
}
- status = nvme_check_bounds(n, ns, slba, nlb, req);
+ status = nvme_check_bounds(n, req->slba, req->nlb, req);
if (status) {
goto invalid;
}
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
- BLOCK_ACCT_WRITE);
- req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
- BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
+ offset = req->slba << nvme_ns_lbads(ns);
+ count = req->nlb << nvme_ns_lbads(ns);
+
+ aio = g_new0(NvmeAIO, 1);
+
+ *aio = (NvmeAIO) {
+ .blk = n->conf.blk,
+ .offset = offset,
+ .len = count,
+ .req = req,
+ };
+
+ nvme_req_register_aio(req, aio, NVME_AIO_OPC_WRITE_ZEROES);
+ nvme_submit_aio(aio);
+
return NVME_NO_COMPLETE;
invalid:
@@ -604,63 +811,36 @@ invalid:
return status;
}
-static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
- NvmeRequest *req)
+static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
- NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
- uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
- uint64_t slba = le64_to_cpu(rw->slba);
- uint16_t ctrl = le16_to_cpu(rw->control);
+ NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
+ NvmeNamespace *ns = req->ns;
+ uint32_t len;
+ int status;
- uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
- uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
- uint64_t data_size = (uint64_t)nlb << data_shift;
- uint64_t data_offset = slba << data_shift;
- int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
- enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
- uint16_t status;
+ enum BlockAcctType acct =
+ nvme_req_is_write(req) ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
- trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
+ req->nlb = le16_to_cpu(rw->nlb) + 1;
+ req->slba = le64_to_cpu(rw->slba);
- status = nvme_check_mdts(n, data_size, req);
- if (status) {
- goto invalid;
- }
+ len = req->nlb << nvme_ns_lbads(ns);
- status = nvme_check_prinfo(n, ns, ctrl, req);
- if (status) {
- goto invalid;
- }
+ trace_nvme_dev_rw(nvme_req_is_write(req) ? "write" : "read", req->nlb,
+ req->nlb << nvme_ns_lbads(req->ns), req->slba);
- status = nvme_check_bounds(n, ns, slba, nlb, req);
+ status = nvme_check_rw(n, req);
if (status) {
goto invalid;
}
- status = nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req);
+ status = nvme_map(n, cmd, &req->qsg, &req->iov, len, req);
if (status) {
goto invalid;
}
- if (req->qsg.nsg > 0) {
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
- acct);
-
- req->aiocb = is_write ?
- dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
- nvme_rw_cb, req) :
- dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
- nvme_rw_cb, req);
- } else {
- block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
- acct);
-
- req->aiocb = is_write ?
- blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
- req) :
- blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
- req);
- }
+ nvme_rw_aio(n->conf.blk, req->slba << nvme_ns_lbads(ns), req);
+ nvme_req_set_cb(req, nvme_rw_cb, NULL);
return NVME_NO_COMPLETE;
@@ -671,23 +851,26 @@ invalid:
static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
- NvmeNamespace *ns;
uint32_t nsid = le32_to_cpu(cmd->nsid);
+ trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
+ cmd->opcode);
+
if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
return NVME_INVALID_NSID | NVME_DNR;
}
- ns = &n->namespaces[nsid - 1];
+ req->ns = &n->namespaces[nsid - 1];
+
switch (cmd->opcode) {
case NVME_CMD_FLUSH:
- return nvme_flush(n, ns, cmd, req);
+ return nvme_flush(n, cmd, req);
case NVME_CMD_WRITE_ZEROS:
- return nvme_write_zeros(n, ns, cmd, req);
+ return nvme_write_zeroes(n, cmd, req);
case NVME_CMD_WRITE:
case NVME_CMD_READ:
- return nvme_rw(n, ns, cmd, req);
+ return nvme_rw(n, cmd, req);
default:
trace_nvme_dev_err_invalid_opc(cmd->opcode);
return NVME_INVALID_OPCODE | NVME_DNR;
@@ -711,6 +894,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
NvmeRequest *req, *next;
NvmeSQueue *sq;
NvmeCQueue *cq;
+ NvmeAIO *aio;
uint16_t qid = le16_to_cpu(c->qid);
if (unlikely(!qid || nvme_check_sqid(n, qid))) {
@@ -723,8 +907,11 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
sq = n->sq[qid];
while (!QTAILQ_EMPTY(&sq->out_req_list)) {
req = QTAILQ_FIRST(&sq->out_req_list);
- assert(req->aiocb);
- blk_aio_cancel(req->aiocb);
+ while (!QTAILQ_EMPTY(&req->aio_tailq)) {
+ aio = QTAILQ_FIRST(&req->aio_tailq);
+ assert(aio->aiocb);
+ blk_aio_cancel(aio->aiocb);
+ }
}
if (!nvme_check_cqid(n, sq->cqid)) {
cq = n->cq[sq->cqid];
@@ -761,6 +948,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
QTAILQ_INIT(&sq->out_req_list);
for (i = 0; i < sq->size; i++) {
sq->io_req[i].sq = sq;
+ QTAILQ_INIT(&(sq->io_req[i].aio_tailq));
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
@@ -1474,8 +1662,9 @@ static void nvme_process_sq(void *opaque)
req = QTAILQ_FIRST(&sq->req_list);
QTAILQ_REMOVE(&sq->req_list, req, entry);
QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
- memset(&req->cqe, 0, sizeof(req->cqe));
+
req->cqe.cid = cmd.cid;
+ memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
nvme_admin_cmd(n, &cmd, req);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index b05c2153aebf..5d5fa8c8833a 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -27,16 +27,58 @@ typedef struct NvmeAsyncEvent {
NvmeAerResult result;
} NvmeAsyncEvent;
-typedef struct NvmeRequest {
- struct NvmeSQueue *sq;
- BlockAIOCB *aiocb;
- uint16_t status;
- NvmeCqe cqe;
- BlockAcctCookie acct;
- QEMUSGList qsg;
- QEMUIOVector iov;
- QTAILQ_ENTRY(NvmeRequest)entry;
-} NvmeRequest;
+typedef struct NvmeRequest NvmeRequest;
+typedef void NvmeRequestCompletionFunc(NvmeRequest *req, void *opaque);
+
+struct NvmeRequest {
+ struct NvmeSQueue *sq;
+ struct NvmeNamespace *ns;
+
+ NvmeCqe cqe;
+ NvmeCmd cmd;
+ uint16_t status;
+
+ uint64_t slba;
+ uint32_t nlb;
+
+ QEMUSGList qsg;
+ QEMUIOVector iov;
+
+ NvmeRequestCompletionFunc *cb;
+ void *cb_arg;
+
+ QTAILQ_HEAD(, NvmeAIO) aio_tailq;
+ QTAILQ_ENTRY(NvmeRequest) entry;
+};
+
+static inline void nvme_req_clear(NvmeRequest *req)
+{
+ req->ns = NULL;
+ memset(&req->cqe, 0, sizeof(req->cqe));
+ req->status = NVME_SUCCESS;
+ req->slba = req->nlb = 0x0;
+ req->cb = req->cb_arg = NULL;
+
+ if (req->qsg.sg) {
+ qemu_sglist_destroy(&req->qsg);
+ }
+
+ if (req->iov.iov) {
+ qemu_iovec_destroy(&req->iov);
+ }
+}
+
+static inline void nvme_req_set_cb(NvmeRequest *req,
+ NvmeRequestCompletionFunc *cb, void *cb_arg)
+{
+ req->cb = cb;
+ req->cb_arg = cb_arg;
+}
+
+static inline void nvme_req_clear_cb(NvmeRequest *req)
+{
+ req->cb = req->cb_arg = NULL;
+}
typedef struct NvmeSQueue {
struct NvmeCtrl *ctrl;
@@ -88,6 +130,60 @@ static inline size_t nvme_ns_lbads_bytes(NvmeNamespace *ns)
return 1 << nvme_ns_lbads(ns);
}
+typedef enum NvmeAIOOp {
+ NVME_AIO_OPC_NONE = 0x0,
+ NVME_AIO_OPC_FLUSH = 0x1,
+ NVME_AIO_OPC_READ = 0x2,
+ NVME_AIO_OPC_WRITE = 0x3,
+ NVME_AIO_OPC_WRITE_ZEROES = 0x4,
+} NvmeAIOOp;
+
+typedef struct NvmeAIO NvmeAIO;
+typedef void NvmeAIOCompletionFunc(NvmeAIO *aio, void *opaque, int ret);
+
+struct NvmeAIO {
+ NvmeRequest *req;
+
+ NvmeAIOOp opc;
+ int64_t offset;
+ size_t len;
+ BlockBackend *blk;
+ BlockAIOCB *aiocb;
+ BlockAcctCookie acct;
+
+ NvmeAIOCompletionFunc *cb;
+ void *cb_arg;
+
+ QEMUSGList *qsg;
+ QEMUIOVector *iov;
+
+ QTAILQ_ENTRY(NvmeAIO) tailq_entry;
+};
+
+static inline const char *nvme_aio_opc_str(NvmeAIO *aio)
+{
+ switch (aio->opc) {
+ case NVME_AIO_OPC_NONE: return "NVME_AIO_OP_NONE";
+ case NVME_AIO_OPC_FLUSH: return "NVME_AIO_OP_FLUSH";
+ case NVME_AIO_OPC_READ: return "NVME_AIO_OP_READ";
+ case NVME_AIO_OPC_WRITE: return "NVME_AIO_OP_WRITE";
+ case NVME_AIO_OPC_WRITE_ZEROES: return "NVME_AIO_OP_WRITE_ZEROES";
+ default: return "NVME_AIO_OP_UNKNOWN";
+ }
+}
+
+static inline bool nvme_req_is_write(NvmeRequest *req)
+{
+ switch (req->cmd.opcode) {
+ case NVME_CMD_WRITE:
+ case NVME_CMD_WRITE_UNCOR:
+ case NVME_CMD_WRITE_ZEROS:
+ return true;
+ default:
+ return false;
+ }
+}
+
#define TYPE_NVME "nvme"
#define NVME(obj) \
OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
@@ -140,10 +236,21 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
static inline uint16_t nvme_cid(NvmeRequest *req)
{
if (req) {
- return le16_to_cpu(req->cqe.cid);
+ return le16_to_cpu(req->cmd.cid);
}
return 0xffff;
}
+static inline bool nvme_status_is_error(uint16_t status, uint16_t err)
+{
+ /* strip DNR and MORE */
+ return (status & 0xfff) == err;
+}
+
+static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
+{
+ return req->sq->ctrl;
+}
+
#endif /* HW_NVME_H */
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 2aceb0537e05..aa449e314818 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -34,7 +34,12 @@ nvme_dev_irq_pin(void) "pulsing IRQ pin"
nvme_dev_irq_masked(void) "IRQ is masked"
nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
nvme_dev_map_prp(uint16_t cid, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
+nvme_dev_req_register_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count %"PRIu64" opc \"%s\" req %p"
+nvme_dev_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
+nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
nvme_dev_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
+nvme_dev_rw_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+nvme_dev_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
nvme_dev_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
nvme_dev_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
nvme_dev_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
@@ -81,6 +86,7 @@ nvme_dev_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" new_
# nvme traces for error conditions
nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"PRIu64" len %"PRIu64""
nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
+nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
--
2.25.1
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen <k.jensen@samsung.com>
>
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
>
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.
>
> Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> Acked-by: Keith Busch <kbusch@kernel.org>
> ---
> hw/block/nvme.c | 377 +++++++++++++++++++++++++++++++-----------
> hw/block/nvme.h | 129 +++++++++++++--
> hw/block/trace-events | 6 +
> 3 files changed, 407 insertions(+), 105 deletions(-)
>
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 0d2b5b45b0c5..817384e3b1a9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -59,6 +59,7 @@
> } while (0)
>
> static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>
> static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> {
> @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> }
>
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> + g_free(aio);
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
I guess I'll call this nvme_req_add_aio,
or nvme_add_aio_to_req.
Thoughts?
Also you can leave this as is, but add a comment on top explaining this
> + NvmeAIOOp opc)
> +{
> + aio->opc = opc;
> +
> + trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> + aio->offset, aio->len,
> + nvme_aio_opc_str(aio), req);
> +
> + if (req) {
> + QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> + }
> +}
> +
> +static void nvme_submit_aio(NvmeAIO *aio)
OK, this name makes sense
Also please add a comment on top.
> +{
> + BlockBackend *blk = aio->blk;
> + BlockAcctCookie *acct = &aio->acct;
> + BlockAcctStats *stats = blk_get_stats(blk);
> +
> + bool is_write;
> +
> + switch (aio->opc) {
> + case NVME_AIO_OPC_NONE:
> + break;
> +
> + case NVME_AIO_OPC_FLUSH:
> + block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> + aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> + break;
> +
> + case NVME_AIO_OPC_WRITE_ZEROES:
> + block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> + aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> + BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
> + aio);
> + break;
> +
> + case NVME_AIO_OPC_READ:
> + case NVME_AIO_OPC_WRITE:
> + is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> + block_acct_start(stats, acct, aio->len,
> + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> + if (aio->qsg) {
> + if (is_write) {
> + aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
> + BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> + } else {
> + aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
> + BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> + }
> + } else {
> + if (is_write) {
> + aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
> + nvme_aio_cb, aio);
> + } else {
> + aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
> + nvme_aio_cb, aio);
> + }
> + }
Looks much better this way than an early return!
> +
> + break;
> + }
> +}
> +
> +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
> +{
> + NvmeAIO *aio;
> + size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
> +
> + aio = g_new0(NvmeAIO, 1);
> +
> + *aio = (NvmeAIO) {
> + .blk = blk,
> + .offset = offset,
> + .len = len,
> + .req = req,
> + .qsg = req->qsg.sg ? &req->qsg : NULL,
> + .iov = req->iov.iov ? &req->iov : NULL,
OK, this is the fix for the bug I mentioned in V5, looks good.
> + };
> +
> + nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
> + NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
> + nvme_submit_aio(aio);
> +}
> +
> static void nvme_post_cqes(void *opaque)
> {
> NvmeCQueue *cq = opaque;
> @@ -396,6 +490,7 @@ static void nvme_post_cqes(void *opaque)
> nvme_inc_cq_tail(cq);
> pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
> sizeof(req->cqe));
> + nvme_req_clear(req);
> QTAILQ_INSERT_TAIL(&sq->req_list, req, entry);
> }
> if (cq->tail != cq->head) {
> @@ -406,8 +501,8 @@ static void nvme_post_cqes(void *opaque)
> static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
> {
> assert(cq->cqid == req->sq->cqid);
> - trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
> - req->status);
> + trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid, req->status);
> +
> QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
> QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
> timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> @@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> return NVME_SUCCESS;
> }
>
> -static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> - uint16_t ctrl, NvmeRequest *req)
> +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
> + NvmeRequest *req)
> {
> + NvmeNamespace *ns = req->ns;
> +
This should go to the patch that added nvme_check_prinfo
> if ((ctrl & NVME_RW_PRINFO_PRACT) && !(ns->id_ns.dps & DPS_TYPE_MASK)) {
> trace_nvme_dev_err_prinfo(nvme_cid(req), ctrl);
> return NVME_INVALID_FIELD | NVME_DNR;
> @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> return NVME_SUCCESS;
> }
>
> -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> - uint64_t slba, uint32_t nlb,
> - NvmeRequest *req)
> +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> + uint32_t nlb, NvmeRequest *req)
> {
> + NvmeNamespace *ns = req->ns;
> uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
This should go to the patch that added nvme_check_bounds as well
>
> if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> @@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> return NVME_SUCCESS;
> }
>
> -static void nvme_rw_cb(void *opaque, int ret)
> +static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
> +{
> + NvmeNamespace *ns = req->ns;
> + NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
> + uint16_t ctrl = le16_to_cpu(rw->control);
> + size_t len = req->nlb << nvme_ns_lbads(ns);
> + uint16_t status;
> +
> + status = nvme_check_mdts(n, len, req);
> + if (status) {
> + return status;
> + }
> +
> + status = nvme_check_prinfo(n, ctrl, req);
> + if (status) {
> + return status;
> + }
> +
> + status = nvme_check_bounds(n, req->slba, req->nlb, req);
> + if (status) {
> + return status;
> + }
> +
> + return NVME_SUCCESS;
> +}
Nitpick: I hate to say it, but nvme_check_rw should be in a separate patch as well.
It will also make the diff more readable (when adding a function and changing a function
at the same time, you get a diff between two unrelated things).
> +
> +static void nvme_rw_cb(NvmeRequest *req, void *opaque)
> {
> - NvmeRequest *req = opaque;
> NvmeSQueue *sq = req->sq;
> NvmeCtrl *n = sq->ctrl;
> NvmeCQueue *cq = n->cq[sq->cqid];
>
> - if (!ret) {
> - block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
> - req->status = NVME_SUCCESS;
> - } else {
> - block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
> - req->status = NVME_INTERNAL_DEV_ERROR;
> - }
> -
> - if (req->qsg.nalloc) {
> - qemu_sglist_destroy(&req->qsg);
> - }
> - if (req->iov.nalloc) {
> - qemu_iovec_destroy(&req->iov);
> - }
> + trace_nvme_dev_rw_cb(nvme_cid(req), req->cmd.nsid);
>
> nvme_enqueue_req_completion(cq, req);
> }
>
> -static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> - NvmeRequest *req)
> +static void nvme_aio_cb(void *opaque, int ret)
> {
> - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> - BLOCK_ACCT_FLUSH);
> - req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
> + NvmeAIO *aio = opaque;
> + NvmeRequest *req = aio->req;
> +
> + BlockBackend *blk = aio->blk;
> + BlockAcctCookie *acct = &aio->acct;
> + BlockAcctStats *stats = blk_get_stats(blk);
> +
> + Error *local_err = NULL;
> +
> + trace_nvme_dev_aio_cb(nvme_cid(req), aio, blk_name(blk), aio->offset,
> + nvme_aio_opc_str(aio), req);
> +
> + if (req) {
> + QTAILQ_REMOVE(&req->aio_tailq, aio, tailq_entry);
> + }
> +
> + if (!ret) {
> + block_acct_done(stats, acct);
> + } else {
> + block_acct_failed(stats, acct);
> +
> + if (req) {
> + uint16_t status;
> +
> + switch (aio->opc) {
> + case NVME_AIO_OPC_READ:
> + status = NVME_UNRECOVERED_READ;
> + break;
> + case NVME_AIO_OPC_WRITE:
> + case NVME_AIO_OPC_WRITE_ZEROES:
> + status = NVME_WRITE_FAULT;
> + break;
> + default:
> + status = NVME_INTERNAL_DEV_ERROR;
> + break;
> + }
> +
> + trace_nvme_dev_err_aio(nvme_cid(req), aio, blk_name(blk),
> + aio->offset, nvme_aio_opc_str(aio), req,
> + status);
> +
> + error_setg_errno(&local_err, -ret, "aio failed");
> + error_report_err(local_err);
> +
> + /*
> + * An Internal Error trumps all other errors. For other errors,
> + * only set the first error encountered. Any additional errors will
> + * be recorded in the error information log page.
> + */
> + if (!req->status ||
> + nvme_status_is_error(status, NVME_INTERNAL_DEV_ERROR)) {
> + req->status = status;
> + }
> + }
> + }
> +
> + if (aio->cb) {
> + aio->cb(aio, aio->cb_arg, ret);
> + }
> +
> + if (req && QTAILQ_EMPTY(&req->aio_tailq)) {
> + if (req->cb) {
> + req->cb(req, req->cb_arg);
> + } else {
> + NvmeSQueue *sq = req->sq;
> + NvmeCtrl *n = sq->ctrl;
> + NvmeCQueue *cq = n->cq[sq->cqid];
> +
> + nvme_enqueue_req_completion(cq, req);
> + }
> + }
> +
> + nvme_aio_destroy(aio);
> +}
> +
> +static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> + NvmeAIO *aio = g_new0(NvmeAIO, 1);
> +
> + *aio = (NvmeAIO) {
> + .blk = n->conf.blk,
> + .req = req,
> + };
> +
> + nvme_req_register_aio(req, aio, NVME_AIO_OPC_FLUSH);
> + nvme_submit_aio(aio);
>
> return NVME_NO_COMPLETE;
> }
>
> -static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> - NvmeRequest *req)
> +static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
Very small nitpick about zeros/zeroes: This should move to some refactoring patch to be honest.
> {
> - NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
> - const uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> - const uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> - uint64_t slba = le64_to_cpu(rw->slba);
> - uint32_t nlb = le16_to_cpu(rw->nlb) + 1;
> - uint64_t offset = slba << data_shift;
> - uint32_t count = nlb << data_shift;
> + NvmeAIO *aio;
> +
> + NvmeNamespace *ns = req->ns;
> + NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
> uint16_t ctrl = le16_to_cpu(rw->control);
> +
> + int64_t offset;
> + size_t count;
> uint16_t status;
>
> - status = nvme_check_prinfo(n, ns, ctrl, req);
> + req->slba = le64_to_cpu(rw->slba);
> + req->nlb = le16_to_cpu(rw->nlb) + 1;
> +
> + trace_nvme_dev_write_zeroes(nvme_cid(req), le32_to_cpu(cmd->nsid),
> + req->slba, req->nlb);
> +
> + status = nvme_check_prinfo(n, ctrl, req);
> if (status) {
> goto invalid;
> }
> @@ -588,15 +784,26 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> goto invalid;
> }
>
> - status = nvme_check_bounds(n, ns, slba, nlb, req);
> + status = nvme_check_bounds(n, req->slba, req->nlb, req);
> if (status) {
> goto invalid;
> }
>
> - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> - BLOCK_ACCT_WRITE);
> - req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
> - BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
> + offset = req->slba << nvme_ns_lbads(ns);
> + count = req->nlb << nvme_ns_lbads(ns);
> +
> + aio = g_new0(NvmeAIO, 1);
> +
> + *aio = (NvmeAIO) {
> + .blk = n->conf.blk,
> + .offset = offset,
> + .len = count,
> + .req = req,
> + };
> +
> + nvme_req_register_aio(req, aio, NVME_AIO_OPC_WRITE_ZEROES);
> + nvme_submit_aio(aio);
> +
> return NVME_NO_COMPLETE;
>
> invalid:
> @@ -604,63 +811,36 @@ invalid:
> return status;
> }
>
> -static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> - NvmeRequest *req)
> +static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> {
> - NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
> - uint32_t nlb = le32_to_cpu(rw->nlb) + 1;
> - uint64_t slba = le64_to_cpu(rw->slba);
> - uint16_t ctrl = le16_to_cpu(rw->control);
> + NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
> + NvmeNamespace *ns = req->ns;
> + uint32_t len;
> + int status;
>
> - uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
> - uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> - uint64_t data_size = (uint64_t)nlb << data_shift;
> - uint64_t data_offset = slba << data_shift;
> - int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
> - enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
> - uint16_t status;
> + enum BlockAcctType acct =
> + nvme_req_is_write(req) ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
>
> - trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
> + req->nlb = le16_to_cpu(rw->nlb) + 1;
> + req->slba = le64_to_cpu(rw->slba);
>
> - status = nvme_check_mdts(n, data_size, req);
> - if (status) {
> - goto invalid;
> - }
> + len = req->nlb << nvme_ns_lbads(ns);
>
> - status = nvme_check_prinfo(n, ns, ctrl, req);
> - if (status) {
> - goto invalid;
> - }
> + trace_nvme_dev_rw(nvme_req_is_write(req) ? "write" : "read", req->nlb,
> + req->nlb << nvme_ns_lbads(req->ns), req->slba);
>
> - status = nvme_check_bounds(n, ns, slba, nlb, req);
> + status = nvme_check_rw(n, req);
> if (status) {
> goto invalid;
> }
>
> - status = nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req);
> + status = nvme_map(n, cmd, &req->qsg, &req->iov, len, req);
> if (status) {
> goto invalid;
> }
>
> - if (req->qsg.nsg > 0) {
> - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->qsg.size,
> - acct);
> -
> - req->aiocb = is_write ?
> - dma_blk_write(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
> - nvme_rw_cb, req) :
> - dma_blk_read(n->conf.blk, &req->qsg, data_offset, BDRV_SECTOR_SIZE,
> - nvme_rw_cb, req);
> - } else {
> - block_acct_start(blk_get_stats(n->conf.blk), &req->acct, req->iov.size,
> - acct);
> -
> - req->aiocb = is_write ?
> - blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
> - req) :
> - blk_aio_preadv(n->conf.blk, data_offset, &req->iov, 0, nvme_rw_cb,
> - req);
> - }
> + nvme_rw_aio(n->conf.blk, req->slba << nvme_ns_lbads(ns), req);
> + nvme_req_set_cb(req, nvme_rw_cb, NULL);
>
> return NVME_NO_COMPLETE;
>
> @@ -671,23 +851,26 @@ invalid:
>
> static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> {
> - NvmeNamespace *ns;
> uint32_t nsid = le32_to_cpu(cmd->nsid);
>
> + trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
> + cmd->opcode);
> +
> if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
> trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
> return NVME_INVALID_NSID | NVME_DNR;
> }
>
> - ns = &n->namespaces[nsid - 1];
> + req->ns = &n->namespaces[nsid - 1];
> +
> switch (cmd->opcode) {
> case NVME_CMD_FLUSH:
> - return nvme_flush(n, ns, cmd, req);
> + return nvme_flush(n, cmd, req);
> case NVME_CMD_WRITE_ZEROS:
> - return nvme_write_zeros(n, ns, cmd, req);
> + return nvme_write_zeroes(n, cmd, req);
> case NVME_CMD_WRITE:
> case NVME_CMD_READ:
> - return nvme_rw(n, ns, cmd, req);
> + return nvme_rw(n, cmd, req);
> default:
> trace_nvme_dev_err_invalid_opc(cmd->opcode);
> return NVME_INVALID_OPCODE | NVME_DNR;
> @@ -711,6 +894,7 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
> NvmeRequest *req, *next;
> NvmeSQueue *sq;
> NvmeCQueue *cq;
> + NvmeAIO *aio;
> uint16_t qid = le16_to_cpu(c->qid);
>
> if (unlikely(!qid || nvme_check_sqid(n, qid))) {
> @@ -723,8 +907,11 @@ static uint16_t nvme_del_sq(NvmeCtrl *n, NvmeCmd *cmd)
> sq = n->sq[qid];
> while (!QTAILQ_EMPTY(&sq->out_req_list)) {
> req = QTAILQ_FIRST(&sq->out_req_list);
> - assert(req->aiocb);
> - blk_aio_cancel(req->aiocb);
> + while (!QTAILQ_EMPTY(&req->aio_tailq)) {
> + aio = QTAILQ_FIRST(&req->aio_tailq);
> + assert(aio->aiocb);
> + blk_aio_cancel(aio->aiocb);
> + }
> }
> if (!nvme_check_cqid(n, sq->cqid)) {
> cq = n->cq[sq->cqid];
> @@ -761,6 +948,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
> QTAILQ_INIT(&sq->out_req_list);
> for (i = 0; i < sq->size; i++) {
> sq->io_req[i].sq = sq;
> + QTAILQ_INIT(&(sq->io_req[i].aio_tailq));
> QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
> }
> sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
> @@ -1474,8 +1662,9 @@ static void nvme_process_sq(void *opaque)
> req = QTAILQ_FIRST(&sq->req_list);
> QTAILQ_REMOVE(&sq->req_list, req, entry);
> QTAILQ_INSERT_TAIL(&sq->out_req_list, req, entry);
> - memset(&req->cqe, 0, sizeof(req->cqe));
> +
> req->cqe.cid = cmd.cid;
> + memcpy(&req->cmd, &cmd, sizeof(NvmeCmd));
>
> status = sq->sqid ? nvme_io_cmd(n, &cmd, req) :
> nvme_admin_cmd(n, &cmd, req);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index b05c2153aebf..5d5fa8c8833a 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -27,16 +27,58 @@ typedef struct NvmeAsyncEvent {
> NvmeAerResult result;
> } NvmeAsyncEvent;
>
> -typedef struct NvmeRequest {
> - struct NvmeSQueue *sq;
> - BlockAIOCB *aiocb;
> - uint16_t status;
> - NvmeCqe cqe;
> - BlockAcctCookie acct;
> - QEMUSGList qsg;
> - QEMUIOVector iov;
> - QTAILQ_ENTRY(NvmeRequest)entry;
> -} NvmeRequest;
> +typedef struct NvmeRequest NvmeRequest;
> +typedef void NvmeRequestCompletionFunc(NvmeRequest *req, void *opaque);
> +
> +struct NvmeRequest {
> + struct NvmeSQueue *sq;
> + struct NvmeNamespace *ns;
> +
> + NvmeCqe cqe;
> + NvmeCmd cmd;
> + uint16_t status;
> +
> + uint64_t slba;
> + uint32_t nlb;
> +
> + QEMUSGList qsg;
> + QEMUIOVector iov;
> +
> + NvmeRequestCompletionFunc *cb;
> + void *cb_arg;
> +
> + QTAILQ_HEAD(, NvmeAIO) aio_tailq;
> + QTAILQ_ENTRY(NvmeRequest) entry;
> +};
> +
> +static inline void nvme_req_clear(NvmeRequest *req)
> +{
> + req->ns = NULL;
> + memset(&req->cqe, 0, sizeof(req->cqe));
> + req->status = NVME_SUCCESS;
> + req->slba = req->nlb = 0x0;
> + req->cb = req->cb_arg = NULL;
> +
> + if (req->qsg.sg) {
> + qemu_sglist_destroy(&req->qsg);
> + }
> +
> + if (req->iov.iov) {
> + qemu_iovec_destroy(&req->iov);
> + }
> +}
> +
> +static inline void nvme_req_set_cb(NvmeRequest *req,
> + NvmeRequestCompletionFunc *cb, void *cb_arg)
> +{
> + req->cb = cb;
> + req->cb_arg = cb_arg;
> +}
> +
> +static inline void nvme_req_clear_cb(NvmeRequest *req)
> +{
> + req->cb = req->cb_arg = NULL;
> +}
>
> typedef struct NvmeSQueue {
> struct NvmeCtrl *ctrl;
> @@ -88,6 +130,60 @@ static inline size_t nvme_ns_lbads_bytes(NvmeNamespace *ns)
> return 1 << nvme_ns_lbads(ns);
> }
>
> +typedef enum NvmeAIOOp {
> + NVME_AIO_OPC_NONE = 0x0,
> + NVME_AIO_OPC_FLUSH = 0x1,
> + NVME_AIO_OPC_READ = 0x2,
> + NVME_AIO_OPC_WRITE = 0x3,
> + NVME_AIO_OPC_WRITE_ZEROES = 0x4,
> +} NvmeAIOOp;
> +
> +typedef struct NvmeAIO NvmeAIO;
> +typedef void NvmeAIOCompletionFunc(NvmeAIO *aio, void *opaque, int ret);
> +
> +struct NvmeAIO {
> + NvmeRequest *req;
> +
> + NvmeAIOOp opc;
> + int64_t offset;
> + size_t len;
> + BlockBackend *blk;
> + BlockAIOCB *aiocb;
> + BlockAcctCookie acct;
> +
> + NvmeAIOCompletionFunc *cb;
> + void *cb_arg;
> +
> + QEMUSGList *qsg;
> + QEMUIOVector *iov;
> +
> + QTAILQ_ENTRY(NvmeAIO) tailq_entry;
> +};
> +
> +static inline const char *nvme_aio_opc_str(NvmeAIO *aio)
> +{
> + switch (aio->opc) {
> + case NVME_AIO_OPC_NONE: return "NVME_AIO_OP_NONE";
> + case NVME_AIO_OPC_FLUSH: return "NVME_AIO_OP_FLUSH";
> + case NVME_AIO_OPC_READ: return "NVME_AIO_OP_READ";
> + case NVME_AIO_OPC_WRITE: return "NVME_AIO_OP_WRITE";
> + case NVME_AIO_OPC_WRITE_ZEROES: return "NVME_AIO_OP_WRITE_ZEROES";
> + default: return "NVME_AIO_OP_UNKNOWN";
> + }
> +}
> +
> +static inline bool nvme_req_is_write(NvmeRequest *req)
> +{
> + switch (req->cmd.opcode) {
> + case NVME_CMD_WRITE:
> + case NVME_CMD_WRITE_UNCOR:
> + case NVME_CMD_WRITE_ZEROS:
> + return true;
> + default:
> + return false;
> + }
> +}
> +
> #define TYPE_NVME "nvme"
> #define NVME(obj) \
> OBJECT_CHECK(NvmeCtrl, (obj), TYPE_NVME)
> @@ -140,10 +236,21 @@ static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
> static inline uint16_t nvme_cid(NvmeRequest *req)
> {
> if (req) {
> - return le16_to_cpu(req->cqe.cid);
> + return le16_to_cpu(req->cmd.cid);
> }
>
> return 0xffff;
> }
>
> +static inline bool nvme_status_is_error(uint16_t status, uint16_t err)
> +{
> + /* strip DNR and MORE */
> + return (status & 0xfff) == err;
> +}
> +
> +static inline NvmeCtrl *nvme_ctrl(NvmeRequest *req)
> +{
> + return req->sq->ctrl;
> +}
> +
> #endif /* HW_NVME_H */
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 2aceb0537e05..aa449e314818 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -34,7 +34,12 @@ nvme_dev_irq_pin(void) "pulsing IRQ pin"
> nvme_dev_irq_masked(void) "IRQ is masked"
> nvme_dev_dma_read(uint64_t prp1, uint64_t prp2) "DMA read, prp1=0x%"PRIx64" prp2=0x%"PRIx64""
> nvme_dev_map_prp(uint16_t cid, uint64_t trans_len, uint32_t len, uint64_t prp1, uint64_t prp2, int num_prps) "cid %"PRIu16" trans_len %"PRIu64" len %"PRIu32" prp1 0x%"PRIx64" prp2 0x%"PRIx64" num_prps %d"
> +nvme_dev_req_register_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, uint64_t count, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" count %"PRIu64" opc \"%s\" req %p"
> +nvme_dev_aio_cb(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p"
> +nvme_dev_io_cmd(uint16_t cid, uint32_t nsid, uint16_t sqid, uint8_t opcode) "cid %"PRIu16" nsid %"PRIu32" sqid %"PRIu16" opc 0x%"PRIx8""
> nvme_dev_rw(const char *verb, uint32_t blk_count, uint64_t byte_count, uint64_t lba) "%s %"PRIu32" blocks (%"PRIu64" bytes) from LBA %"PRIu64""
> +nvme_dev_rw_cb(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
> +nvme_dev_write_zeroes(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
> nvme_dev_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
> nvme_dev_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
> nvme_dev_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
> @@ -81,6 +86,7 @@ nvme_dev_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" new_
> # nvme traces for error conditions
> nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts %"PRIu64" len %"PRIu64""
> nvme_dev_err_prinfo(uint16_t cid, uint16_t ctrl) "cid %"PRIu16" ctrl %"PRIu16""
> +nvme_dev_err_aio(uint16_t cid, void *aio, const char *blkname, uint64_t offset, const char *opc, void *req, uint16_t status) "cid %"PRIu16" aio %p blk \"%s\" offset %"PRIu64" opc \"%s\" req %p status 0x%"PRIx16""
> nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
> nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null or not page aligned: 0x%"PRIx64""
> nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 0x%"PRIx64""
The patch is still too large IMHO to review properly, and a few things can be split from it.
I tried my best to review it but I might have missed something.
Best regards,
Maxim Levitsky
On Mar 25 12:57, Maxim Levitsky wrote:
> On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > From: Klaus Jensen <k.jensen@samsung.com>
> >
> > This refactors how the device issues asynchronous block backend
> > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > associated with the command. This allows multiple aios to be issued for
> > a command. Only when all requests have been completed will the device
> > post a completion queue entry.
> >
> > Because the device is currently guaranteed to only issue a single aio
> > request per command, the benefit is not immediately obvious. But this
> > functionality is required to support metadata, the dataset management
> > command and other features.
> >
> > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > Acked-by: Keith Busch <kbusch@kernel.org>
> > ---
> > hw/block/nvme.c | 377 +++++++++++++++++++++++++++++++-----------
> > hw/block/nvme.h | 129 +++++++++++++--
> > hw/block/trace-events | 6 +
> > 3 files changed, 407 insertions(+), 105 deletions(-)
> >
> > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > index 0d2b5b45b0c5..817384e3b1a9 100644
> > --- a/hw/block/nvme.c
> > +++ b/hw/block/nvme.c
> > @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> > return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> > }
> >
> > +static void nvme_aio_destroy(NvmeAIO *aio)
> > +{
> > + g_free(aio);
> > +}
> > +
> > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> I guess I'll call this nvme_req_add_aio,
> or nvme_add_aio_to_req.
> Thoughts?
> Also you can leave this as is, but add a comment on top explaining this
>
nvme_req_add_aio it is :) And comment added.
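With the rename, the helper ends up roughly like this (the body is as in the
patch above; the comment wording, and whether the trace event gets renamed
as well, are details for the respin):

    /*
     * Register an NvmeAIO with the NvmeRequest it is carried out on behalf
     * of. The request is only completed once every aio on req->aio_tailq
     * has finished, so any aio issued for a command must be added here
     * before it is submitted. A NULL req is allowed for aios that are not
     * tied to a command.
     */
    static inline void nvme_req_add_aio(NvmeRequest *req, NvmeAIO *aio,
                                        NvmeAIOOp opc)
    {
        aio->opc = opc;

        trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
                                        aio->offset, aio->len,
                                        nvme_aio_opc_str(aio), req);

        if (req) {
            QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
        }
    }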
> > + NvmeAIOOp opc)
> > +{
> > + aio->opc = opc;
> > +
> > + trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> > + aio->offset, aio->len,
> > + nvme_aio_opc_str(aio), req);
> > +
> > + if (req) {
> > + QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> > + }
> > +}
> > +
> > +static void nvme_submit_aio(NvmeAIO *aio)
> OK, this name makes sense
> Also please add a comment on top.
Done.
> > @@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> > return NVME_SUCCESS;
> > }
> >
> > -static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > - uint16_t ctrl, NvmeRequest *req)
> > +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
> > + NvmeRequest *req)
> > {
> > + NvmeNamespace *ns = req->ns;
> > +
> This should go to the patch that added nvme_check_prinfo
>
Probably killing that patch.
> > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > return NVME_SUCCESS;
> > }
> >
> > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > - uint64_t slba, uint32_t nlb,
> > - NvmeRequest *req)
> > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > + uint32_t nlb, NvmeRequest *req)
> > {
> > + NvmeNamespace *ns = req->ns;
> > uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> This should go to the patch that added nvme_check_bounds as well
>
We can't really, because the NvmeRequest does not hold a reference to
the namespace as a struct member at that point. This is also an issue
with the nvme_check_prinfo function above.
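To make the dependency concrete, here is a tiny self-contained sketch (toy
structs, not the device code) of why the check helpers can only take the
namespace from req->ns after the dispatcher has filled it in:

    #include <stdint.h>
    #include <stdio.h>

    typedef struct Namespace { uint64_t nsze; } Namespace;
    typedef struct Request   { Namespace *ns; } Request;

    static uint16_t check_bounds(uint64_t slba, uint32_t nlb, Request *req)
    {
        Namespace *ns = req->ns;        /* only valid once ns is filled in */

        if (UINT64_MAX - slba < nlb || slba + nlb > ns->nsze) {
            return 0x80;                /* LBA out of range (toy value) */
        }

        return 0;
    }

    static uint16_t io_cmd(Request *req, Namespace *namespaces, uint32_t nsid)
    {
        req->ns = &namespaces[nsid - 1];    /* must happen before any check */
        return check_bounds(0, 8, req);
    }

    int main(void)
    {
        Namespace ns = { .nsze = 16 };
        Request req = { 0 };

        printf("status 0x%x\n", io_cmd(&req, &ns, 1));
        return 0;
    }

In the earlier patch that introduces nvme_check_bounds there is no req->ns
yet, so the helper there still has to take the namespace as an explicit
parameter.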
> >
> > if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> > @@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > return NVME_SUCCESS;
> > }
> >
> > -static void nvme_rw_cb(void *opaque, int ret)
> > +static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
> > +{
> > + NvmeNamespace *ns = req->ns;
> > + NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
> > + uint16_t ctrl = le16_to_cpu(rw->control);
> > + size_t len = req->nlb << nvme_ns_lbads(ns);
> > + uint16_t status;
> > +
> > + status = nvme_check_mdts(n, len, req);
> > + if (status) {
> > + return status;
> > + }
> > +
> > + status = nvme_check_prinfo(n, ctrl, req);
> > + if (status) {
> > + return status;
> > + }
> > +
> > + status = nvme_check_bounds(n, req->slba, req->nlb, req);
> > + if (status) {
> > + return status;
> > + }
> > +
> > + return NVME_SUCCESS;
> > +}
>
> Nitpick: I hate to say it, but nvme_check_rw should be in a separate patch as well.
> It will also make the diff more readable (when adding a function and changing a function
> at the same time, you get a diff between two unrelated things).
>
Done, but had to do it as a follow up patch.
> >
> > -static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> > - NvmeRequest *req)
> > +static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> Very small nitpick about zeros/zeroes: This should move to some refactoring patch to be honest.
>
Done ;)
>
> > The patch is still too large IMHO to review properly, and a few things can be split from it.
> I tried my best to review it but I might have missed something.
>
Yeah, I know, but thanks for trying!
On Tue, 2020-03-31 at 07:47 +0200, Klaus Birkelund Jensen wrote:
> On Mar 25 12:57, Maxim Levitsky wrote:
> > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > From: Klaus Jensen <k.jensen@samsung.com>
> > >
> > > This refactors how the device issues asynchronous block backend
> > > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > > associated with the command. This allows multiple aios to be issued for
> > > a command. Only when all requests have been completed will the device
> > > post a completion queue entry.
> > >
> > > Because the device is currently guaranteed to only issue a single aio
> > > request per command, the benefit is not immediately obvious. But this
> > > functionality is required to support metadata, the dataset management
> > > command and other features.
> > >
> > > Signed-off-by: Klaus Jensen <klaus.jensen@cnexlabs.com>
> > > Signed-off-by: Klaus Jensen <k.jensen@samsung.com>
> > > Acked-by: Keith Busch <kbusch@kernel.org>
> > > ---
> > > hw/block/nvme.c | 377 +++++++++++++++++++++++++++++++-----------
> > > hw/block/nvme.h | 129 +++++++++++++--
> > > hw/block/trace-events | 6 +
> > > 3 files changed, 407 insertions(+), 105 deletions(-)
> > >
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 0d2b5b45b0c5..817384e3b1a9 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> > > return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> > > }
> > >
> > > +static void nvme_aio_destroy(NvmeAIO *aio)
> > > +{
> > > + g_free(aio);
> > > +}
> > > +
> > > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> >
> > I guess I'll call this nvme_req_add_aio,
> > or nvme_add_aio_to_req.
> > Thoughts?
> > Also you can leave this as is, but add a comment on top explaining this
> >
>
> nvme_req_add_aio it is :) And comment added.
Thanks a lot!
>
> > > + NvmeAIOOp opc)
> > > +{
> > > + aio->opc = opc;
> > > +
> > > + trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> > > + aio->offset, aio->len,
> > > + nvme_aio_opc_str(aio), req);
> > > +
> > > + if (req) {
> > > + QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> > > + }
> > > +}
> > > +
> > > +static void nvme_submit_aio(NvmeAIO *aio)
> >
> > OK, this name makes sense
> > Also please add a comment on top.
>
> Done.
Thanks!
>
> > > @@ -505,9 +600,11 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> > > return NVME_SUCCESS;
> > > }
> > >
> > > -static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > > - uint16_t ctrl, NvmeRequest *req)
> > > +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, uint16_t ctrl,
> > > + NvmeRequest *req)
> > > {
> > > + NvmeNamespace *ns = req->ns;
> > > +
> >
> > This should go to the patch that added nvme_check_prinfo
> >
>
> Probably killing that patch.
Yea, I also agree on that. Once we properly support metadata,
then we can add all the checks for its correctness.
>
> > > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > > return NVME_SUCCESS;
> > > }
> > >
> > > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > > - uint64_t slba, uint32_t nlb,
> > > - NvmeRequest *req)
> > > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > > + uint32_t nlb, NvmeRequest *req)
> > > {
> > > + NvmeNamespace *ns = req->ns;
> > > uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> >
> > This should go to the patch that added nvme_check_bounds as well
> >
>
> We can't really, because the NvmeRequest does not hold a reference to
> the namespace as a struct member at that point. This is also an issue
> with the nvme_check_prinfo function above.
I see it now. The changes to NvmeRequest together with this are a good candidate
to split from this patch, to get it down to a size that is easy to review.
>
> > >
> > > if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> > > @@ -530,55 +627,154 @@ static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > > return NVME_SUCCESS;
> > > }
> > >
> > > -static void nvme_rw_cb(void *opaque, int ret)
> > > +static uint16_t nvme_check_rw(NvmeCtrl *n, NvmeRequest *req)
> > > +{
> > > + NvmeNamespace *ns = req->ns;
> > > + NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
> > > + uint16_t ctrl = le16_to_cpu(rw->control);
> > > + size_t len = req->nlb << nvme_ns_lbads(ns);
> > > + uint16_t status;
> > > +
> > > + status = nvme_check_mdts(n, len, req);
> > > + if (status) {
> > > + return status;
> > > + }
> > > +
> > > + status = nvme_check_prinfo(n, ctrl, req);
> > > + if (status) {
> > > + return status;
> > > + }
> > > +
> > > + status = nvme_check_bounds(n, req->slba, req->nlb, req);
> > > + if (status) {
> > > + return status;
> > > + }
> > > +
> > > + return NVME_SUCCESS;
> > > +}
> >
> > Nitpick: I hate to say it, but nvme_check_rw should be in a separate patch as well.
> > It will also make the diff more readable (when adding a function and changing a function
> > at the same time, you get a diff between two unrelated things).
> >
>
> Done, but had to do it as a follow up patch.
I guess it won't help to do this in a follow-up patch, since that won't simplify this
patch. I'll take a look when you publish the next version.
>
> > >
> > > -static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> > > - NvmeRequest *req)
> > > +static uint16_t nvme_write_zeroes(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> >
> > Very small nitpick about zeros/zeroes: This should move to some refactoring patch to be honest.
> >
>
> Done ;)
>
> >
> > The patch is still too large IMHO to review properly, and a few things can be split from it.
> > I tried my best to review it but I might have missed something.
> >
>
> Yeah, I know, but thanks for trying!
Thanks to you too.
Best regards,
Maxim Levitsky
>
On Mar 31 12:10, Maxim Levitsky wrote:
> On Tue, 2020-03-31 at 07:47 +0200, Klaus Birkelund Jensen wrote:
> > On Mar 25 12:57, Maxim Levitsky wrote:
> > > On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> > > > @@ -516,10 +613,10 @@ static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> > > > return NVME_SUCCESS;
> > > > }
> > > >
> > > > -static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> > > > - uint64_t slba, uint32_t nlb,
> > > > - NvmeRequest *req)
> > > > +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, uint64_t slba,
> > > > + uint32_t nlb, NvmeRequest *req)
> > > > {
> > > > + NvmeNamespace *ns = req->ns;
> > > > uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> > >
> > > This should go to the patch that added nvme_check_bounds as well
> > >
> >
> > We can't really, because the NvmeRequest does not hold a reference to
> > the namespace as a struct member at that point. This is also an issue
> > with the nvme_check_prinfo function above.
>
> I see it now. The changes to NvmeRequest together with this are a good candidate
> to split from this patch, to get it down to a size that is easy to review.
>
I'm factoring those changes and other stuff out into separate patches!
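For anyone skimming the thread, the core of the new completion model is that
a request only posts its completion queue entry once its aio queue has
drained. A minimal self-contained sketch of that pattern (not code from the
patch; it uses sys/queue.h in place of QEMU's QTAILQ macros):

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/queue.h>

    /* Toy stand-ins for NvmeAIO and NvmeRequest. */
    typedef struct Aio {
        TAILQ_ENTRY(Aio) entry;
    } Aio;

    typedef struct Request {
        TAILQ_HEAD(, Aio) aio_tailq;  /* all outstanding aios for the command */
    } Request;

    static void post_cqe(Request *req)
    {
        (void)req;
        printf("all aios done; posting completion queue entry\n");
    }

    /* Mirrors the tail of nvme_aio_cb(): only when the last outstanding aio
     * is removed from the queue does the request complete. */
    static void aio_cb(Request *req, Aio *aio)
    {
        TAILQ_REMOVE(&req->aio_tailq, aio, entry);

        if (TAILQ_EMPTY(&req->aio_tailq)) {
            post_cqe(req);
        }

        free(aio);
    }

    int main(void)
    {
        Request req;
        Aio *a = calloc(1, sizeof(*a));
        Aio *b = calloc(1, sizeof(*b));

        TAILQ_INIT(&req.aio_tailq);

        /* Issue two aios on behalf of a single command. */
        TAILQ_INSERT_TAIL(&req.aio_tailq, a, entry);
        TAILQ_INSERT_TAIL(&req.aio_tailq, b, entry);

        aio_cb(&req, a);   /* first aio finishes: no CQE yet */
        aio_cb(&req, b);   /* second aio finishes: CQE is posted */

        return 0;
    }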