From: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Add support for using the broadcast nsid to issue a flush on all
namespaces through a single command.
Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com>
Reviewed-by: Klaus Jensen <k.jensen@samsung.com>
---
include/block/nvme.h | 8 +++
hw/block/nvme.c | 123 +++++++++++++++++++++++++++++++++++++++---
hw/block/trace-events | 2 +
3 files changed, 126 insertions(+), 7 deletions(-)
diff --git a/include/block/nvme.h b/include/block/nvme.h
index e4b918064df9..0e2e94c7ebbb 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -1024,6 +1024,14 @@ enum NvmeIdCtrlOncs {
NVME_ONCS_TIMESTAMP = 1 << 6,
};
+enum NvmeIdctrlVwc {
+ NVME_VWC_PRESENT = 1 << 0,
+ NVME_VWC_NSID_BROADCAST_NO_SUPPORT = 0 << 1,
+ NVME_VWC_NSID_BROADCAST_RESERVED = 1 << 1,
+ NVME_VWC_NSID_BROADCAST_CTRL_SPEC = 2 << 1,
+ NVME_VWC_NSID_BROADCAST_SUPPORT = 3 << 1,
+};
+
enum NvmeIdCtrlFrmw {
NVME_FRMW_SLOT1_RO = 1 << 0,
};
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 98d84fe26644..b5ce8ed7c785 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1406,6 +1406,41 @@ static void nvme_rw_cb(void *opaque, int ret)
nvme_enqueue_req_completion(nvme_cq(req), req);
}
+struct nvme_aio_flush_ctx {
+ NvmeRequest *req;
+ NvmeNamespace *ns;
+ BlockAcctCookie acct;
+};
+
+static void nvme_aio_flush_cb(void *opaque, int ret)
+{
+ struct nvme_aio_flush_ctx *ctx = opaque;
+ NvmeRequest *req = ctx->req;
+ uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
+
+ BlockBackend *blk = ctx->ns->blkconf.blk;
+ BlockAcctCookie *acct = &ctx->acct;
+ BlockAcctStats *stats = blk_get_stats(blk);
+
+ trace_pci_nvme_aio_flush_cb(nvme_cid(req), blk_name(blk));
+
+ if (!ret) {
+ block_acct_done(stats, acct);
+ } else {
+ block_acct_failed(stats, acct);
+ nvme_aio_err(req, ret);
+ }
+
+ (*num_flushes)--;
+ g_free(ctx);
+
+ if (*num_flushes) {
+ return;
+ }
+
+ nvme_enqueue_req_completion(nvme_cq(req), req);
+}
+
static void nvme_aio_discard_cb(void *opaque, int ret)
{
NvmeRequest *req = opaque;
@@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req)
static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
{
- block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
- BLOCK_ACCT_FLUSH);
- req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
- return NVME_NO_COMPLETE;
+ uint32_t nsid = le32_to_cpu(req->cmd.nsid);
+ uintptr_t *num_flushes = (uintptr_t *)&req->opaque;
+ uint16_t status;
+ struct nvme_aio_flush_ctx *ctx;
+ NvmeNamespace *ns;
+
+ trace_pci_nvme_flush(nvme_cid(req), nsid);
+
+ if (nsid != NVME_NSID_BROADCAST) {
+ req->ns = nvme_ns(n, nsid);
+ if (unlikely(!req->ns)) {
+ return NVME_INVALID_FIELD | NVME_DNR;
+ }
+
+ block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0,
+ BLOCK_ACCT_FLUSH);
+ req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req);
+ return NVME_NO_COMPLETE;
+ }
+
+ /* 1-initialize; see comment in nvme_dsm */
+ *num_flushes = 1;
+
+ for (int i = 1; i <= n->num_namespaces; i++) {
+ ns = nvme_ns(n, i);
+ if (!ns) {
+ continue;
+ }
+
+ ctx = g_new(struct nvme_aio_flush_ctx, 1);
+ ctx->req = req;
+ ctx->ns = ns;
+
+ (*num_flushes)++;
+
+ block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0,
+ BLOCK_ACCT_FLUSH);
+ req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx);
+ }
+
+ /* account for the 1-initialization */
+ (*num_flushes)--;
+
+ if (*num_flushes) {
+ status = NVME_NO_COMPLETE;
+ } else {
+ status = req->status;
+ }
+
+ return status;
}
static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)
@@ -2344,6 +2425,29 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
return NVME_INVALID_NSID | NVME_DNR;
}
+ /*
+ * In the base NVM command set, Flush may apply to all namespaces
+ * (indicated by NSID being set to 0xFFFFFFFF). But if that feature is used
+ * along with TP 4056 (Namespace Types), it may be pretty screwed up.
+ *
+ * If NSID is indeed set to 0xFFFFFFFF, we simply cannot associate the
+ * opcode with a specific command since we cannot determine a unique I/O
+ * command set. Opcode 0x0 could have any other meaning than something
+ * equivalent to flushing and say it DOES have completely different
+ * semantics in some other command set - does an NSID of 0xFFFFFFFF then
+ * mean "for all namespaces, apply whatever command set specific command
+ * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
+ * whatever command that uses the 0x0 opcode if, and only if, it allows
+ * NSID to be 0xFFFFFFFF"?
+ *
+ * Anyway (and luckily), for now, we do not care about this since the
+ * device only supports namespace types that includes the NVM Flush command
+ * (NVM and Zoned), so always do an NVM Flush.
+ */
+ if (req->cmd.opcode == NVME_CMD_FLUSH) {
+ return nvme_flush(n, req);
+ }
+
req->ns = nvme_ns(n, nsid);
if (unlikely(!req->ns)) {
return NVME_INVALID_FIELD | NVME_DNR;
@@ -2355,8 +2459,6 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
}
switch (req->cmd.opcode) {
- case NVME_CMD_FLUSH:
- return nvme_flush(n, req);
case NVME_CMD_WRITE_ZEROES:
return nvme_write_zeroes(n, req);
case NVME_CMD_ZONE_APPEND:
@@ -4450,7 +4552,14 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
NVME_ONCS_FEATURES | NVME_ONCS_DSM |
NVME_ONCS_COMPARE);
- id->vwc = (0x2 << 1) | 0x1;
+ /*
+ * NOTE: If this device ever supports a command set that does NOT use 0x0
+ * as a Flush-equivalent operation, support for the broadcast NSID in Flush
+ * should probably be removed.
+ *
+ * See comment in nvme_io_cmd.
+ */
+ id->vwc = NVME_VWC_NSID_BROADCAST_SUPPORT | NVME_VWC_PRESENT;
id->sgls = cpu_to_le32(NVME_CTRL_SGLS_SUPPORT_NO_ALIGN |
NVME_CTRL_SGLS_BITBUCKET);
diff --git a/hw/block/trace-events b/hw/block/trace-events
index 8e249ea910aa..ce2a35c34ea6 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -43,6 +43,8 @@ pci_nvme_admin_cmd(uint16_t cid, uint16_t sqid, uint8_t opcode, const char *opna
pci_nvme_read(uint16_t cid, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_write(uint16_t cid, const char *verb, uint32_t nsid, uint32_t nlb, uint64_t count, uint64_t lba) "cid %"PRIu16" opname '%s' nsid %"PRIu32" nlb %"PRIu32" count %"PRIu64" lba 0x%"PRIx64""
pci_nvme_rw_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
+pci_nvme_flush(uint16_t cid, uint32_t nsid) "cid %"PRIu16" nsid %"PRIu32""
+pci_nvme_aio_flush_cb(uint16_t cid, const char *blkname) "cid %"PRIu16" blk '%s'"
pci_nvme_block_status(int64_t offset, int64_t bytes, int64_t pnum, int ret, bool zeroed) "offset %"PRId64" bytes %"PRId64" pnum %"PRId64" ret 0x%x zeroed %d"
pci_nvme_dsm(uint16_t cid, uint32_t nsid, uint32_t nr, uint32_t attr) "cid %"PRIu16" nsid %"PRIu32" nr %"PRIu32" attr 0x%"PRIx32""
pci_nvme_dsm_deallocate(uint16_t cid, uint32_t nsid, uint64_t slba, uint32_t nlb) "cid %"PRIu16" nsid %"PRIu32" slba %"PRIu64" nlb %"PRIu32""
--
2.30.0
On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote: > From: Gollu Appalanaidu <anaidu.gollu@samsung.com> > > Add support for using the broadcast nsid to issue a flush on all > namespaces through a single command. > > Signed-off-by: Gollu Appalanaidu <anaidu.gollu@samsung.com> > Reviewed-by: Klaus Jensen <k.jensen@samsung.com> > --- > include/block/nvme.h | 8 +++ > hw/block/nvme.c | 123 +++++++++++++++++++++++++++++++++++++++--- > hw/block/trace-events | 2 + > 3 files changed, 126 insertions(+), 7 deletions(-) Acked-by: Stefan Hajnoczi <stefanha@redhat.com>
On Jan 25 21:42, Klaus Jensen wrote: > From: Gollu Appalanaidu <anaidu.gollu@samsung.com> > > Add support for using the broadcast nsid to issue a flush on all > namespaces through a single command. > Gentle ping on this.
On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote: > @@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) > > static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) > { > - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > - BLOCK_ACCT_FLUSH); > - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > - return NVME_NO_COMPLETE; > + uint32_t nsid = le32_to_cpu(req->cmd.nsid); > + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; > + uint16_t status; > + struct nvme_aio_flush_ctx *ctx; > + NvmeNamespace *ns; > + > + trace_pci_nvme_flush(nvme_cid(req), nsid); > + > + if (nsid != NVME_NSID_BROADCAST) { > + req->ns = nvme_ns(n, nsid); > + if (unlikely(!req->ns)) { > + return NVME_INVALID_FIELD | NVME_DNR; > + } > + > + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > + BLOCK_ACCT_FLUSH); > + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > + return NVME_NO_COMPLETE; > + } > + > + /* 1-initialize; see comment in nvme_dsm */ > + *num_flushes = 1; > + > + for (int i = 1; i <= n->num_namespaces; i++) { > + ns = nvme_ns(n, i); > + if (!ns) { > + continue; > + } > + > + ctx = g_new(struct nvme_aio_flush_ctx, 1); > + ctx->req = req; > + ctx->ns = ns; > + > + (*num_flushes)++; > + > + block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, > + BLOCK_ACCT_FLUSH); > + req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); > + } Overwriting req->aiocb with the most recent flush request doesn't seem right. This whole implementation would be much simpler with the synchronous blk_flush() routine instead of the AIO equivalent. This is not really a performant feature, so I don't think it's critical to get these operations happening in parallel. What do you think?
On Feb 9 03:59, Keith Busch wrote: > On Mon, Jan 25, 2021 at 09:42:31PM +0100, Klaus Jensen wrote: > > @@ -1644,10 +1679,56 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest *req) > > > > static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req) > > { > > - block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > > - BLOCK_ACCT_FLUSH); > > - req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > > - return NVME_NO_COMPLETE; > > + uint32_t nsid = le32_to_cpu(req->cmd.nsid); > > + uintptr_t *num_flushes = (uintptr_t *)&req->opaque; > > + uint16_t status; > > + struct nvme_aio_flush_ctx *ctx; > > + NvmeNamespace *ns; > > + > > + trace_pci_nvme_flush(nvme_cid(req), nsid); > > + > > + if (nsid != NVME_NSID_BROADCAST) { > > + req->ns = nvme_ns(n, nsid); > > + if (unlikely(!req->ns)) { > > + return NVME_INVALID_FIELD | NVME_DNR; > > + } > > + > > + block_acct_start(blk_get_stats(req->ns->blkconf.blk), &req->acct, 0, > > + BLOCK_ACCT_FLUSH); > > + req->aiocb = blk_aio_flush(req->ns->blkconf.blk, nvme_rw_cb, req); > > + return NVME_NO_COMPLETE; > > + } > > + > > + /* 1-initialize; see comment in nvme_dsm */ > > + *num_flushes = 1; > > + > > + for (int i = 1; i <= n->num_namespaces; i++) { > > + ns = nvme_ns(n, i); > > + if (!ns) { > > + continue; > > + } > > + > > + ctx = g_new(struct nvme_aio_flush_ctx, 1); > > + ctx->req = req; > > + ctx->ns = ns; > > + > > + (*num_flushes)++; > > + > > + block_acct_start(blk_get_stats(ns->blkconf.blk), &ctx->acct, 0, > > + BLOCK_ACCT_FLUSH); > > + req->aiocb = blk_aio_flush(ns->blkconf.blk, nvme_aio_flush_cb, ctx); > > + } > > Overwriting req->aiocb with the most recent flush request doesn't seem > right. > It doesn't really matter if this[1] patch is merged, but it is wrong and to align with the other multi-aio commands, this should just ignore the returned aiocb. > This whole implementation would be much simpler with the synchronous > blk_flush() routine instead of the AIO equivalent. 
This is not really a > performant feature, so I don't think it's critical to get these > operations happening in parallel. What do you think? It would definitely be simpler, but I believe that if there is a lot to flush, then we won't just block the nvme device. We are holding the Big QEMU Lock and will block most other devices as well. [1]: hw/block/nvme: drain namespaces on sq deletion
On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote: > On Feb 9 03:59, Keith Busch wrote: > > This whole implementation would be much simpler with the synchronous > > blk_flush() routine instead of the AIO equivalent. This is not really a > > performant feature, so I don't think it's critical to get these > > operations happening in parallel. What do you think? > > It would definitely be simpler, but I believe that if there is a lot to > flush, then we won't just block the nvme device. We are holding the Big > QEMU Lock and will block most other devices as well. Hm, I feel like you may have told me this same explanation for a different patch. :) Okay, I'm convinced: this is the way.
On Feb 10 12:32, Keith Busch wrote: > On Mon, Feb 08, 2021 at 08:08:17PM +0100, Klaus Jensen wrote: > > On Feb 9 03:59, Keith Busch wrote: > > > This whole implementation would be much simpler with the synchronous > > > blk_flush() routine instead of the AIO equivalent. This is not really a > > > performant feature, so I don't think it's critical to get these > > > operations happening in parallel. What do you think? > > > > It would definitely be simpler, but I believe that if there is a lot to > > flush, then we won't just block the nvme device. We are holding the Big > > QEMU Lock and will block most other devices as well. > > Hm, I feel like you may have told me this same explanation for a > different patch. :) Okay, I'm convinced: this is the way. > Is that an Acked-by? ;) And yes, I might have used that argument for Copy, can't remember ;)
On Wed, Feb 10, 2021 at 07:42:17AM +0100, Klaus Jensen wrote: > > Is that an Acked-by? ;) Yep, Acked-by: Keith Busch <kbusch@kernel.org>
On Jan 25 21:42, Klaus Jensen wrote: > From: Gollu Appalanaidu <anaidu.gollu@samsung.com> > > Add support for using the broadcast nsid to issue a flush on all > namespaces through a single command. > Applied to nvme-next.
© 2016 - 2024 Red Hat, Inc.