hw/block/nvme.c | 506 +++++++++++++++++++++++++++++++++++++------ hw/block/nvme.h | 57 ++++- include/block/nvme.h | 128 ++++++++++- 3 files changed, 610 insertions(+), 81 deletions(-)
Adding namespace management support to the nvme device. Namespace
creation requires contiguous block space for a simple method of allocation.
I wrote this a few years ago based on Keith's fork and the nvmeqemu fork, and
have recently re-synced it with the latest trunk. Some data structures in
nvme.h are a bit more filled out than strictly necessary, as this is also
the base for SR-IOV and IOD patches to be submitted later.
Signed-off-by: fitzpat <matt.fitzpatrick@oakgatetech.com>
---
hw/block/nvme.c | 506 +++++++++++++++++++++++++++++++++++++------
hw/block/nvme.h | 57 ++++-
include/block/nvme.h | 128 ++++++++++-
3 files changed, 610 insertions(+), 81 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 107a719b95..11d7da26f3 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -42,6 +44,9 @@
#include "trace.h"
#include "nvme.h"
+#define NVME_CTRL_LIST_MAX_ENTRIES 2047
+#define NVME_MAX_NUM_NAMESPACES 256
+
#define NVME_GUEST_ERR(trace, fmt, ...) \
do { \
(trace_##trace)(__VA_ARGS__); \
@@ -50,6 +55,8 @@
} while (0)
static void nvme_process_sq(void *opaque);
+static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, uint64_t data,
+ unsigned size);
static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
{
@@ -377,7 +384,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace
*ns, NvmeCmd *cmd,
uint8_t lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
uint64_t data_size = (uint64_t)nlb << data_shift;
- uint64_t data_offset = slba << data_shift;
+ uint64_t data_offset = (slba << data_shift) + ns->start_byte_index;
int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE :
BLOCK_ACCT_READ;
@@ -425,6 +432,11 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd
*cmd, NvmeRequest *req)
}
ns = &n->namespaces[nsid - 1];
+
+ if (unlikely(!ns->ctrl)) {
+ return NVME_INVALID_NSID | NVME_DNR;
+ }
+
switch (cmd->opcode) {
case NVME_CMD_FLUSH:
return nvme_flush(n, ns, cmd, req);
@@ -676,6 +688,49 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n,
NvmeIdentify *c)
prp1, prp2);
}
+/**
+ * Identify: allocated namespace ID list.
+ *
+ * Fills a zeroed 4 KiB buffer with the IDs of all namespace slots flagged as
+ * created (attached or not) whose ID is greater than the NSID carried in the
+ * command, then transfers the buffer to the host through PRP1/PRP2.
+ *
+ * @param n  controller whose namespace table is scanned
+ * @param c  Identify command; its nsid is the exclusive lower bound
+ * @return the status returned by nvme_dma_read_prp()
+ */
+static uint16_t nvme_identify_ns_allocated(NvmeCtrl *n, NvmeIdentify *c)
+{
+    static const int data_len = 4 * KiB;
+    const size_t max_entries = data_len / sizeof(uint32_t);
+    uint32_t min_nsid = le32_to_cpu(c->nsid);
+    uint64_t prp1 = le64_to_cpu(c->prp1);
+    uint64_t prp2 = le64_to_cpu(c->prp2);
+    uint32_t *nsid_list;
+    uint32_t slot;
+    size_t count = 0;
+    uint16_t status;
+
+    trace_nvme_identify_nslist(min_nsid);
+
+    nsid_list = g_malloc0(data_len);
+
+    /*
+     * Slot k holds NSID k + 1, so beginning the scan at slot min_nsid skips
+     * every NSID that is less than or equal to min_nsid.
+     */
+    for (slot = min_nsid; slot < NVME_MAX_NUM_NAMESPACES; slot++) {
+        if (!n->namespaces[slot].created) {
+            continue;
+        }
+        nsid_list[count++] = cpu_to_le32(slot + 1);
+        if (count == max_entries) {
+            break;
+        }
+    }
+
+    status = nvme_dma_read_prp(n, (uint8_t *)nsid_list, data_len, prp1, prp2);
+    g_free(nsid_list);
+    return status;
+}
+
+/**
+ * Identify Active Namespace List
+ * Active is defined as created and attached.
+ *
+ * @param n
+ * @param c
+ * @return
+ */
static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
{
static const int data_len = 4 * KiB;
@@ -689,13 +744,15 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n,
NvmeIdentify *c)
trace_nvme_identify_nslist(min_nsid);
list = g_malloc0(data_len);
- for (i = 0; i < n->num_namespaces; i++) {
+ for (i = 0; i < NVME_MAX_NUM_NAMESPACES; i++) {
if (i < min_nsid) {
continue;
}
- list[j++] = cpu_to_le32(i + 1);
- if (j == data_len / sizeof(uint32_t)) {
- break;
+ if (n->namespaces[i].created && n->namespaces[i].ctrl) {
+ list[j++] = cpu_to_le32(i + 1);
+ if (j == data_len / sizeof(uint32_t)) {
+ break;
+ }
}
}
ret = nvme_dma_read_prp(n, (uint8_t *)list, data_len, prp1, prp2);
@@ -708,18 +765,271 @@ static uint16_t nvme_identify(NvmeCtrl *n,
NvmeCmd *cmd)
NvmeIdentify *c = (NvmeIdentify *)cmd;
switch (le32_to_cpu(c->cns)) {
- case 0x00:
+ case NVME_ADM_CNS_ID_NS:
return nvme_identify_ns(n, c);
- case 0x01:
+ case NVME_ADM_CNS_ID_CTRL:
return nvme_identify_ctrl(n, c);
- case 0x02:
+ case NVME_ADM_CNS_ID_NS_LIST:
return nvme_identify_nslist(n, c);
+ case NVME_ADM_CNS_ID_NS_LIST_ALLOC:
+ return nvme_identify_ns_allocated(n, c);
+ case NVME_ADM_CNS_ID_NS_ALLOC:
+ return nvme_identify_ns(n, c);
default:
trace_nvme_err_invalid_identify_cns(le32_to_cpu(c->cns));
return NVME_INVALID_FIELD | NVME_DNR;
}
}
+/**
+ * Namespace Attachment (controller attach): attach a created namespace to
+ * this controller.
+ *
+ * The host buffer at PRP1/PRP2 is read as a controller list: entry 0 is the
+ * number of identifiers and entries 1..count are controller identifiers.
+ * The namespace is attached only when this controller's CNTLID is listed.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command; nsid selects the namespace
+ * @return NVME_SUCCESS when attached, otherwise an NVMe status code
+ */
+static uint16_t nvme_namespace_controller_attach(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint32_t nsid = le32_to_cpu(cmd->nsid);
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+    uint16_t ctrl_list[NVME_CTRL_LIST_MAX_ENTRIES + 1];
+    uint16_t ctrl_list_size;
+    NvmeNamespace *ns;
+    int i;
+
+    /*
+     * nsid is used as a table index below, so 0 and out-of-range values
+     * (including the broadcast value 0xFFFFFFFF) must be rejected here.
+     */
+    if (!nsid || nsid > NVME_MAX_NUM_NAMESPACES) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    ns = &n->namespaces[nsid - 1];
+
+    if (nvme_dma_write_prp(n, (uint8_t *)ctrl_list, sizeof(ctrl_list),
+                           prp1, prp2)) {
+        return NVME_INVALID_FIELD;
+    }
+
+    /* The entry count arrives from the guest in little-endian order. */
+    ctrl_list_size = le16_to_cpu(ctrl_list[0]);
+
+    if (!ctrl_list_size || ctrl_list_size > NVME_CTRL_LIST_MAX_ENTRIES) {
+        return NVME_CTRL_LIST_INVALID;
+    }
+
+    if (ns->ctrl == n) {
+        return NVME_NS_ALREADY_ATTACHED;
+    }
+    if (!ns->created) {
+        return NVME_INVALID_NSID;
+    }
+
+    /* TODO: Update NvmeNamespace to link multiple controllers */
+    for (i = 1; i <= ctrl_list_size; i++) {
+        /*
+         * Both sides are compared as stored.
+         * NOTE(review): assumes id_ctrl.cntlid is kept in little-endian
+         * (wire) order like the list entries - confirm.
+         */
+        if (n->id_ctrl.cntlid == ctrl_list[i]) {
+            ns->ctrl = n;
+            return NVME_SUCCESS;
+        }
+    }
+    return NVME_CTRL_LIST_INVALID;
+}
+
+/**
+ * Namespace Attachment (controller detach): detach a namespace from this
+ * controller.
+ *
+ * The host buffer at PRP1/PRP2 is read as a controller list: entry 0 is the
+ * number of identifiers and entries 1..count are controller identifiers.
+ * The namespace is detached only when this controller's CNTLID is listed.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command; nsid selects the namespace
+ * @return NVME_SUCCESS when detached, otherwise an NVMe status code
+ */
+static uint16_t nvme_namespace_controller_detach(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint32_t nsid = le32_to_cpu(cmd->nsid);
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+    uint16_t ctrl_list[NVME_CTRL_LIST_MAX_ENTRIES + 1];
+    uint16_t ctrl_list_size;
+    NvmeNamespace *ns;
+    int i;
+
+    /*
+     * nsid is used as a table index below, so 0 and out-of-range values
+     * (including the broadcast value 0xFFFFFFFF) must be rejected here.
+     */
+    if (!nsid || nsid > NVME_MAX_NUM_NAMESPACES) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+    ns = &n->namespaces[nsid - 1];
+
+    if (nvme_dma_write_prp(n, (uint8_t *)ctrl_list, sizeof(ctrl_list),
+                           prp1, prp2)) {
+        return NVME_INVALID_FIELD;
+    }
+
+    /* The entry count arrives from the guest in little-endian order. */
+    ctrl_list_size = le16_to_cpu(ctrl_list[0]);
+
+    if (!ctrl_list_size || ctrl_list_size > NVME_CTRL_LIST_MAX_ENTRIES) {
+        return NVME_CTRL_LIST_INVALID;
+    }
+    /* TODO: semaphore to lock NS on detach for scenario with detach during IO */
+    if (!ns->ctrl || (ns->ctrl != n)) {
+        return NVME_NS_NOT_ATTACHED;
+    }
+    if (!ns->created) {
+        return NVME_INVALID_NSID;
+    }
+
+    /* TODO: Update NvmeNamespace to link multiple controllers */
+    for (i = 1; i <= ctrl_list_size; i++) {
+        /*
+         * Both sides are compared as stored.
+         * NOTE(review): assumes id_ctrl.cntlid is kept in little-endian
+         * (wire) order like the list entries - confirm.
+         */
+        if (n->id_ctrl.cntlid == ctrl_list[i]) {
+            ns->ctrl = NULL;
+            return NVME_SUCCESS;
+        }
+    }
+    return NVME_CTRL_LIST_INVALID;
+}
+
+/**
+ * Namespace Attachment admin command: validate the NSID and dispatch on
+ * CDW10 to the attach or detach handler.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command
+ * @return status of the selected handler, or NVME_INVALID_FIELD | NVME_DNR
+ *         for an unusable NSID or an unknown CDW10 value
+ */
+static uint16_t nvme_namespace_attachment(NvmeCtrl *n, NvmeCmd *cmd)
+{
+    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+    uint32_t nsid = le32_to_cpu(cmd->nsid);
+
+    /*
+     * Both handlers index the namespace table with nsid - 1, so only a
+     * concrete NSID in 1..NVME_MAX_NUM_NAMESPACES can be accepted; the
+     * broadcast value 0xFFFFFFFF would index far outside the table.
+     */
+    if (!nsid || nsid > NVME_MAX_NUM_NAMESPACES) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    switch (dw10) {
+    case NVME_NS_CONTROLLER_ATTACH:
+        return nvme_namespace_controller_attach(n, cmd);
+    case NVME_NS_CONTROLLER_DETACH:
+        return nvme_namespace_controller_detach(n, cmd);
+    default:
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+}
+
+/**
+ * First-fit search for a contiguous free byte range in the backing store.
+ *
+ * Starting at byte 0, the candidate range [start, start + size) is compared
+ * with every created namespace; on a clash the candidate is moved to the end
+ * of the clashing namespace and the scan is restarted.
+ *
+ * @param n                  controller owning the namespace table
+ * @param ns_start_index     out: byte offset of the free range (written only
+ *                           on success)
+ * @param requested_ns_size  size of the wanted range in bytes
+ * @return 0 on success, -1 if no sufficiently large free range exists
+ */
+static int nvme_set_start_index(NvmeCtrl *n, uint64_t *ns_start_index,
+                                uint64_t requested_ns_size)
+{
+    uint64_t start_index = 0;
+    bool adjusted;
+    int i;
+
+    if (requested_ns_size > n->nvm_capacity) {
+        return -1;
+    }
+
+    do {
+        adjusted = false;
+
+        /* Written as a subtraction so that start + size can never wrap. */
+        if (start_index > n->nvm_capacity - requested_ns_size) {
+            return -1;
+        }
+
+        for (i = 0; i < NVME_MAX_NUM_NAMESPACES; i++) {
+            NvmeNamespace *ns = &n->namespaces[i];
+            uint64_t ns_start, ns_end;
+            int lba_index;
+
+            if (!ns->created) {
+                continue;
+            }
+
+            lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+            ns_start = ns->start_byte_index;
+            ns_end = ns_start +
+                     (ns->id_ns.nsze << ns->id_ns.lbaf[lba_index].ds);
+
+            /*
+             * Half-open interval intersection. Unlike testing whether an
+             * endpoint of the candidate lies inside the namespace, this
+             * also catches a candidate that completely encloses it, and
+             * does not reject a candidate that merely touches it.
+             */
+            if (start_index < ns_end &&
+                start_index + requested_ns_size > ns_start) {
+                start_index = ns_end;
+                adjusted = true;
+                /* Re-check capacity before comparing the new candidate. */
+                break;
+            }
+        }
+    } while (adjusted);
+
+    *ns_start_index = start_index;
+    return 0;
+}
+
+/**
+ * Namespace Management (create): create a namespace in a free contiguous
+ * range of the backing store.
+ *
+ * The requested geometry is read from the host buffer at PRP1/PRP2 as an
+ * Identify Namespace structure. Only flbas, mc and dps equal to zero are
+ * accepted. The first slot that is not yet created is used; the new
+ * namespace starts out detached and its ID is returned in the completion
+ * entry.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command carrying PRP1/PRP2
+ * @param req  request whose completion result receives the new NSID
+ * @return NVME_SUCCESS if the namespace was successfully created,
+ *         NVME_INVALID_FIELD for a bad buffer or unsupported fields,
+ *         NVME_NS_INSUFF_CAP if no free range is large enough,
+ *         NVME_NS_ID_UNAVAILABLE if every namespace slot is in use
+ */
+static uint16_t nvme_namespace_create(NvmeCtrl *n, NvmeCmd *cmd,
+                                      NvmeRequest *req)
+{
+    uint64_t prp1 = le64_to_cpu(cmd->prp1);
+    uint64_t prp2 = le64_to_cpu(cmd->prp2);
+    NvmeIdNs id_ns_host;
+    uint64_t ns_size;
+    int i;
+
+    if (nvme_dma_write_prp(n, (uint8_t *)&id_ns_host, sizeof(id_ns_host),
+                           prp1, prp2)) {
+        return NVME_INVALID_FIELD;
+    }
+
+    /* Does not depend on the slot, so validate once up front. */
+    if (id_ns_host.flbas || id_ns_host.mc || id_ns_host.dps) {
+        return NVME_INVALID_FIELD | NVME_DNR;
+    }
+
+    /*
+     * Bound the block count before converting it to bytes; otherwise the
+     * multiplication by the sector size could wrap and let an oversized
+     * namespace pass the capacity check.
+     * NOTE(review): nsze is used as received, without le64_to_cpu(), to stay
+     * consistent with how the rest of this patch reads id_ns.nsze - confirm
+     * for big-endian hosts.
+     */
+    if (id_ns_host.nsze > (n->nvm_capacity >> BDRV_SECTOR_BITS)) {
+        return NVME_NS_INSUFF_CAP;
+    }
+    ns_size = id_ns_host.nsze << BDRV_SECTOR_BITS;
+
+    for (i = 0; i < NVME_MAX_NUM_NAMESPACES; i++) {
+        NvmeNamespace *ns = &n->namespaces[i];
+        NvmeIdNs *id_ns = &ns->id_ns;
+        uint64_t start_byte_index;
+        int lba_index;
+
+        if (ns->created) {
+            continue;
+        }
+
+        /* Take the first available slot, but touch it only on success. */
+        if (nvme_set_start_index(n, &start_byte_index, ns_size)) {
+            return NVME_NS_INSUFF_CAP;
+        }
+
+        id_ns->flbas = id_ns_host.flbas;
+        id_ns->mc = id_ns_host.mc;
+        id_ns->dps = id_ns_host.dps;
+
+        id_ns->nuse = id_ns_host.nsze;
+        id_ns->ncap = id_ns_host.ncap;
+        id_ns->nsze = id_ns_host.nsze;
+
+        lba_index = NVME_ID_NS_FLBAS_INDEX(id_ns->flbas);
+        id_ns->lbaf[lba_index].ds = BDRV_SECTOR_BITS;
+        id_ns->nvmcap = ns_size;
+
+        ns->id = i + 1;
+        id_ns->nguid = ns->id;
+        ns->start_byte_index = start_byte_index;
+
+        ns->created = true;
+        n->id_ctrl.unvmcap -= id_ns->nvmcap;
+
+        ns->ctrl = NULL; /* not attached */
+
+        n->num_namespaces++;
+        n->id_ctrl.nn++;
+
+        req->cqe.result = ns->id;
+        return NVME_SUCCESS;
+    }
+
+    /* Every slot is in use: this is an ID shortage, not a capacity one. */
+    return NVME_NS_ID_UNAVAILABLE;
+}
+
+/**
+ * Namespace Management (delete): delete one created namespace.
+ *
+ * Marks the slot as not created and detached, and returns its capacity to
+ * the controller's unallocated capacity.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command; nsid selects the namespace (1-based)
+ * @param req  unused
+ * @return NVME_SUCCESS if the namespace existed and was deleted,
+ *         NVME_INVALID_NSID otherwise
+ */
+static uint16_t nvme_namespace_delete(NvmeCtrl *n, NvmeCmd *cmd,
+                                      NvmeRequest *req)
+{
+    NvmeNamespace *ns;
+
+    /*
+     * nsid - 1 is used as a table index: nsid 0 would wrap around and any
+     * value above the table size would run past its end.
+     */
+    if (!cmd->nsid || cmd->nsid > NVME_MAX_NUM_NAMESPACES) {
+        return NVME_INVALID_NSID;
+    }
+
+    ns = &n->namespaces[cmd->nsid - 1];
+    if (!ns->created) {
+        return NVME_INVALID_NSID;
+    }
+
+    ns->created = false;
+    ns->ctrl = NULL;
+    n->num_namespaces--;
+    n->id_ctrl.nn--;
+    n->id_ctrl.unvmcap += ns->id_ns.nvmcap;
+    return NVME_SUCCESS;
+}
+
+/**
+ * Namespace Management admin command: dispatch on CDW10 to namespace
+ * creation or deletion.
+ *
+ * For delete, the NSID 0xFFFFFFFF deletes every created namespace; the
+ * command's nsid field is rewritten for each deletion.
+ *
+ * @param n    controller that received the command
+ * @param cmd  admin command
+ * @param req  request, passed on to the create/delete handlers
+ * @return status of the selected handler, NVME_INVALID_NSID for a delete
+ *         with nsid 0, or NVME_INVALID_FIELD for an out-of-range NSID or an
+ *         unknown CDW10 value
+ */
+static uint16_t nvme_namespace_management(NvmeCtrl *n, NvmeCmd *cmd,
+                                          NvmeRequest *req)
+{
+    uint32_t dw10 = le32_to_cpu(cmd->cdw10);
+
+    if ((cmd->nsid > NVME_MAX_NUM_NAMESPACES)
+        && (cmd->nsid != 0xFFFFFFFF)) {
+        return NVME_INVALID_FIELD;
+    }
+
+    switch (dw10) {
+    case NVME_NS_CREATE:
+        return nvme_namespace_create(n, cmd, req);
+    case NVME_NS_DELETE:
+        if (cmd->nsid == 0xFFFFFFFF) {
+            uint32_t i;
+
+            /* NSIDs are 1-based, so the last valid one equals the maximum. */
+            for (i = 1; i <= NVME_MAX_NUM_NAMESPACES; i++) {
+                uint16_t ret;
+
+                /* Test the flag itself; its address is never NULL. */
+                if (!n->namespaces[i - 1].created) {
+                    continue;
+                }
+                cmd->nsid = i;
+                ret = nvme_namespace_delete(n, cmd, req);
+                if (ret != NVME_SUCCESS) {
+                    return ret;
+                }
+            }
+            return NVME_SUCCESS;
+        }
+        /* nsid 0 would index slot -1 in the delete handler. */
+        if (!cmd->nsid) {
+            return NVME_INVALID_NSID;
+        }
+        return nvme_namespace_delete(n, cmd, req);
+    default:
+        return NVME_INVALID_FIELD;
+    }
+}
+
static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
{
trace_nvme_setfeat_timestamp(ts);
@@ -860,6 +1170,10 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n,
NvmeCmd *cmd, NvmeRequest *req)
return nvme_set_feature(n, cmd, req);
case NVME_ADM_CMD_GET_FEATURES:
return nvme_get_feature(n, cmd, req);
+ case NVME_ADM_CMD_NS_MANAGEMENT:
+ return nvme_namespace_management(n, cmd, req);
+ case NVME_ADM_CMD_NS_ATTACH:
+ return nvme_namespace_attachment(n, cmd);
default:
trace_nvme_err_invalid_admin_opc(cmd->opcode);
return NVME_INVALID_OPCODE | NVME_DNR;
@@ -915,6 +1229,7 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
}
blk_flush(n->conf.blk);
+
n->bar.cc = 0;
}
@@ -1302,61 +1617,10 @@ static const MemoryRegionOps nvme_cmb_ops = {
},
};
-static void nvme_realize(PCIDevice *pci_dev, Error **errp)
+static void nvme_init_ctrl(NvmeCtrl *n)
{
- NvmeCtrl *n = NVME(pci_dev);
NvmeIdCtrl *id = &n->id_ctrl;
-
- int i;
- int64_t bs_size;
- uint8_t *pci_conf;
-
- if (!n->num_queues) {
- error_setg(errp, "num_queues can't be zero");
- return;
- }
-
- if (!n->conf.blk) {
- error_setg(errp, "drive property not set");
- return;
- }
-
- bs_size = blk_getlength(n->conf.blk);
- if (bs_size < 0) {
- error_setg(errp, "could not get backing file size");
- return;
- }
-
- if (!n->serial) {
- error_setg(errp, "serial property not set");
- return;
- }
- blkconf_blocksizes(&n->conf);
- if (!blkconf_apply_backend_options(&n->conf,
blk_is_read_only(n->conf.blk),
- false, errp)) {
- return;
- }
-
- pci_conf = pci_dev->config;
- pci_conf[PCI_INTERRUPT_PIN] = 1;
- pci_config_set_prog_interface(pci_dev->config, 0x2);
- pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
- pcie_endpoint_cap_init(pci_dev, 0x80);
-
- n->num_namespaces = 1;
- n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
- n->ns_size = bs_size / (uint64_t)n->num_namespaces;
-
- n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
- n->sq = g_new0(NvmeSQueue *, n->num_queues);
- n->cq = g_new0(NvmeCQueue *, n->num_queues);
-
- memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
- "nvme", n->reg_size);
- pci_register_bar(pci_dev, 0,
- PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
- &n->iomem);
- msix_init_exclusive_bar(pci_dev, n->num_queues, 4, NULL);
+ uint8_t *pci_conf = n->parent_obj.config;
id->vid = cpu_to_le16(pci_get_word(pci_conf + PCI_VENDOR_ID));
id->ssvid = cpu_to_le16(pci_get_word(pci_conf +
PCI_SUBSYSTEM_VENDOR_ID));
@@ -1367,16 +1631,25 @@ static void nvme_realize(PCIDevice *pci_dev,
Error **errp)
id->ieee[0] = 0x00;
id->ieee[1] = 0x02;
id->ieee[2] = 0xb3;
- id->oacs = cpu_to_le16(0);
+ id->oacs = cpu_to_le16(0x8); // Namespace Management Supported
+
id->frmw = 7 << 1;
id->lpa = 1 << 0;
id->sqes = (0x6 << 4) | 0x6;
id->cqes = (0x4 << 4) | 0x4;
- id->nn = cpu_to_le32(n->num_namespaces);
+ id->mnan = 0;
+ id->nn = NVME_MAX_NUM_NAMESPACES;
id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
id->psd[0].mp = cpu_to_le16(0x9c4);
id->psd[0].enlat = cpu_to_le32(0x10);
id->psd[0].exlat = cpu_to_le32(0x4);
+ id->tnvmcap = n->nvm_capacity;
+ id->unvmcap = 0;
+ id->hmpre = n->hmpre;
+ id->hmmin = n->hmmin;
+
+ snprintf ((char*)id->subnqn, sizeof(id->subnqn), "QEMU NVMe
Subsystem 1.2 Compatible");
+
if (blk_enable_write_cache(n->conf.blk)) {
id->vwc = 1;
}
@@ -1387,10 +1660,34 @@ static void nvme_realize(PCIDevice *pci_dev,
Error **errp)
NVME_CAP_SET_AMS(n->bar.cap, 1);
NVME_CAP_SET_TO(n->bar.cap, 0xf);
NVME_CAP_SET_CSS(n->bar.cap, 1);
+ NVME_CAP_SET_MPSMIN(n->bar.cap, 0);
NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
n->bar.vs = 0x00010200;
n->bar.intmc = n->bar.intms = 0;
+}
+
+static void nvme_init_pci(NvmeCtrl *n) {
+ uint8_t *pci_conf = n->parent_obj.config;
+
+ pci_conf[PCI_INTERRUPT_PIN] = 1;
+ pci_config_set_prog_interface(pci_conf, 0x2);
+ pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
+
+
+ pci_config_set_device_id(pci_conf, 0x5845);
+ pcie_endpoint_cap_init(&n->parent_obj, 0x80);
+
+ memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n, "nvme",
+ n->reg_size);
+
+
+ pci_register_bar(&n->parent_obj, 0,
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_64,
+ &n->iomem);
+
+
+ msix_init_exclusive_bar(&n->parent_obj, n->num_queues, 4, NULL);
if (n->cmb_size_mb) {
@@ -1406,20 +1703,31 @@ static void nvme_realize(PCIDevice *pci_dev,
Error **errp)
NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->cmb_size_mb);
n->cmbloc = n->bar.cmbloc;
- n->cmbsz = n->bar.cmbsz;
+ n->cmbsz = n->bar.cmbsz;
n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
memory_region_init_io(&n->ctrl_mem, OBJECT(n), &nvme_cmb_ops, n,
"nvme-cmb",
NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
- pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
- PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
- PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
-
+ pci_register_bar(&n->parent_obj, NVME_CMBLOC_BIR(n->bar.cmbloc),
+ PCI_BASE_ADDRESS_SPACE_MEMORY |
PCI_BASE_ADDRESS_MEM_TYPE_64 |
+ PCI_BASE_ADDRESS_MEM_PREFETCH, &n->ctrl_mem);
}
+}
+
+/**
+ * Divides up the total block space between all requested namespaces.
+ * @param n
+ */
+static void nvme_init_namespaces(NvmeCtrl *n)
+{
+ uint8_t i;
for (i = 0; i < n->num_namespaces; i++) {
+ uint64_t blks;
+ int lba_index;
NvmeNamespace *ns = &n->namespaces[i];
NvmeIdNs *id_ns = &ns->id_ns;
+
id_ns->nsfeat = 0;
id_ns->nlbaf = 0;
id_ns->flbas = 0;
@@ -1427,12 +1735,65 @@ static void nvme_realize(PCIDevice *pci_dev,
Error **errp)
id_ns->dpc = 0;
id_ns->dps = 0;
id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
- id_ns->ncap = id_ns->nuse = id_ns->nsze =
- cpu_to_le64(n->ns_size >>
- id_ns->lbaf[NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas)].ds);
+ id_ns->nsze = n->nvm_capacity / (uint64_t)n->num_namespaces;
+
+ lba_index = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
+ blks = id_ns->nsze / (1 << id_ns->lbaf[lba_index].ds);
+ id_ns->nuse = id_ns->ncap = id_ns->nsze = cpu_to_le64(blks);
+ id_ns->nvmcap = id_ns->nsze * (1 << id_ns->lbaf[lba_index].ds);
+
+ ns->id = i + 1;
+ ns->start_byte_index = (i * id_ns->nsze) >> BDRV_SECTOR_BITS;
+ ns->created = true;
+ ns->ctrl = n; /* attached */
+
}
}
+/**
+ * Realize the NVMe PCI device: validate the user-supplied properties, size
+ * the register space, allocate the queue and namespace tables, and run the
+ * PCI, controller and namespace initialisation steps.
+ *
+ * @param pci_dev  device being realized
+ * @param errp     receives the error when a property is invalid or the
+ *                 block backend cannot be set up
+ */
+static void nvme_realize(PCIDevice *pci_dev, Error **errp)
+{
+    NvmeCtrl *n = NVME(pci_dev);
+    int64_t bs_size;
+
+    /* The queue tables and the register size below need at least one queue. */
+    if (!n->num_queues) {
+        error_setg(errp, "num_queues can't be zero");
+        return;
+    }
+
+    /* The namespace table is allocated with a fixed number of slots. */
+    if (n->num_namespaces > NVME_MAX_NUM_NAMESPACES) {
+        error_setg(errp, "namespaces can't exceed %d",
+                   NVME_MAX_NUM_NAMESPACES);
+        return;
+    }
+
+    if (!n->conf.blk) {
+        error_setg(errp, "drive property not set");
+        return;
+    }
+
+    bs_size = blk_getlength(n->conf.blk);
+    if (bs_size < 0) {
+        error_setg(errp, "could not get backing file size");
+        return;
+    }
+
+    if (!n->serial) {
+        error_setg(errp, "serial property not set");
+        return;
+    }
+    blkconf_blocksizes(&n->conf);
+    /* Hand the failure to the caller through errp instead of only logging. */
+    if (!blkconf_apply_backend_options(&n->conf,
+                                       blk_is_read_only(n->conf.blk),
+                                       false, errp)) {
+        return;
+    }
+
+    n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
+    n->nvm_capacity = bs_size;
+    n->sq = g_new0(NvmeSQueue *, n->num_queues);
+    n->cq = g_new0(NvmeCQueue *, n->num_queues);
+    n->namespaces = g_new0(NvmeNamespace, NVME_MAX_NUM_NAMESPACES);
+
+    nvme_init_pci(n);
+    nvme_init_ctrl(n);
+    nvme_init_namespaces(n);
+}
+
static void nvme_exit(PCIDevice *pci_dev)
{
NvmeCtrl *n = NVME(pci_dev);
@@ -1451,6 +1812,7 @@ static void nvme_exit(PCIDevice *pci_dev)
static Property nvme_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+ DEFINE_PROP_UINT32("namespaces", NvmeCtrl, num_namespaces, 1),
DEFINE_PROP_UINT32("cmb_size_mb", NvmeCtrl, cmb_size_mb, 0),
DEFINE_PROP_UINT32("num_queues", NvmeCtrl, num_queues, 64),
DEFINE_PROP_END_OF_LIST(),
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index 557194ee19..c182dcb10a 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -9,6 +9,7 @@ typedef struct NvmeAsyncEvent {
typedef struct NvmeRequest {
struct NvmeSQueue *sq;
+ struct NvmeNamespace *ns;
BlockAIOCB *aiocb;
uint16_t status;
bool has_sg;
@@ -50,7 +51,16 @@ typedef struct NvmeCQueue {
} NvmeCQueue;
typedef struct NvmeNamespace { /* one slot of the controller's namespace table */
+ struct NvmeCtrl *ctrl; /* controller the namespace is attached to; NULL when not attached */
+ bool created; /* set by namespace create / init, cleared by namespace delete */
NvmeIdNs id_ns; /* Identify Namespace data for this slot */
+ NvmeRangeType lba_range[64];
+ unsigned long *util;
+ unsigned long *uncorrectable;
+ uint32_t id; /* namespace ID, assigned as slot index + 1 */
+ uint64_t start_byte_index; /* offset added to the LBA-derived byte offset in nvme_rw */
+ uint64_t meta_start_offset;
+ BlockConf conf;
} NvmeNamespace;
#define TYPE_NVME "nvme"
@@ -64,23 +74,66 @@ typedef struct NvmeCtrl {
NvmeBar bar;
BlockConf conf;
- uint32_t page_size;
+ time_t start_time;
+ uint16_t temperature;
+ uint16_t page_size;
uint16_t page_bits;
uint16_t max_prp_ents;
uint16_t cqe_size;
uint16_t sqe_size;
+ uint16_t oacs;
+ uint16_t oncs;
uint32_t reg_size;
uint32_t num_namespaces;
uint32_t num_queues;
uint32_t max_q_ents;
- uint64_t ns_size;
+ uint64_t nvm_capacity;
+ uint8_t db_stride;
+ uint8_t aerl;
+ uint8_t acl;
+ uint8_t elpe;
+ uint8_t elp_index;
+ uint8_t error_count;
+ uint8_t mdts;
+ uint8_t cqr;
+ uint8_t max_sqes;
+ uint8_t max_cqes;
+ uint8_t meta;
+ uint8_t vwc;
+ uint8_t mc;
+ uint8_t dpc;
+ uint8_t dps;
+ uint8_t nlbaf;
+ uint8_t extended;
+ uint8_t lba_index;
+ uint8_t mpsmin;
+ uint8_t mpsmax;
+ uint8_t intc;
+ uint8_t intc_thresh;
+ uint8_t intc_time;
+ uint8_t outstanding_aers;
+ uint8_t temp_warn_issued;
+ uint8_t num_errors;
+ uint8_t cqes_pending;
+ uint16_t vid;
+ uint16_t did;
uint32_t cmb_size_mb;
uint32_t cmbsz;
uint32_t cmbloc;
+ uint32_t sriov_total_vfs;
uint8_t *cmbuf;
uint64_t irq_status;
uint64_t host_timestamp; /* Timestamp sent by
the host */
uint64_t timestamp_set_qemu_clock_ms; /* QEMU clock time */
+ uint8_t ehm;
+ uint8_t hsize;
+ uint32_t hmdlal;
+ uint32_t hmdlua;
+ uint32_t hmdlec;
+ uint8_t *hmbuf;
+ uint32_t hmmin;
+ uint32_t hmpre;
+
char *serial;
NvmeNamespace *namespaces;
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 3ec8efcc43..8c1e8c6cdc 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -17,6 +17,16 @@ typedef struct NvmeBar {
uint32_t cmbsz;
} NvmeBar;
+enum NvmeNsSelect { /* Namespace Attachment actions, matched against CDW10 */
+ NVME_NS_CONTROLLER_ATTACH = 0, /* handled by nvme_namespace_controller_attach */
+ NVME_NS_CONTROLLER_DETACH = 1, /* handled by nvme_namespace_controller_detach */
+};
+
+enum NvmeNsManagement { /* Namespace Management actions, matched against CDW10 */
+ NVME_NS_CREATE = 0, /* handled by nvme_namespace_create */
+ NVME_NS_DELETE = 1, /* handled by nvme_namespace_delete */
+};
+
enum NvmeCapShift {
CAP_MQES_SHIFT = 0,
CAP_CQR_SHIFT = 16,
@@ -233,13 +243,31 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_SET_FEATURES = 0x09,
NVME_ADM_CMD_GET_FEATURES = 0x0a,
NVME_ADM_CMD_ASYNC_EV_REQ = 0x0c,
+ NVME_ADM_CMD_NS_MANAGEMENT = 0x0d,
NVME_ADM_CMD_ACTIVATE_FW = 0x10,
NVME_ADM_CMD_DOWNLOAD_FW = 0x11,
+ NVME_ADM_CMD_NS_ATTACH = 0x15,
+ NVME_ADM_VIRT_MANAGEMENT = 0x1C,
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
};
+
+enum NvmeAdminCns { /* Identify CNS values, matched against c->cns in nvme_identify */
+ NVME_ADM_CNS_ID_NS = 0x00, /* dispatched to nvme_identify_ns */
+ NVME_ADM_CNS_ID_CTRL = 0x01, /* dispatched to nvme_identify_ctrl */
+ NVME_ADM_CNS_ID_NS_LIST = 0x02, /* dispatched to nvme_identify_nslist */
+ NVME_ADM_CNS_NS_DESC_LIST = 0x03,
+ NVME_ADM_CNS_NVM_SET_LIST = 0x04,
+ NVME_ADM_CNS_ID_NS_LIST_ALLOC = 0x10, /* dispatched to nvme_identify_ns_allocated */
+ NVME_ADM_CNS_ID_NS_ALLOC = 0x11, /* dispatched to nvme_identify_ns */
+ NVME_ADM_CNS_CTRL_LIST_NS_ATT = 0x12,
+ NVME_ADM_CNS_CTRL_LIST = 0x13,
+ NVME_ADM_CNS_PRIM_CTRL_CAP = 0x14,
+ NVME_ADM_CNS_SEC_CTRL_LIST = 0x15,
+};
+
enum NvmeIoCommands {
NVME_CMD_FLUSH = 0x00,
NVME_CMD_WRITE = 0x01,
@@ -427,6 +455,17 @@ enum NvmeStatusCodes {
NVME_CMD_ABORT_MISSING_FUSE = 0x000a,
NVME_INVALID_NSID = 0x000b,
NVME_CMD_SEQ_ERROR = 0x000c,
+ NVME_NS_INSUFF_CAP = 0x0015,
+ NVME_NS_ID_UNAVAILABLE = 0x0016,
+ NVME_NS_ALREADY_ATTACHED = 0x0018,
+ NVME_NS_PRIVATE = 0x0019,
+ NVME_NS_NOT_ATTACHED = 0x001A,
+ NVME_THIN_PROV_NOT_SUP = 0x001B,
+ NVME_CTRL_LIST_INVALID = 0x001C,
+ NVME_INVALID_CTRL_ID = 0x001F,
+ NVME_INVALID_SEC_CTRL_ST = 0x0020,
+ NVME_INVALID_NUM_CTRL_RES = 0x0021,
+ NVME_INVALID_RES_ID = 0x0022,
NVME_LBA_RANGE = 0x0080,
NVME_CAP_EXCEEDED = 0x0081,
NVME_NS_NOT_READY = 0x0082,
@@ -543,7 +582,20 @@ typedef struct NvmeIdCtrl {
uint8_t ieee[3];
uint8_t cmic;
uint8_t mdts;
- uint8_t rsvd255[178];
+ uint16_t cntlid;
+ uint32_t ver;
+ uint8_t rsvd_95[8];
+ uint32_t oaes;
+ uint32_t ctratt;
+ uint16_t rrls;
+ uint8_t rsvd110[9];
+ uint8_t cntrltype;
+ uint64_t fguid;
+ uint64_t fguid_u;
+ uint16_t crdt1;
+ uint16_t crdt2;
+ uint16_t crdt3;
+ uint8_t rsvd255[122];
uint16_t oacs;
uint8_t acl;
uint8_t aerl;
@@ -551,10 +603,39 @@ typedef struct NvmeIdCtrl {
uint8_t lpa;
uint8_t elpe;
uint8_t npss;
- uint8_t rsvd511[248];
+ uint8_t avscc;
+ uint8_t apsta;
+ uint16_t wctemp;
+ uint16_t cctemp;
+ uint16_t mtfa;
+ uint32_t hmpre;
+ uint32_t hmmin;
+ uint64_t tnvmcap;
+ uint64_t tnvmcap_u;
+ uint64_t unvmcap;
+ uint64_t unvmcap_u;
+ uint32_t rpmbs;
+ uint16_t edstt;
+ uint8_t dsto;
+ uint8_t fwug;
+ uint16_t kas;
+ uint16_t hctma;
+ uint16_t mntmt;
+ uint16_t mxtmt;
+ uint32_t sanicap;
+ uint32_t hmminds;
+ uint16_t hmmaxd;
+ uint16_t nsetidmax;
+ uint16_t endgidmax;
+ uint8_t anatt;
+ uint8_t anacap;
+ uint32_t anagrpmax;
+ uint32_t nanagrpid;
+ uint32_t pels;
+ uint8_t rsvd511[156];
uint8_t sqes;
uint8_t cqes;
- uint16_t rsvd515;
+ uint16_t maxcmd;
uint32_t nn;
uint16_t oncs;
uint16_t fuses;
@@ -562,8 +643,15 @@ typedef struct NvmeIdCtrl {
uint8_t vwc;
uint16_t awun;
uint16_t awupf;
- uint8_t rsvd703[174];
- uint8_t rsvd2047[1344];
+ uint8_t nvscc;
+ uint8_t nwpc;
+ uint16_t acwu;
+ uint8_t rsvd535[2];
+ uint32_t sgls;
+ uint32_t mnan;
+ uint8_t rsvd767[224];
+ uint8_t subnqn[256];
+ uint8_t rsvd2047[1024];
NvmePSD psd[32];
uint8_t vs[1024];
} NvmeIdCtrl;
@@ -653,9 +741,35 @@ typedef struct NvmeIdNs {
uint8_t mc;
uint8_t dpc;
uint8_t dps;
- uint8_t res30[98];
+ uint8_t nmic;
+ uint8_t rescap;
+ uint8_t fpi;
+ uint8_t dlfeat;
+ uint16_t nawun;
+ uint16_t nawupf;
+ uint16_t nacwu;
+ uint16_t nabsn;
+ uint16_t nabo;
+ uint16_t nabspf;
+ uint16_t noiob;
+ uint64_t nvmcap;
+ uint64_t nvmcap_u;
+ uint16_t npwg;
+ uint16_t npwa;
+ uint16_t npdg;
+ uint16_t npda;
+ uint16_t nows;
+ uint8_t rsvd91[18];
+ uint32_t anagrpid;
+ uint8_t rsvd98[3];
+ uint8_t nsattr;
+ uint16_t nvmsetid;
+ uint16_t endgid;
+ uint64_t nguid;
+ uint64_t nguid_u;
+ uint64_t eui64;
NvmeLBAF lbaf[16];
- uint8_t res192[192];
+ uint8_t rsvd383[192];
uint8_t vs[3712];
} NvmeIdNs;
--
2.17.1
On Tue, Jul 02, 2019 at 10:39:36AM -0700, Matt Fitzpatrick wrote: > Adding namespace management support to the nvme device. Namespace creation > requires contiguous block space for a simple method of allocation. I guess that means this won't handle creating a large namespace from fragmented unallocated space after various create+delete scenarios. Capping the create size to the max contiguous extent may not be so bad, but it may be confusing when UNVMCAP exceeds the largest possible namespace you can create when we have no good way to report the max possible creation size.
On Tue, Jul 02, 2019 at 10:39:36AM -0700, Matt Fitzpatrick wrote: > Adding namespace management support to the nvme device. Namespace creation > requires contiguous block space for a simple method of allocation. > > I wrote this a few years ago based on Keith's fork and nvmeqemu fork and > have recently re-synced with the latest trunk. Some data structures in > nvme.h are a bit more filled out that strictly necessary as this is also the > base for sr-iov and IOD patched to be submitted later. > Hi Matt, Nice! I'm always happy when new features for the nvme device is posted! I'll be happy to review it, but I won't start going through it in details because I believe the approach to supporting multiple namespaces is flawed. We had a recent discussion on this and I also got some unrelated patches rejected due to implementing it similarly by carving up the image. I have posted a long series that includes a patch for multiple namespaces. It is implemented by introducing a fresh `nvme-ns` device model that represents a namespace and attaches to a bus created by the parent `nvme` controller device. The core issue is that a qemu image /should/ be attachable to other devices (say ide) and not strictly tied to the one device model. Thus, we cannot just shove a bunch of namespaces into a single image. But, in light of your patch, I'm not convinced that my implementation is the correct solution. Maybe the abstraction should not be an `nvme-ns` device, but a `nvme-nvm` device that when attached changes TNVMCAP and UNVMCAP? Maybe you have some input for this? Or we could have both and dynamically create the nvme-ns devices on top of nvme-nvm devices. I think it would still require a 1-to-1 mapping, but it could be a way to support the namespace management capability. Cheers, Klaus
Hey Klaus, Sorry for the late reply! I finally found this message amid the pile of emails Qemu dumped on me. I don't know what the right answer is here... NVMe is designed in a way where you *do* "carve up" the flash into logical groupings and the nvme firmware decides on how that's done. Those logical groupings can be attached to different controllers(which we don't have here yet?) after init, but that's a problem for future us I guess?But that's all stuff you already know. The "nvme-nvm" solution might be the right approach, but I'm a bit hesitant on the idea of growing tnvmcap... I can't think of any way to create namespaces on the fly and not have it use some single existing block backend, unless we defined a range of block images on qemu start and namespace create/attach only uses one image up to and including it's max size per namespace? That might work, and I think that's what you suggested (or at least is similar to), though it could be pretty wasteful. It wouldn't offer a "true" namespace management support, but could be close enough. I'm in the middle of going through the patch you posted. Nice job! I'm glad to see more people adding enhancements. It was pretty stale for years. -Matt On 7/5/19 12:50 AM, Klaus Birkelund wrote: > On Tue, Jul 02, 2019 at 10:39:36AM -0700, Matt Fitzpatrick wrote: >> Adding namespace management support to the nvme device. Namespace creation >> requires contiguous block space for a simple method of allocation. >> >> I wrote this a few years ago based on Keith's fork and nvmeqemu fork and >> have recently re-synced with the latest trunk. Some data structures in >> nvme.h are a bit more filled out that strictly necessary as this is also the >> base for sr-iov and IOD patched to be submitted later. >> > Hi Matt, > > Nice! I'm always happy when new features for the nvme device is posted! 
> > I'll be happy to review it, but I won't start going through it in > details because I believe the approach to supporting multiple namespaces > is flawed. We had a recent discussion on this and I also got some > unrelated patches rejected due to implementing it similarly by carving > up the image. > > I have posted a long series that includes a patch for multiple > namespaces. It is implemented by introducing a fresh `nvme-ns` device > model that represents a namespace and attaches to a bus created by the > parent `nvme` controller device. > > The core issue is that a qemu image /should/ be attachable to other > devices (say ide) and not strictly tied to the one device model. Thus, > we cannot just shove a bunch of namespaces into a single image. > > But, in light of your patch, I'm not convinced that my implementation is > the correct solution. Maybe the abstraction should not be an `nvme-ns` > device, but a `nvme-nvm` device that when attached changes TNVMCAP and > UNVMCAP? Maybe you have some input for this? Or we could have both and > dynamically create the nvme-ns devices on top of nvme-nvm devices. I > think it would still require a 1-to-1 mapping, but it could be a way to > support the namespace management capability. > > > Cheers, > Klaus >
On Mon, Jul 08, 2019 at 03:52:29PM -0700, Matt Fitzpatrick wrote: > Hey Klaus, > > Sorry for the late reply! I finally found this message amid the pile of > emails Qemu dumped on me. > > I don't know what the right answer is here... NVMe is designed in a way > where you *do* "carve up" the flash into logical groupings and the nvme > firmware decides on how that's done. Those logical groupings can be attached > to different controllers(which we don't have here yet?) after init, but > that's a problem for future us I guess?But that's all stuff you already > know. > Yeah, I havn't started worrying about that ;) > The "nvme-nvm" solution might be the right approach, but I'm a bit hesitant > on the idea of growing tnvmcap... > > I can't think of any way to create namespaces on the fly and not have it use > some single existing block backend, unless we defined a range of block > images on qemu start and namespace create/attach only uses one image up to > and including it's max size per namespace? That might work, and I think > that's what you suggested (or at least is similar to), though it could be > pretty wasteful. It wouldn't offer a "true" namespace management support, > but could be close enough. > Having an emulated device that supports namespace management would be very useful for testing software, but yeah, I have a hard time seeing how we can make that fit with the current "QEMU model". > I'm in the middle of going through the patch you posted. Nice job! I'm glad > to see more people adding enhancements. It was pretty stale for years. > Thanks for looking at it, I know it's a lot to go through ;) > -Matt > On 7/5/19 12:50 AM, Klaus Birkelund wrote: > > On Tue, Jul 02, 2019 at 10:39:36AM -0700, Matt Fitzpatrick wrote: > > > Adding namespace management support to the nvme device. Namespace creation > > > requires contiguous block space for a simple method of allocation. 
> > > > > > I wrote this a few years ago based on Keith's fork and nvmeqemu fork and > > > have recently re-synced with the latest trunk. Some data structures in > > > nvme.h are a bit more filled out that strictly necessary as this is also the > > > base for sr-iov and IOD patched to be submitted later. > > > > > Hi Matt, > > > > Nice! I'm always happy when new features for the nvme device is posted! > > > > I'll be happy to review it, but I won't start going through it in > > details because I believe the approach to supporting multiple namespaces > > is flawed. We had a recent discussion on this and I also got some > > unrelated patches rejected due to implementing it similarly by carving > > up the image. > > > > I have posted a long series that includes a patch for multiple > > namespaces. It is implemented by introducing a fresh `nvme-ns` device > > model that represents a namespace and attaches to a bus created by the > > parent `nvme` controller device. > > > > The core issue is that a qemu image /should/ be attachable to other > > devices (say ide) and not strictly tied to the one device model. Thus, > > we cannot just shove a bunch of namespaces into a single image. > > > > But, in light of your patch, I'm not convinced that my implementation is > > the correct solution. Maybe the abstraction should not be an `nvme-ns` > > device, but a `nvme-nvm` device that when attached changes TNVMCAP and > > UNVMCAP? Maybe you have some input for this? Or we could have both and > > dynamically create the nvme-ns devices on top of nvme-nvm devices. I > > think it would still require a 1-to-1 mapping, but it could be a way to > > support the namespace management capability. > > > > > > Cheers, > > Klaus > > > Hi Kevin, This highlights another situation where the "1 image to 1 block device" model doesn't fit that well with NVMe. Especially with the introduction of "NVM Sets" in NVMe 1.4. 
It would be very nice to introduce a 'nvme-nvmset' device model that adds an NVM Set which the controller can then create namespaces in. Is it completely unacceptable for a device to use the image in such a way that it would not make sense (aka present the same block device) when attached to another device (ide, ...)? I really have a hard time seeing how we could support these features without violating the '1 image to 1 block device" model. Cheers, Klaus
© 2016 - 2024 Red Hat, Inc.