patchew Login
Hi Alex, I was looking at your recent fix for SeaBIOS NVME. I agree that it is not valid to write to the f-segment during runtime. However, I was struggling to understand the PRP list management in the nvme.c code. I came up with a different implementation that avoids allocating another buffer in the "high zone". Instead, the code in this patch series builds the page list in the existing "dma bounce buffer". I don't have a good way to test this on real hardware. It passes simple qemu tests (both with and without kvm). For example: qemu-system-x86_64 -k en-us -snapshot -L test -chardev stdio,id=seabios -device isa-debugcon,iobase=0x402,chardev=seabios -m 512 -drive file=dos-drivec,if=none,id=drive0 -device nvme,drive=drive0,serial=nvme-1 -boot menu=on Thanks, -Kevin Kevin O'Connor (5): nvme: Rework nvme_io_readwrite() to return -1 on error nvme: Add nvme_bounce_xfer() helper function nvme: Convert nvme_build_prpl() to nvme_prpl_xfer() nvme: Pass prp1 and prp2 directly to nvme_io_xfer() nvme: Build the page list in the existing dma buffer src/hw/nvme-int.h | 7 --- src/hw/nvme.c | 143 ++++++++++++++++++++-------------------------- 2 files changed, 61 insertions(+), 89 deletions(-) -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Rename nvme_io_readwrite() to nvme_io_xfer() and change it so it implements the debugging dprintf() and it returns -1 on an error. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { u32 buf_addr = (u32)buf; void *prp2; @@ -XXX,XX +XXX,XX @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, if (buf_addr & 0x3) { /* Buffer is misaligned */ warn_internalerror(); - return DISK_RET_EBADTRACK; + return -1; } if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { @@ -XXX,XX +XXX,XX @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, dprintf(2, "read io: %08x %08x %08x %08x\n", cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); - return DISK_RET_EBADTRACK; + return -1; } - return DISK_RET_SUCCESS; + dprintf(5, "ns %u %s lba %llu+%u\n", ns->ns_id, write ? 
"write" : "read", + lba, count); + return count; } static void nvme_reset_prpl(struct nvme_namespace *ns) @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - int res = DISK_RET_SUCCESS; u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks; - for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) { + for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size; blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); if (blocks) { - res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK; } else { blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks; @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); } - res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba + i, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, + blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK; - if (!write && res == DISK_RET_SUCCESS) { + if (!write) { memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); } } @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) i += blocks; } - return res; + return DISK_RET_SUCCESS; } int -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Move bounce buffer processing to a new helper function. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return count; } +// Transfer up to one page of data using the internal dma bounce buffer +static int +nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) +{ + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + u16 blocks = count < max_blocks ? count : max_blocks; + + if (write) + memcpy(ns->dma_buffer, buf, blocks * ns->block_size); + + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + + if (!write && res >= 0) + memcpy(buf, ns->dma_buffer, res * ns->block_size); + + return res; +} + static void nvme_reset_prpl(struct nvme_namespace *ns) { ns->prpl_len = 0; @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks; for (i = 0; i < op->count;) { @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) if (res < 0) return DISK_RET_EBADTRACK; } else { - blocks = blocks_remaining < max_blocks ? 
blocks_remaining - : max_blocks; - - if (write) { - memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); - } - - int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, - blocks, write); + int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); if (res < 0) return DISK_RET_EBADTRACK; - - if (!write) { - memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); - } + blocks = res; } i += blocks; -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Rename nvme_build_prpl() to nvme_prpl_xfer() and directly invoke nvme_io_xfer() or nvme_bounce_xfer() from that function. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme-int.h | 1 - src/hw/nvme.c | 42 ++++++++++++++++-------------------------- 2 files changed, 16 insertions(+), 27 deletions(-) diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -XXX,XX +XXX,XX @@ struct nvme_namespace { /* Page List */ u32 prpl_len; - void *prp1; u64 prpl[NVME_MAX_PRPL_ENTRIES]; }; diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) return 0; } -static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) +// Transfer data using page list (if applicable) +static int +nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { int first_page = 1; - u32 base = (long)op_buf; + u32 base = (long)buf; s32 size; if (count > ns->max_req_size) @@ -XXX,XX +XXX,XX @@ static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ - if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) { - ns->prp1 = op_buf; - return count; - } + if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) + return nvme_io_xfer(ns, lba, buf, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write); /* Make sure a full block fits into the last chunk */ if (size & (ns->block_size - 1ULL)) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write); for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { if (first_page) { /* First page is special */ - ns->prp1 = (void*)base; first_page = 0; continue; 
} if (nvme_add_prpl(ns, base)) - return 0; + return nvme_bounce_xfer(ns, lba, buf, count, write); } - return count; + return nvme_io_xfer(ns, lba, buf, count, write); } static int @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 i, blocks; - + int i; for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size; - - blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); - if (blocks) { - int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - } else { - int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - blocks = res; - } - + int blocks = nvme_prpl_xfer(ns, op->lba + i, op_buf, + blocks_remaining, write); + if (blocks < 0) + return DISK_RET_EBADTRACK; i += blocks; } -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
When using a prp2 parameter, build it in nvme_prpl_xfer() and pass it directly to nvme_io_xfer(). Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ err: /* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross page boundaries. */ static int -nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *prp1, void *prp2, + u16 count, int write) { - u32 buf_addr = (u32)buf; - void *prp2; - - if (buf_addr & 0x3) { + if (((u32)prp1 & 0x3) || ((u32)prp2 & 0x3)) { /* Buffer is misaligned */ warn_internalerror(); return -1; } - if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; - } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { - /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; - } else { - /* One page is enough, don't expose anything else */ - prp2 = NULL; - } - struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? 
NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, - NULL, buf, prp2); + NULL, prp1, prp2); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32); @@ -XXX,XX +XXX,XX @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (write) memcpy(ns->dma_buffer, buf, blocks * ns->block_size); - int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, NULL, blocks, write); if (!write && res >= 0) memcpy(buf, ns->dma_buffer, res * ns->block_size); @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) - return nvme_io_xfer(ns, lba, buf, count, write); + return nvme_io_xfer(ns, lba, buf, NULL, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return nvme_bounce_xfer(ns, lba, buf, count, write); } - return nvme_io_xfer(ns, lba, buf, count, write); + void *prp2; + if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + /* We need to describe more than 2 pages, rely on PRP List */ + prp2 = ns->prpl; + } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { + /* Directly embed the 2nd page if we only need 2 pages */ + prp2 = (void *)(long)ns->prpl[0]; + } else { + /* One page is enough, don't expose anything else */ + prp2 = NULL; + } + return nvme_io_xfer(ns, lba, buf, prp2, count, write); } static int -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct. Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic. This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed. Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier <matt.devillier@gmail.com> Signed-off-by: Alexander Graf <graf@amazon.com> Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme-int.h | 6 ------ src/hw/nvme.c | 51 +++++++++++++++++------------------------------ 2 files changed, 18 insertions(+), 39 deletions(-) diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -XXX,XX +XXX,XX @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device -#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ - /* Data structures */ /* The register file of a NVMe host controller. This struct follows the naming @@ -XXX,XX +XXX,XX @@ struct nvme_namespace { /* Page aligned buffer of size NVME_PAGE_SIZE. 
*/ char *dma_buffer; - - /* Page List */ - u32 prpl_len; - u64 prpl[NVME_MAX_PRPL_ENTRIES]; }; /* Data structures for NVMe admin identify commands */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; } -static void nvme_reset_prpl(struct nvme_namespace *ns) -{ - ns->prpl_len = 0; -} - -static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{ - if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES) - return -1; - - ns->prpl[ns->prpl_len++] = base; - - return 0; -} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ // Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) { - int first_page = 1; u32 base = (long)buf; s32 size; if (count > ns->max_req_size) count = ns->max_req_size; - nvme_reset_prpl(ns); - size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) return nvme_bounce_xfer(ns, lba, buf, count, write); - for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { - if (first_page) { - /* First page is special */ - first_page = 0; - continue; - } - if (nvme_add_prpl(ns, base)) - return nvme_bounce_xfer(ns, lba, buf, count, write); - } - - void *prp2; if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; + /* We need to describe more than 2 pages, build PRP List */ + u32 prpl_len = 0; + u64 *prpl = (void*)ns->dma_buffer; + int first_page = 1; + for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { + if (first_page) { + /* First page is special */ + first_page 
= 0; + continue; + } + if (prpl_len >= NVME_MAX_PRPL_ENTRIES) + return nvme_bounce_xfer(ns, lba, buf, count, write); + prpl[prpl_len++] = base; + } + return nvme_io_xfer(ns, lba, buf, prpl, count, write); } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; + return nvme_io_xfer(ns, lba, buf, buf + NVME_PAGE_SIZE, count, write); } else { /* One page is enough, don't expose anything else */ - prp2 = NULL; + return nvme_io_xfer(ns, lba, buf, NULL, count, write); } - return nvme_io_xfer(ns, lba, buf, prp2, count, write); } static int -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
This is a resend of the previous series. Changes since last time: * Patch 1: Fix function comment on nvme_io_xfer() * Patch 3-5: Added "goto bounce" to nvme_io_xfer() as suggested by Alex * Patch 6: Added separate "single dma bounce buffer" patch to this series * Patch 6: Add checking for malloc failure -Kevin Kevin O'Connor (6): nvme: Rework nvme_io_readwrite() to return -1 on error nvme: Add nvme_bounce_xfer() helper function nvme: Convert nvme_build_prpl() to nvme_prpl_xfer() nvme: Pass prp1 and prp2 directly to nvme_io_xfer() nvme: Build the page list in the existing dma buffer nvme: Only allocate one dma bounce buffer for all nvme drives src/hw/nvme-int.h | 10 --- src/hw/nvme.c | 163 ++++++++++++++++++++++------------------------ 2 files changed, 78 insertions(+), 95 deletions(-) -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Rename nvme_io_readwrite() to nvme_io_xfer() and change it so it implements the debugging dprintf() and it returns -1 on an error. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> Reviewed-by: Alexander Graf <graf@amazon.com> --- src/hw/nvme.c | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ err: return -1; } -/* Reads count sectors into buf. Returns DISK_RET_*. The buffer cannot cross - page boundaries. */ +/* Reads count sectors into buf. The buffer cannot cross page boundaries. */ static int -nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { u32 buf_addr = (u32)buf; void *prp2; @@ -XXX,XX +XXX,XX @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, if (buf_addr & 0x3) { /* Buffer is misaligned */ warn_internalerror(); - return DISK_RET_EBADTRACK; + return -1; } if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { @@ -XXX,XX +XXX,XX @@ nvme_io_readwrite(struct nvme_namespace *ns, u64 lba, char *buf, u16 count, dprintf(2, "read io: %08x %08x %08x %08x\n", cqe.dword[0], cqe.dword[1], cqe.dword[2], cqe.dword[3]); - return DISK_RET_EBADTRACK; + return -1; } - return DISK_RET_SUCCESS; + dprintf(5, "ns %u %s lba %llu+%u\n", ns->ns_id, write ? 
"write" : "read", + lba, count); + return count; } static void nvme_reset_prpl(struct nvme_namespace *ns) @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - int res = DISK_RET_SUCCESS; u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks; - for (i = 0; i < op->count && res == DISK_RET_SUCCESS;) { + for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size; blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); if (blocks) { - res = nvme_io_readwrite(ns, op->lba + i, ns->prp1, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK; } else { blocks = blocks_remaining < max_blocks ? blocks_remaining : max_blocks; @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); } - res = nvme_io_readwrite(ns, op->lba + i, ns->dma_buffer, blocks, write); - dprintf(5, "ns %u %s lba %llu+%u: %d\n", ns->ns_id, write ? "write" - : "read", - op->lba + i, blocks, res); + int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, + blocks, write); + if (res < 0) + return DISK_RET_EBADTRACK; - if (!write && res == DISK_RET_SUCCESS) { + if (!write) { memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); } } @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) i += blocks; } - return res; + return DISK_RET_SUCCESS; } int -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Move bounce buffer processing to a new helper function. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> Reviewed-by: Alexander Graf <graf@amazon.com> --- src/hw/nvme.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return count; } +// Transfer up to one page of data using the internal dma bounce buffer +static int +nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) +{ + u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; + u16 blocks = count < max_blocks ? count : max_blocks; + + if (write) + memcpy(ns->dma_buffer, buf, blocks * ns->block_size); + + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + + if (!write && res >= 0) + memcpy(buf, ns->dma_buffer, res * ns->block_size); + + return res; +} + static void nvme_reset_prpl(struct nvme_namespace *ns) { ns->prpl_len = 0; @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 const max_blocks = NVME_PAGE_SIZE / ns->block_size; u16 i, blocks; for (i = 0; i < op->count;) { @@ -XXX,XX +XXX,XX @@ nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) if (res < 0) return DISK_RET_EBADTRACK; } else { - blocks = blocks_remaining < max_blocks ? 
blocks_remaining - : max_blocks; - - if (write) { - memcpy(ns->dma_buffer, op_buf, blocks * ns->block_size); - } - - int res = nvme_io_xfer(ns, op->lba + i, ns->dma_buffer, - blocks, write); + int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); if (res < 0) return DISK_RET_EBADTRACK; - - if (!write) { - memcpy(op_buf, ns->dma_buffer, blocks * ns->block_size); - } + blocks = res; } i += blocks; -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Rename nvme_build_prpl() to nvme_prpl_xfer() and directly invoke nvme_io_xfer() or nvme_bounce_xfer() from that function. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme-int.h | 1 - src/hw/nvme.c | 46 ++++++++++++++++++++-------------------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -XXX,XX +XXX,XX @@ struct nvme_namespace { /* Page List */ u32 prpl_len; - void *prp1; u64 prpl[NVME_MAX_PRPL_ENTRIES]; }; diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) return 0; } -static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) +// Transfer data using page list (if applicable) +static int +nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, + int write) { int first_page = 1; - u32 base = (long)op_buf; + u32 base = (long)buf; s32 size; if (count > ns->max_req_size) @@ -XXX,XX +XXX,XX @@ static int nvme_build_prpl(struct nvme_namespace *ns, void *op_buf, u16 count) size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ - if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) { - ns->prp1 = op_buf; - return count; - } + if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) + return nvme_io_xfer(ns, lba, buf, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) - return 0; + goto bounce; /* Make sure a full block fits into the last chunk */ if (size & (ns->block_size - 1ULL)) - return 0; + goto bounce; for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { if (first_page) { /* First page is special */ - ns->prp1 = (void*)base; first_page = 0; continue; } if (nvme_add_prpl(ns, base)) - return 0; + goto bounce; } - return count; 
+ return nvme_io_xfer(ns, lba, buf, count, write); + +bounce: + /* Use bounce buffer to make transfer */ + return nvme_bounce_xfer(ns, lba, buf, count, write); } static int @@ -XXX,XX +XXX,XX @@ nvme_scan(void) static int nvme_cmd_readwrite(struct nvme_namespace *ns, struct disk_op_s *op, int write) { - u16 i, blocks; - + int i; for (i = 0; i < op->count;) { u16 blocks_remaining = op->count - i; char *op_buf = op->buf_fl + i * ns->block_size; - - blocks = nvme_build_prpl(ns, op_buf, blocks_remaining); - if (blocks) { - int res = nvme_io_xfer(ns, op->lba + i, ns->prp1, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - } else { - int res = nvme_bounce_xfer(ns, op->lba + i, op_buf, blocks, write); - if (res < 0) - return DISK_RET_EBADTRACK; - blocks = res; - } - + int blocks = nvme_prpl_xfer(ns, op->lba + i, op_buf, + blocks_remaining, write); + if (blocks < 0) + return DISK_RET_EBADTRACK; i += blocks; } -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
When using a prp2 parameter, build it in nvme_prpl_xfer() and pass it directly to nvme_io_xfer(). Signed-off-by: Kevin O'Connor <kevin@koconnor.net> Reviewed-by: Alexander Graf <graf@amazon.com> --- src/hw/nvme.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ err: /* Reads count sectors into buf. The buffer cannot cross page boundaries. */ static int -nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, - int write) +nvme_io_xfer(struct nvme_namespace *ns, u64 lba, void *prp1, void *prp2, + u16 count, int write) { - u32 buf_addr = (u32)buf; - void *prp2; - - if (buf_addr & 0x3) { + if (((u32)prp1 & 0x3) || ((u32)prp2 & 0x3)) { /* Buffer is misaligned */ warn_internalerror(); return -1; } - if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; - } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { - /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; - } else { - /* One page is enough, don't expose anything else */ - prp2 = NULL; - } - struct nvme_sqe *io_read = nvme_get_next_sqe(&ns->ctrl->io_sq, write ? 
NVME_SQE_OPC_IO_WRITE : NVME_SQE_OPC_IO_READ, - NULL, buf, prp2); + NULL, prp1, prp2); io_read->nsid = ns->ns_id; io_read->dword[10] = (u32)lba; io_read->dword[11] = (u32)(lba >> 32); @@ -XXX,XX +XXX,XX @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (write) memcpy(ns->dma_buffer, buf, blocks * ns->block_size); - int res = nvme_io_xfer(ns, lba, ns->dma_buffer, blocks, write); + int res = nvme_io_xfer(ns, lba, ns->dma_buffer, NULL, blocks, write); if (!write && res >= 0) memcpy(buf, ns->dma_buffer, res * ns->block_size); @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) - return nvme_io_xfer(ns, lba, buf, count, write); + return nvme_io_xfer(ns, lba, buf, NULL, count, write); /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, goto bounce; } - return nvme_io_xfer(ns, lba, buf, count, write); + void *prp2; + if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + /* We need to describe more than 2 pages, rely on PRP List */ + prp2 = ns->prpl; + } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { + /* Directly embed the 2nd page if we only need 2 pages */ + prp2 = (void *)(long)ns->prpl[0]; + } else { + /* One page is enough, don't expose anything else */ + prp2 = NULL; + } + return nvme_io_xfer(ns, lba, buf, prp2, count, write); bounce: /* Use bounce buffer to make transfer */ -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
Commit 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") introduced multi-page requests using the NVMe PRP mechanism. To store the list and "first page to write to" hints, it added fields to the NVMe namespace struct. Unfortunately, that struct resides in fseg which is read-only at runtime. While KVM ignores the read-only part and allows writes, real hardware and TCG adhere to the semantics and ignore writes to the fseg region. The net effect of that is that reads and writes were always happening on address 0, unless they went through the bounce buffer logic. This patch builds the PRP maintenance data in the existing "dma bounce buffer" and only builds it when needed. Fixes: 01f2736cc905d ("nvme: Pass large I/O requests as PRP lists") Reported-by: Matt DeVillier <matt.devillier@gmail.com> Signed-off-by: Alexander Graf <graf@amazon.com> Signed-off-by: Kevin O'Connor <kevin@koconnor.net> --- src/hw/nvme-int.h | 6 ----- src/hw/nvme.c | 61 +++++++++++++++++++---------------------------- 2 files changed, 24 insertions(+), 43 deletions(-) diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -XXX,XX +XXX,XX @@ #include "types.h" // u32 #include "pcidevice.h" // struct pci_device -#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ - /* Data structures */ /* The register file of a NVMe host controller. This struct follows the naming @@ -XXX,XX +XXX,XX @@ struct nvme_namespace { /* Page aligned buffer of size NVME_PAGE_SIZE. 
*/ char *dma_buffer; - - /* Page List */ - u32 prpl_len; - u64 prpl[NVME_MAX_PRPL_ENTRIES]; }; /* Data structures for NVMe admin identify commands */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, return res; } -static void nvme_reset_prpl(struct nvme_namespace *ns) -{ - ns->prpl_len = 0; -} - -static int nvme_add_prpl(struct nvme_namespace *ns, u64 base) -{ - if (ns->prpl_len >= NVME_MAX_PRPL_ENTRIES) - return -1; - - ns->prpl[ns->prpl_len++] = base; - - return 0; -} +#define NVME_MAX_PRPL_ENTRIES 15 /* Allows requests up to 64kb */ // Transfer data using page list (if applicable) static int nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, int write) { - int first_page = 1; u32 base = (long)buf; s32 size; if (count > ns->max_req_size) count = ns->max_req_size; - nvme_reset_prpl(ns); - size = count * ns->block_size; /* Special case for transfers that fit into PRP1, but are unaligned */ if (((size + (base & ~NVME_PAGE_MASK)) <= NVME_PAGE_SIZE)) - return nvme_io_xfer(ns, lba, buf, NULL, count, write); + goto single; /* Every request has to be page aligned */ if (base & ~NVME_PAGE_MASK) @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, if (size & (ns->block_size - 1ULL)) goto bounce; - for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { - if (first_page) { - /* First page is special */ - first_page = 0; - continue; + /* Build PRP list if we need to describe more than 2 pages */ + if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { + u32 prpl_len = 0; + u64 *prpl = (void*)ns->dma_buffer; + int first_page = 1; + for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { + if (first_page) { + /* First page is special */ + first_page = 0; + continue; + } + if (prpl_len >= NVME_MAX_PRPL_ENTRIES) + goto bounce; + 
prpl[prpl_len++] = base; } - if (nvme_add_prpl(ns, base)) - goto bounce; + return nvme_io_xfer(ns, lba, buf, prpl, count, write); } - void *prp2; - if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { - /* We need to describe more than 2 pages, rely on PRP List */ - prp2 = ns->prpl; - } else if ((ns->block_size * count) > NVME_PAGE_SIZE) { - /* Directly embed the 2nd page if we only need 2 pages */ - prp2 = (void *)(long)ns->prpl[0]; - } else { - /* One page is enough, don't expose anything else */ - prp2 = NULL; - } - return nvme_io_xfer(ns, lba, buf, prp2, count, write); + /* Directly embed the 2nd page if we only need 2 pages */ + if ((ns->block_size * count) > NVME_PAGE_SIZE) + return nvme_io_xfer(ns, lba, buf, buf + NVME_PAGE_SIZE, count, write); + +single: + /* One page is enough, don't expose anything else */ + return nvme_io_xfer(ns, lba, buf, NULL, count, write); bounce: /* Use bounce buffer to make transfer */ -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org
There is no need to create multiple dma bounce buffers as the BIOS disk code isn't reentrant-capable. Also, verify that the allocation succeeds. Signed-off-by: Kevin O'Connor <kevin@koconnor.net> Reviewed-by: Alexander Graf <graf@amazon.com> --- src/hw/nvme-int.h | 3 --- src/hw/nvme.c | 21 +++++++++++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/src/hw/nvme-int.h b/src/hw/nvme-int.h index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme-int.h +++ b/src/hw/nvme-int.h @@ -XXX,XX +XXX,XX @@ struct nvme_namespace { u32 block_size; u32 metadata_size; u32 max_req_size; - - /* Page aligned buffer of size NVME_PAGE_SIZE. */ - char *dma_buffer; }; /* Data structures for NVMe admin identify commands */ diff --git a/src/hw/nvme.c b/src/hw/nvme.c index XXXXXXX..XXXXXXX 100644 --- a/src/hw/nvme.c +++ b/src/hw/nvme.c @@ -XXX,XX +XXX,XX @@ #include "nvme.h" #include "nvme-int.h" +// Page aligned "dma bounce buffer" of size NVME_PAGE_SIZE in high memory +static void *nvme_dma_buffer; + static void * zalloc_page_aligned(struct zone_s *zone, u32 size) { @@ -XXX,XX +XXX,XX @@ nvme_probe_ns(struct nvme_ctrl *ctrl, u32 ns_idx, u8 mdts) goto free_buffer; } + if (!nvme_dma_buffer) { + nvme_dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE); + if (!nvme_dma_buffer) { + warn_noalloc(); + goto free_buffer; + } + } + struct nvme_namespace *ns = malloc_fseg(sizeof(*ns)); if (!ns) { warn_noalloc(); @@ -XXX,XX +XXX,XX @@ nvme_probe_ns(struct nvme_ctrl *ctrl, u32 ns_idx, u8 mdts) ns->max_req_size = -1U; } - ns->dma_buffer = zalloc_page_aligned(&ZoneHigh, NVME_PAGE_SIZE); - char *desc = znprintf(MAXDESCSIZE, "NVMe NS %u: %llu MiB (%llu %u-byte " "blocks + %u-byte metadata)", ns_id, (ns->lba_count * ns->block_size) >> 20, @@ -XXX,XX +XXX,XX @@ nvme_bounce_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, u16 blocks = count < max_blocks ? 
count : max_blocks; if (write) - memcpy(ns->dma_buffer, buf, blocks * ns->block_size); + memcpy(nvme_dma_buffer, buf, blocks * ns->block_size); - int res = nvme_io_xfer(ns, lba, ns->dma_buffer, NULL, blocks, write); + int res = nvme_io_xfer(ns, lba, nvme_dma_buffer, NULL, blocks, write); if (!write && res >= 0) - memcpy(buf, ns->dma_buffer, res * ns->block_size); + memcpy(buf, nvme_dma_buffer, res * ns->block_size); return res; } @@ -XXX,XX +XXX,XX @@ nvme_prpl_xfer(struct nvme_namespace *ns, u64 lba, void *buf, u16 count, /* Build PRP list if we need to describe more than 2 pages */ if ((ns->block_size * count) > (NVME_PAGE_SIZE * 2)) { u32 prpl_len = 0; - u64 *prpl = (void*)ns->dma_buffer; + u64 *prpl = nvme_dma_buffer; int first_page = 1; for (; size > 0; base += NVME_PAGE_SIZE, size -= NVME_PAGE_SIZE) { if (first_page) { -- 2.31.1 _______________________________________________ SeaBIOS mailing list -- seabios@seabios.org To unsubscribe send an email to seabios-leave@seabios.org