hw/nvme/ctrl.c | 5 +++++
1 file changed, 5 insertions(+)
From: Keith Busch <kbusch@kernel.org>
The emulated device had let the user set whatever max transfer size
they wanted, including no limit. However the device does have an
internal limit of 1024 segments. NVMe doesn't report max segments,
though. This is implicitly inferred based on the MDTS and MPSMIN values.
IOV_MAX is currently 1024 which 4k PRPs can exceed with 2MB transfers.
Don't allow MDTS values that can exceed this, otherwise users risk
seeing "internal error" status to their otherwise protocol compliant
commands.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
hw/nvme/ctrl.c | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e764ec7683..5bfb773b5a 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8335,6 +8335,11 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp)
host_memory_backend_set_mapped(n->pmr.dev, true);
}
+ if (!n->params.mdts || ((1 << n->params.mdts) + 1) > IOV_MAX) {
+ error_setg(errp, "mdts exceeds IOV_MAX");
+ return false;
+ }
+
if (n->params.zasl > n->params.mdts) {
error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
"than or equal to mdts (Maximum Data Transfer Size)");
--
2.47.3
On Aug 1 07:24, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> The emulated device had let the user set whatever max transfer size
> they wanted, including no limit. However the device does have an
> internal limit of 1024 segments. NVMe doesn't report max segments,
> though. This is implicitly inferred based on the MDTS and MPSMIN values.
>
> IOV_MAX is currently 1024 which 4k PRPs can exceed with 2MB transfers.
> Don't allow MDTS values that can exceed this, otherwise users risk
> seeing "internal error" status to their otherwise protocol compliant
> commands.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> hw/nvme/ctrl.c | 5 +++++
> 1 file changed, 5 insertions(+)
>
> diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> index e764ec7683..5bfb773b5a 100644
> --- a/hw/nvme/ctrl.c
> +++ b/hw/nvme/ctrl.c
> @@ -8335,6 +8335,11 @@ static bool nvme_check_params(NvmeCtrl *n, Error **errp)
> host_memory_backend_set_mapped(n->pmr.dev, true);
> }
>
> + if (!n->params.mdts || ((1 << n->params.mdts) + 1) > IOV_MAX) {
> + error_setg(errp, "mdts exceeds IOV_MAX");
> + return false;
> + }
> +
> if (n->params.zasl > n->params.mdts) {
> error_setg(errp, "zoned.zasl (Zone Append Size Limit) must be less "
> "than or equal to mdts (Maximum Data Transfer Size)");
> --
> 2.47.3
>
>
Hi Keith,
Looks good.
The dma helpers can actually deal with this, given a relatively simple
patch:
diff --git i/system/dma-helpers.c w/system/dma-helpers.c
index 6bad75876f11..fd169237efb2 100644
--- i/system/dma-helpers.c
+++ w/system/dma-helpers.c
@@ -158,6 +158,11 @@ static void dma_blk_cb(void *opaque, int ret)
}
if (!mem)
break;
+
+ if (dbs->iov.niov == IOV_MAX) {
+ break;
+ }
+
qemu_iovec_add(&dbs->iov, mem, cur_len);
dbs->sg_cur_byte += cur_len;
if (dbs->sg_cur_byte == dbs->sg->sg[dbs->sg_cur_index].len) {
In hw/nvme/ctrl.c the checks can then be dropped:
diff --git i/hw/nvme/ctrl.c w/hw/nvme/ctrl.c
index 518d02dc6670..4d8f678cfda5 100644
--- i/hw/nvme/ctrl.c
+++ w/hw/nvme/ctrl.c
@@ -849,10 +849,6 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
return NVME_INVALID_USE_OF_CMB | NVME_DNR;
}
- if (sg->iov.niov + 1 > IOV_MAX) {
- goto max_mappings_exceeded;
- }
-
if (cmb) {
return nvme_map_addr_cmb(n, &sg->iov, addr, len);
} else {
@@ -864,10 +860,6 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
return NVME_INVALID_USE_OF_CMB | NVME_DNR;
}
- if (sg->qsg.nsg + 1 > IOV_MAX) {
- goto max_mappings_exceeded;
- }
-
qemu_sglist_add(&sg->qsg, addr, len);
return NVME_SUCCESS;
This allows >2MB transfers with PRPs. However, it is not a clean fix
since it does not deal with the issue that remains for the CMB (which
uses the blk_aio API directly).
I'll queue up your patch for now. Thanks!
On Mon, Aug 04, 2025 at 11:44:10AM +0200, Klaus Jensen wrote:
> On Aug 1 07:24, Keith Busch wrote:
>
> This allows >2MB transfers with PRPs. However, it is not a clean fix
> since it does not deal with the issue that remains for the CMB (which
> uses the blk_aio API directly).
>
> I'll queue up your patch for now. Thanks!
Thanks, sounds okay for the near term.
Here's another idea to merge contiguous PRPs into a single segment. This
should get around that IOV_MAX limitation for Linux because that OS
wouldn't send such a large IO if it wasn't mostly physically contiguous:
---
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index e764ec7683..83fb1385d1 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -827,6 +827,7 @@ static uint16_t nvme_map_addr_pmr(NvmeCtrl *n, QEMUIOVector *iov, hwaddr addr,
static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
{
bool cmb = false, pmr = false;
+ QEMUSGList *qsg;
if (!len) {
return NVME_SUCCESS;
@@ -864,6 +865,13 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, NvmeSg *sg, hwaddr addr, size_t len)
return NVME_INVALID_USE_OF_CMB | NVME_DNR;
}
+ qsg = &sg->qsg;
+ if (qsg->sg[qsg->nsg - 1].base + qsg->sg[qsg->nsg - 1].len == addr) {
+ qsg->sg[qsg->nsg].len += len;
+ qsg->size += len;
+ return NVME_SUCCESS;
+ }
+
if (sg->qsg.nsg + 1 > IOV_MAX) {
goto max_mappings_exceeded;
}
--
© 2016 - 2025 Red Hat, Inc.