Allow virtio-scsi virtqueues to be assigned to different IOThreads. This
makes it possible to take advantage of host multi-queue block layer
scalability by assigning virtqueues that have affinity with vCPUs to
different IOThreads that have affinity with host CPUs. The same feature
was introduced for virtio-blk in the past:
https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping
Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an
Intel P4800X SSD:
iothreads IOPS
------------------------------
1 189576
2 312698
4 346744
Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
include/hw/virtio/virtio-scsi.h | 5 +-
hw/scsi/virtio-scsi-dataplane.c | 90 ++++++++++++++++++++++++---------
hw/scsi/virtio-scsi.c | 63 ++++++++++++++---------
3 files changed, 107 insertions(+), 51 deletions(-)
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 7b7e3ced7a..086201efa2 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -22,6 +22,7 @@
#include "hw/virtio/virtio.h"
#include "hw/scsi/scsi.h"
#include "chardev/char-fe.h"
+#include "qapi/qapi-types-virtio.h"
#include "system/iothread.h"
#define TYPE_VIRTIO_SCSI_COMMON "virtio-scsi-common"
@@ -60,6 +61,7 @@ struct VirtIOSCSIConf {
CharBackend chardev;
uint32_t boot_tpgt;
IOThread *iothread;
+ IOThreadVirtQueueMappingList *iothread_vq_mapping_list;
};
struct VirtIOSCSI;
@@ -97,7 +99,7 @@ struct VirtIOSCSI {
QTAILQ_HEAD(, VirtIOSCSIReq) tmf_bh_list;
/* Fields for dataplane below */
- AioContext *ctx; /* one iothread per virtio-scsi-pci for now */
+ AioContext **vq_aio_context; /* per-virtqueue AioContext pointer */
bool dataplane_started;
bool dataplane_starting;
@@ -115,6 +117,7 @@ void virtio_scsi_common_realize(DeviceState *dev,
void virtio_scsi_common_unrealize(DeviceState *dev);
void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp);
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s);
int virtio_scsi_dataplane_start(VirtIODevice *s);
void virtio_scsi_dataplane_stop(VirtIODevice *s);
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index f49ab98ecc..6bb368c8a5 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -18,6 +18,7 @@
#include "system/block-backend.h"
#include "hw/scsi/scsi.h"
#include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h"
/* Context: BQL held */
@@ -27,8 +28,16 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
VirtIODevice *vdev = VIRTIO_DEVICE(s);
BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+ uint16_t num_vqs = vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED;
- if (vs->conf.iothread) {
+ if (vs->conf.iothread && vs->conf.iothread_vq_mapping_list) {
+ error_setg(errp,
+ "iothread and iothread-vq-mapping properties cannot be set "
+ "at the same time");
+ return;
+ }
+
+ if (vs->conf.iothread || vs->conf.iothread_vq_mapping_list) {
if (!k->set_guest_notifiers || !k->ioeventfd_assign) {
error_setg(errp,
"device is incompatible with iothread "
@@ -39,15 +48,50 @@ void virtio_scsi_dataplane_setup(VirtIOSCSI *s, Error **errp)
error_setg(errp, "ioeventfd is required for iothread");
return;
}
- s->ctx = iothread_get_aio_context(vs->conf.iothread);
+ }
+
+ s->vq_aio_context = g_new(AioContext *, num_vqs);
+
+ if (vs->conf.iothread_vq_mapping_list) {
+ if (!iothread_vq_mapping_apply(vs->conf.iothread_vq_mapping_list,
+ s->vq_aio_context, num_vqs, errp)) {
+ g_free(s->vq_aio_context);
+ s->vq_aio_context = NULL;
+ return;
+ }
+ } else if (vs->conf.iothread) {
+ AioContext *ctx = iothread_get_aio_context(vs->conf.iothread);
+ for (uint16_t i = 0; i < num_vqs; i++) {
+ s->vq_aio_context[i] = ctx;
+ }
+
+ /* Released in virtio_scsi_dataplane_cleanup() */
+ object_ref(OBJECT(vs->conf.iothread));
} else {
- if (!virtio_device_ioeventfd_enabled(vdev)) {
- return;
+ AioContext *ctx = qemu_get_aio_context();
+ for (unsigned i = 0; i < num_vqs; i++) {
+ s->vq_aio_context[i] = ctx;
}
- s->ctx = qemu_get_aio_context();
}
}
+/* Context: BQL held */
+void virtio_scsi_dataplane_cleanup(VirtIOSCSI *s)
+{
+ VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+
+ if (vs->conf.iothread_vq_mapping_list) {
+ iothread_vq_mapping_cleanup(vs->conf.iothread_vq_mapping_list);
+ }
+
+ if (vs->conf.iothread) {
+ object_unref(OBJECT(vs->conf.iothread));
+ }
+
+ g_free(s->vq_aio_context);
+ s->vq_aio_context = NULL;
+}
+
static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
{
BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(s)));
@@ -66,31 +110,20 @@ static int virtio_scsi_set_host_notifier(VirtIOSCSI *s, VirtQueue *vq, int n)
}
/* Context: BH in IOThread */
-static void virtio_scsi_dataplane_stop_bh(void *opaque)
+static void virtio_scsi_dataplane_stop_vq_bh(void *opaque)
{
- VirtIOSCSI *s = opaque;
- VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(s);
+ AioContext *ctx = qemu_get_current_aio_context();
+ VirtQueue *vq = opaque;
EventNotifier *host_notifier;
- int i;
- virtio_queue_aio_detach_host_notifier(vs->ctrl_vq, s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->ctrl_vq);
+ virtio_queue_aio_detach_host_notifier(vq, ctx);
+ host_notifier = virtio_queue_get_host_notifier(vq);
/*
* Test and clear notifier after disabling event, in case poll callback
* didn't have time to run.
*/
virtio_queue_host_notifier_read(host_notifier);
-
- virtio_queue_aio_detach_host_notifier(vs->event_vq, s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->event_vq);
- virtio_queue_host_notifier_read(host_notifier);
-
- for (i = 0; i < vs->conf.num_queues; i++) {
- virtio_queue_aio_detach_host_notifier(vs->cmd_vqs[i], s->ctx);
- host_notifier = virtio_queue_get_host_notifier(vs->cmd_vqs[i]);
- virtio_queue_host_notifier_read(host_notifier);
- }
}
/* Context: BQL held */
@@ -154,11 +187,14 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev)
smp_wmb(); /* paired with aio_notify_accept() */
if (s->bus.drain_count == 0) {
- virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx);
- virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier(vs->ctrl_vq,
+ s->vq_aio_context[0]);
+ virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq,
+ s->vq_aio_context[1]);
for (i = 0; i < vs->conf.num_queues; i++) {
- virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx);
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+ virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], ctx);
}
}
return 0;
@@ -207,7 +243,11 @@ void virtio_scsi_dataplane_stop(VirtIODevice *vdev)
s->dataplane_stopping = true;
if (s->bus.drain_count == 0) {
- aio_wait_bh_oneshot(s->ctx, virtio_scsi_dataplane_stop_bh, s);
+ for (i = 0; i < vs->conf.num_queues + VIRTIO_SCSI_VQ_NUM_FIXED; i++) {
+ VirtQueue *vq = virtio_get_queue(&vs->parent_obj, i);
+ AioContext *ctx = s->vq_aio_context[i];
+ aio_wait_bh_oneshot(ctx, virtio_scsi_dataplane_stop_vq_bh, vq);
+ }
}
blk_drain_all(); /* ensure there are no in-flight requests */
diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
index 2045d27289..dabf8ace23 100644
--- a/hw/scsi/virtio-scsi.c
+++ b/hw/scsi/virtio-scsi.c
@@ -27,6 +27,7 @@
#include "hw/qdev-properties.h"
#include "hw/scsi/scsi.h"
#include "scsi/constants.h"
+#include "hw/virtio/iothread-vq-mapping.h"
#include "hw/virtio/virtio-bus.h"
#include "hw/virtio/virtio-access.h"
#include "trace.h"
@@ -318,13 +319,6 @@ static void virtio_scsi_cancel_notify(Notifier *notifier, void *data)
g_free(n);
}
-static inline void virtio_scsi_ctx_check(VirtIOSCSI *s, SCSIDevice *d)
-{
- if (s->dataplane_started && d && blk_is_available(d->conf.blk)) {
- assert(blk_get_aio_context(d->conf.blk) == s->ctx);
- }
-}
-
static void virtio_scsi_do_one_tmf_bh(VirtIOSCSIReq *req)
{
VirtIOSCSI *s = req->dev;
@@ -517,9 +511,11 @@ static void virtio_scsi_flush_defer_tmf_to_aio_context(VirtIOSCSI *s)
assert(!s->dataplane_started);
- if (s->ctx) {
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+ AioContext *ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+
/* Our BH only runs after previously scheduled BHs */
- aio_wait_bh_oneshot(s->ctx, dummy_bh, NULL);
+ aio_wait_bh_oneshot(ctx, dummy_bh, NULL);
}
}
@@ -575,7 +571,6 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
AioContext *ctx;
int ret = 0;
- virtio_scsi_ctx_check(s, d);
/* Here VIRTIO_SCSI_S_OK means "FUNCTION COMPLETE". */
req->resp.tmf.response = VIRTIO_SCSI_S_OK;
@@ -639,6 +634,8 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET:
case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: {
+ g_autoptr(GHashTable) aio_contexts = g_hash_table_new(NULL, NULL);
+
if (!d) {
goto fail;
}
@@ -648,8 +645,15 @@ static int virtio_scsi_do_tmf(VirtIOSCSI *s, VirtIOSCSIReq *req)
qatomic_inc(&req->remaining);
- ctx = s->ctx ?: qemu_get_aio_context();
- virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+ for (uint32_t i = 0; i < s->parent_obj.conf.num_queues; i++) {
+ ctx = s->vq_aio_context[VIRTIO_SCSI_VQ_NUM_FIXED + i];
+
+ if (!g_hash_table_add(aio_contexts, ctx)) {
+ continue; /* skip previously added AioContext */
+ }
+
+ virtio_scsi_defer_tmf_to_aio_context(req, ctx);
+ }
virtio_scsi_tmf_dec_remaining(req);
ret = -EINPROGRESS;
@@ -770,9 +774,12 @@ static void virtio_scsi_handle_ctrl_vq(VirtIOSCSI *s, VirtQueue *vq)
*/
static bool virtio_scsi_defer_to_dataplane(VirtIOSCSI *s)
{
- if (!s->ctx || s->dataplane_started) {
+ if (s->dataplane_started) {
return false;
}
+ if (s->vq_aio_context[0] == qemu_get_aio_context()) {
+ return false; /* not using IOThreads */
+ }
virtio_device_start_ioeventfd(&s->parent_obj.parent_obj);
return !s->dataplane_fenced;
@@ -946,7 +953,6 @@ static int virtio_scsi_handle_cmd_req_prepare(VirtIOSCSI *s, VirtIOSCSIReq *req)
virtio_scsi_complete_cmd_req(req);
return -ENOENT;
}
- virtio_scsi_ctx_check(s, d);
req->sreq = scsi_req_new(d, req->req.cmd.tag,
virtio_scsi_get_lun(req->req.cmd.lun),
req->req.cmd.cdb, vs->cdb_size, req);
@@ -1218,14 +1224,16 @@ static void virtio_scsi_hotplug(HotplugHandler *hotplug_dev, DeviceState *dev,
{
VirtIODevice *vdev = VIRTIO_DEVICE(hotplug_dev);
VirtIOSCSI *s = VIRTIO_SCSI(vdev);
+ AioContext *ctx = s->vq_aio_context[0];
SCSIDevice *sd = SCSI_DEVICE(dev);
- int ret;
- if (s->ctx && !s->dataplane_fenced) {
- ret = blk_set_aio_context(sd->conf.blk, s->ctx, errp);
- if (ret < 0) {
- return;
- }
+ if (ctx != qemu_get_aio_context() && !s->dataplane_fenced) {
+ /*
+ * Try to make the BlockBackend's AioContext match ours. Ignore failure
+ * because I/O will still work although block jobs and other users
+ * might be slower when multiple AioContexts use a BlockBackend.
+ */
+ blk_set_aio_context(sd->conf.blk, ctx, errp);
}
if (virtio_vdev_has_feature(vdev, VIRTIO_SCSI_F_HOTPLUG)) {
@@ -1260,7 +1268,7 @@ static void virtio_scsi_hotunplug(HotplugHandler *hotplug_dev, DeviceState *dev,
qdev_simple_device_unplug_cb(hotplug_dev, dev, errp);
- if (s->ctx) {
+ if (s->vq_aio_context[0] != qemu_get_aio_context()) {
/* If other users keep the BlockBackend in the iothread, that's ok */
blk_set_aio_context(sd->conf.blk, qemu_get_aio_context(), NULL);
}
@@ -1294,7 +1302,7 @@ static void virtio_scsi_drained_begin(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
- virtio_queue_aio_detach_host_notifier(vq, s->ctx);
+ virtio_queue_aio_detach_host_notifier(vq, s->vq_aio_context[i]);
}
}
@@ -1320,10 +1328,12 @@ static void virtio_scsi_drained_end(SCSIBus *bus)
for (uint32_t i = 0; i < total_queues; i++) {
VirtQueue *vq = virtio_get_queue(vdev, i);
+ AioContext *ctx = s->vq_aio_context[i];
+
if (vq == vs->event_vq) {
- virtio_queue_aio_attach_host_notifier_no_poll(vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier_no_poll(vq, ctx);
} else {
- virtio_queue_aio_attach_host_notifier(vq, s->ctx);
+ virtio_queue_aio_attach_host_notifier(vq, ctx);
}
}
}
@@ -1430,12 +1440,13 @@ void virtio_scsi_common_unrealize(DeviceState *dev)
virtio_cleanup(vdev);
}
+/* main loop */
static void virtio_scsi_device_unrealize(DeviceState *dev)
{
VirtIOSCSI *s = VIRTIO_SCSI(dev);
virtio_scsi_reset_tmf_bh(s);
-
+ virtio_scsi_dataplane_cleanup(s);
qbus_set_hotplug_handler(BUS(&s->bus), NULL);
virtio_scsi_common_unrealize(dev);
qemu_mutex_destroy(&s->tmf_bh_lock);
@@ -1460,6 +1471,8 @@ static const Property virtio_scsi_properties[] = {
VIRTIO_SCSI_F_CHANGE, true),
DEFINE_PROP_LINK("iothread", VirtIOSCSI, parent_obj.conf.iothread,
TYPE_IOTHREAD, IOThread *),
+ DEFINE_PROP_IOTHREAD_VQ_MAPPING_LIST("iothread-vq-mapping", VirtIOSCSI,
+ parent_obj.conf.iothread_vq_mapping_list),
};
static const VMStateDescription vmstate_virtio_scsi = {
--
2.48.1
Am 13.02.2025 um 19:00 hat Stefan Hajnoczi geschrieben: > Allow virtio-scsi virtqueues to be assigned to different IOThreads. This > makes it possible to take advantage of host multi-queue block layer > scalability by assigning virtqueues that have affinity with vCPUs to > different IOThreads that have affinity with host CPUs. The same feature > was introduced for virtio-blk in the past: > https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping > > Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an > Intel P4800X SSD: > iothreads IOPS > ------------------------------ > 1 189576 > 2 312698 > 4 346744 > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> As Peter already noted, the interface is a bit confusing in that it considers the control and event queue just normal queues like any other and you need to specify a mapping for them, too (even though you probably don't care about them). I wonder if it wouldn't be better to use the iothread-vq-mapping property only for command queues and to have separate properties for the event and control queue. I think this would be less surprising to users. It would also allow you to use the round robin allocation for command queues while using a different setting for the special queues - in particular, the event queue is currently no_poll, which disables polling for the whole AioContext, so you probably want to have it just anywhere else, but not in the iothreads you use for command queues. This should probably also be the default. Kevin
On Mon, Mar 10, 2025 at 15:33:02 +0100, Kevin Wolf wrote: > Am 13.02.2025 um 19:00 hat Stefan Hajnoczi geschrieben: > > Allow virtio-scsi virtqueues to be assigned to different IOThreads. This > > makes it possible to take advantage of host multi-queue block layer > > scalability by assigning virtqueues that have affinity with vCPUs to > > different IOThreads that have affinity with host CPUs. The same feature > > was introduced for virtio-blk in the past: > > https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping > > > > Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an > > Intel P4800X SSD: > > iothreads IOPS > > ------------------------------ > > 1 189576 > > 2 312698 > > 4 346744 > > > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > > As Peter already noted, the interface is a bit confusing in that it > considers the control and event queue just normal queues like any other > and you need to specify a mapping for them, too (even though you > probably don't care about them). > > I wonder if it wouldn't be better to use the iothread-vq-mapping > property only for command queues and to have separate properties for the > event and control queue. I think this would be less surprising to users. In the v2 of libvirt's patches I've proposed: <driver queues='3'> <iothreads> <iothread id='2'> <queue id='ctrl'/> <queue id='event'/> <queue id='1'/> </iothread> <iothread id='3'> <queue id='0'/> <queue id='2'/> </iothread> </iothreads> </driver> To map the queues by name explicitly so that it's clear what's happening. In my proposed it auto-translates ctrl and event into 0 and 1 and the command queues into N+2. > It would also allow you to use the round robin allocation for command > queues while using a different setting for the special queues - in > particular, the event queue is currently no_poll, which disables polling > for the whole AioContext, so you probably want to have it just anywhere > else, but not in the iothreads you use for command queues. This should > probably also be the default. This sounds like an important bit of information. If that stays like this I think libvirt should also document this. The proposed libvirt patch also words the recommendation to use the round-robin approach unless specific needs arise so if qemu did the correct thing here it would be great.
Am 10.03.2025 um 15:37 hat Peter Krempa geschrieben: > On Mon, Mar 10, 2025 at 15:33:02 +0100, Kevin Wolf wrote: > > Am 13.02.2025 um 19:00 hat Stefan Hajnoczi geschrieben: > > > Allow virtio-scsi virtqueues to be assigned to different IOThreads. This > > > makes it possible to take advantage of host multi-queue block layer > > > scalability by assigning virtqueues that have affinity with vCPUs to > > > different IOThreads that have affinity with host CPUs. The same feature > > > was introduced for virtio-blk in the past: > > > https://developers.redhat.com/articles/2024/09/05/scaling-virtio-blk-disk-io-iothread-virtqueue-mapping > > > > > > Here are fio randread 4k iodepth=64 results from a 4 vCPU guest with an > > > Intel P4800X SSD: > > > iothreads IOPS > > > ------------------------------ > > > 1 189576 > > > 2 312698 > > > 4 346744 > > > > > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com> > > > > As Peter already noted, the interface is a bit confusing in that it > > considers the control and event queue just normal queues like any other > > and you need to specify a mapping for them, too (even though you > > probably don't care about them). > > > > I wonder if it wouldn't be better to use the iothread-vq-mapping > > property only for command queues and to have separate properties for the > > event and control queue. I think this would be less surprising to users. > > In the v2 of libvirt's patches I've proposed: > > <driver queues='3'> > <iothreads> > <iothread id='2'> > <queue id='ctrl'/> > <queue id='event'/> > <queue id='1'/> > </iothread> > <iothread id='3'> > <queue id='0'/> > <queue id='2'/> > </iothread> > </iothreads> > </driver> > > To map the queues by name explicitly so that it's clear what's > happening. > > In my proposed it auto-translates ctrl and event into 0 and 1 and the > command queues into N+2. Note that if I understand patch 12 correctly, the 'ctrl' queue setting will never actually take effect. So libvirt probably shouln't even offer it (and neither should QEMU). > > It would also allow you to use the round robin allocation for command > > queues while using a different setting for the special queues - in > > particular, the event queue is currently no_poll, which disables polling > > for the whole AioContext, so you probably want to have it just anywhere > > else, but not in the iothreads you use for command queues. This should > > probably also be the default. > > This sounds like an important bit of information. If that stays like > this I think libvirt should also document this. > > The proposed libvirt patch also words the recommendation to use the > round-robin approach unless specific needs arise so if qemu did the > correct thing here it would be great. Yes, I consider this a QEMU bug that should be fixed. It's no_poll not in the sense that we must use the eventfd because we can't otherwise figure out if it's ready, but that we don't usually care about new things being ready in the queue. But if we always tie the event queue to the main loop, too, it would already be worked around for most cases - the main loop generally won't be able to poll anyway because of other fd handlers that don't support polling. Kevin
© 2016 - 2025 Red Hat, Inc.