[v1] scsi: eliminate AioContext lock

[PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Stefan Hajnoczi 2 years, 2 months ago

Stop depending on the AioContext lock and instead access
SCSIDevice->requests from only one thread at a time:
- When the VM is running only the BlockBackend's AioContext may access
  the requests list.
- When the VM is stopped only the main loop may access the requests
  list.

These constraints protect the requests list without the need for locking
in the I/O code path.

Note that multiple IOThreads are not supported yet because the code
assumes all SCSIRequests are executed from a single AioContext. Leave
that as future work.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
 include/hw/scsi/scsi.h |   7 +-
 hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
 2 files changed, 124 insertions(+), 57 deletions(-)

diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
index 3692ca82f3..10c4e8288d 100644
--- a/include/hw/scsi/scsi.h
+++ b/include/hw/scsi/scsi.h
@@ -69,14 +69,19 @@ struct SCSIDevice
 {
     DeviceState qdev;
     VMChangeStateEntry *vmsentry;
-    QEMUBH *bh;
     uint32_t id;
     BlockConf conf;
     SCSISense unit_attention;
     bool sense_is_ua;
     uint8_t sense[SCSI_SENSE_BUF_SIZE];
     uint32_t sense_len;
+
+    /*
+     * The requests list is only accessed from the AioContext that executes
+     * requests or from the main loop when IOThread processing is stopped.
+     */
     QTAILQ_HEAD(, SCSIRequest) requests;
+
     uint32_t channel;
     uint32_t lun;
     int blocksize;
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index fc4b77fdb0..b8bfde9565 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -85,6 +85,82 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
     return d;
 }
 
+/*
+ * Invoke @fn() for each enqueued request in device @s. Must be called from the
+ * main loop thread while the guest is stopped. This is only suitable for
+ * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
+ */
+static void scsi_device_for_each_req_sync(SCSIDevice *s,
+                                          void (*fn)(SCSIRequest *, void *),
+                                          void *opaque)
+{
+    SCSIRequest *req;
+    SCSIRequest *next_req;
+
+    assert(!runstate_is_running());
+    assert(qemu_in_main_thread());
+
+    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
+        fn(req, opaque);
+    }
+}
+
+typedef struct {
+    SCSIDevice *s;
+    void (*fn)(SCSIRequest *, void *);
+    void *fn_opaque;
+} SCSIDeviceForEachReqAsyncData;
+
+static void scsi_device_for_each_req_async_bh(void *opaque)
+{
+    g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
+    SCSIDevice *s = data->s;
+    SCSIRequest *req;
+    SCSIRequest *next;
+
+    /*
+     * It is unlikely that the AioContext will change before this BH is called,
+     * but if it happens then ->requests must not be accessed from this
+     * AioContext.
+     */
+    if (blk_get_aio_context(s->conf.blk) == qemu_get_current_aio_context()) {
+        QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
+            data->fn(req, data->fn_opaque);
+        }
+    }
+
+    /* Drop the reference taken by scsi_device_for_each_req_async() */
+    object_unref(OBJECT(s));
+}
+
+/*
+ * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
+ * runs in the AioContext that is executing the request.
+ */
+static void scsi_device_for_each_req_async(SCSIDevice *s,
+                                           void (*fn)(SCSIRequest *, void *),
+                                           void *opaque)
+{
+    assert(qemu_in_main_thread());
+
+    SCSIDeviceForEachReqAsyncData *data =
+        g_new(SCSIDeviceForEachReqAsyncData, 1);
+
+    data->s = s;
+    data->fn = fn;
+    data->fn_opaque = opaque;
+
+    /*
+     * Hold a reference to the SCSIDevice until
+     * scsi_device_for_each_req_async_bh() finishes.
+     */
+    object_ref(OBJECT(s));
+
+    aio_bh_schedule_oneshot(blk_get_aio_context(s->conf.blk),
+                            scsi_device_for_each_req_async_bh,
+                            data);
+}
+
 static void scsi_device_realize(SCSIDevice *s, Error **errp)
 {
     SCSIDeviceClass *sc = SCSI_DEVICE_GET_CLASS(s);
@@ -144,20 +220,18 @@ void scsi_bus_init_named(SCSIBus *bus, size_t bus_size, DeviceState *host,
     qbus_set_bus_hotplug_handler(BUS(bus));
 }
 
-static void scsi_dma_restart_bh(void *opaque)
+void scsi_req_retry(SCSIRequest *req)
 {
-    SCSIDevice *s = opaque;
-    SCSIRequest *req, *next;
+    req->retry = true;
+}
 
-    qemu_bh_delete(s->bh);
-    s->bh = NULL;
-
-    aio_context_acquire(blk_get_aio_context(s->conf.blk));
-    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
-        scsi_req_ref(req);
-        if (req->retry) {
-            req->retry = false;
-            switch (req->cmd.mode) {
+/* Called in the AioContext that is executing the request */
+static void scsi_dma_restart_req(SCSIRequest *req, void *opaque)
+{
+    scsi_req_ref(req);
+    if (req->retry) {
+        req->retry = false;
+        switch (req->cmd.mode) {
             case SCSI_XFER_FROM_DEV:
             case SCSI_XFER_TO_DEV:
                 scsi_req_continue(req);
@@ -166,37 +240,22 @@ static void scsi_dma_restart_bh(void *opaque)
                 scsi_req_dequeue(req);
                 scsi_req_enqueue(req);
                 break;
-            }
         }
-        scsi_req_unref(req);
     }
-    aio_context_release(blk_get_aio_context(s->conf.blk));
-    /* Drop the reference that was acquired in scsi_dma_restart_cb */
-    object_unref(OBJECT(s));
-}
-
-void scsi_req_retry(SCSIRequest *req)
-{
-    /* No need to save a reference, because scsi_dma_restart_bh just
-     * looks at the request list.  */
-    req->retry = true;
+    scsi_req_unref(req);
 }
 
 static void scsi_dma_restart_cb(void *opaque, bool running, RunState state)
 {
     SCSIDevice *s = opaque;
 
+    assert(qemu_in_main_thread());
+
     if (!running) {
         return;
     }
-    if (!s->bh) {
-        AioContext *ctx = blk_get_aio_context(s->conf.blk);
-        /* The reference is dropped in scsi_dma_restart_bh.*/
-        object_ref(OBJECT(s));
-        s->bh = aio_bh_new_guarded(ctx, scsi_dma_restart_bh, s,
-                                   &DEVICE(s)->mem_reentrancy_guard);
-        qemu_bh_schedule(s->bh);
-    }
+
+    scsi_device_for_each_req_async(s, scsi_dma_restart_req, NULL);
 }
 
 static bool scsi_bus_is_address_free(SCSIBus *bus,
@@ -1657,15 +1716,16 @@ void scsi_device_set_ua(SCSIDevice *sdev, SCSISense sense)
     }
 }
 
+static void scsi_device_purge_one_req(SCSIRequest *req, void *opaque)
+{
+    scsi_req_cancel_async(req, NULL);
+}
+
 void scsi_device_purge_requests(SCSIDevice *sdev, SCSISense sense)
 {
-    SCSIRequest *req;
+    scsi_device_for_each_req_async(sdev, scsi_device_purge_one_req, NULL);
 
     aio_context_acquire(blk_get_aio_context(sdev->conf.blk));
-    while (!QTAILQ_EMPTY(&sdev->requests)) {
-        req = QTAILQ_FIRST(&sdev->requests);
-        scsi_req_cancel_async(req, NULL);
-    }
     blk_drain(sdev->conf.blk);
     aio_context_release(blk_get_aio_context(sdev->conf.blk));
     scsi_device_set_ua(sdev, sense);
@@ -1737,31 +1797,33 @@ static char *scsibus_get_fw_dev_path(DeviceState *dev)
 
 /* SCSI request list.  For simplicity, pv points to the whole device */
 
+static void put_scsi_req(SCSIRequest *req, void *opaque)
+{
+    QEMUFile *f = opaque;
+
+    assert(!req->io_canceled);
+    assert(req->status == -1 && req->host_status == -1);
+    assert(req->enqueued);
+
+    qemu_put_sbyte(f, req->retry ? 1 : 2);
+    qemu_put_buffer(f, req->cmd.buf, sizeof(req->cmd.buf));
+    qemu_put_be32s(f, &req->tag);
+    qemu_put_be32s(f, &req->lun);
+    if (req->bus->info->save_request) {
+        req->bus->info->save_request(f, req);
+    }
+    if (req->ops->save_request) {
+        req->ops->save_request(f, req);
+    }
+}
+
 static int put_scsi_requests(QEMUFile *f, void *pv, size_t size,
                              const VMStateField *field, JSONWriter *vmdesc)
 {
     SCSIDevice *s = pv;
-    SCSIBus *bus = DO_UPCAST(SCSIBus, qbus, s->qdev.parent_bus);
-    SCSIRequest *req;
 
-    QTAILQ_FOREACH(req, &s->requests, next) {
-        assert(!req->io_canceled);
-        assert(req->status == -1 && req->host_status == -1);
-        assert(req->enqueued);
-
-        qemu_put_sbyte(f, req->retry ? 1 : 2);
-        qemu_put_buffer(f, req->cmd.buf, sizeof(req->cmd.buf));
-        qemu_put_be32s(f, &req->tag);
-        qemu_put_be32s(f, &req->lun);
-        if (bus->info->save_request) {
-            bus->info->save_request(f, req);
-        }
-        if (req->ops->save_request) {
-            req->ops->save_request(f, req);
-        }
-    }
+    scsi_device_for_each_req_sync(s, put_scsi_req, f);
     qemu_put_sbyte(f, 0);
-
     return 0;
 }
 
-- 
2.42.0

Re: [PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Kevin Wolf 2 years, 2 months ago

Am 23.11.2023 um 20:49 hat Stefan Hajnoczi geschrieben:
> Stop depending on the AioContext lock and instead access
> SCSIDevice->requests from only one thread at a time:
> - When the VM is running only the BlockBackend's AioContext may access
>   the requests list.
> - When the VM is stopped only the main loop may access the requests
>   list.
> 
> These constraints protect the requests list without the need for locking
> in the I/O code path.
> 
> Note that multiple IOThreads are not supported yet because the code
> assumes all SCSIRequests are executed from a single AioContext. Leave
> that as future work.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  include/hw/scsi/scsi.h |   7 +-
>  hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
>  2 files changed, 124 insertions(+), 57 deletions(-)
> 
> diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
> index 3692ca82f3..10c4e8288d 100644
> --- a/include/hw/scsi/scsi.h
> +++ b/include/hw/scsi/scsi.h
> @@ -69,14 +69,19 @@ struct SCSIDevice
>  {
>      DeviceState qdev;
>      VMChangeStateEntry *vmsentry;
> -    QEMUBH *bh;
>      uint32_t id;
>      BlockConf conf;
>      SCSISense unit_attention;
>      bool sense_is_ua;
>      uint8_t sense[SCSI_SENSE_BUF_SIZE];
>      uint32_t sense_len;
> +
> +    /*
> +     * The requests list is only accessed from the AioContext that executes
> +     * requests or from the main loop when IOThread processing is stopped.
> +     */
>      QTAILQ_HEAD(, SCSIRequest) requests;
> +
>      uint32_t channel;
>      uint32_t lun;
>      int blocksize;
> diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
> index fc4b77fdb0..b8bfde9565 100644
> --- a/hw/scsi/scsi-bus.c
> +++ b/hw/scsi/scsi-bus.c
> @@ -85,6 +85,82 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
>      return d;
>  }
>  
> +/*
> + * Invoke @fn() for each enqueued request in device @s. Must be called from the
> + * main loop thread while the guest is stopped. This is only suitable for
> + * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
> + */
> +static void scsi_device_for_each_req_sync(SCSIDevice *s,
> +                                          void (*fn)(SCSIRequest *, void *),
> +                                          void *opaque)
> +{
> +    SCSIRequest *req;
> +    SCSIRequest *next_req;
> +
> +    assert(!runstate_is_running());
> +    assert(qemu_in_main_thread());
> +
> +    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
> +        fn(req, opaque);
> +    }
> +}
> +
> +typedef struct {
> +    SCSIDevice *s;
> +    void (*fn)(SCSIRequest *, void *);
> +    void *fn_opaque;
> +} SCSIDeviceForEachReqAsyncData;
> +
> +static void scsi_device_for_each_req_async_bh(void *opaque)
> +{
> +    g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
> +    SCSIDevice *s = data->s;
> +    SCSIRequest *req;
> +    SCSIRequest *next;
> +
> +    /*
> +     * It is unlikely that the AioContext will change before this BH is called,
> +     * but if it happens then ->requests must not be accessed from this
> +     * AioContext.
> +     */

What is the scenario where this happens? I would have expected that
switching the AioContext of a node involves draining the node first,
which would execute this BH before the context changes.

The other option I see is an empty BlockBackend, which can change its
AioContext without polling BHs, but in that case there is no connection
to other users, so the only change could come from virtio-scsi itself.
If there is such a case, it would probably be helpful to be specific in
the comment.

> +    if (blk_get_aio_context(s->conf.blk) == qemu_get_current_aio_context()) {
> +        QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
> +            data->fn(req, data->fn_opaque);
> +        }
> +    }

Of course, if the situation does happen, the question is why just doing
nothing is correct. Wouldn't that mean that the guest still sees stuck
requests?

Would rescheduling the BH in the new context be better?

> +
> +    /* Drop the reference taken by scsi_device_for_each_req_async() */
> +    object_unref(OBJECT(s));
> +}
> +
> +/*
> + * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
> + * runs in the AioContext that is executing the request.
> + */
> +static void scsi_device_for_each_req_async(SCSIDevice *s,
> +                                           void (*fn)(SCSIRequest *, void *),
> +                                           void *opaque)

If we keep the behaviour above (doesn't do anything if the AioContext
changes), then I think it needs to be documented for this function and
callers should be explicit about why it's okay.

Kevin

Re: [PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Stefan Hajnoczi 2 years, 2 months ago

On Fri, Dec 01, 2023 at 05:03:13PM +0100, Kevin Wolf wrote:
> Am 23.11.2023 um 20:49 hat Stefan Hajnoczi geschrieben:
> > Stop depending on the AioContext lock and instead access
> > SCSIDevice->requests from only one thread at a time:
> > - When the VM is running only the BlockBackend's AioContext may access
> >   the requests list.
> > - When the VM is stopped only the main loop may access the requests
> >   list.
> > 
> > These constraints protect the requests list without the need for locking
> > in the I/O code path.
> > 
> > Note that multiple IOThreads are not supported yet because the code
> > assumes all SCSIRequests are executed from a single AioContext. Leave
> > that as future work.
> > 
> > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > ---
> >  include/hw/scsi/scsi.h |   7 +-
> >  hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
> >  2 files changed, 124 insertions(+), 57 deletions(-)
> > 
> > diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
> > index 3692ca82f3..10c4e8288d 100644
> > --- a/include/hw/scsi/scsi.h
> > +++ b/include/hw/scsi/scsi.h
> > @@ -69,14 +69,19 @@ struct SCSIDevice
> >  {
> >      DeviceState qdev;
> >      VMChangeStateEntry *vmsentry;
> > -    QEMUBH *bh;
> >      uint32_t id;
> >      BlockConf conf;
> >      SCSISense unit_attention;
> >      bool sense_is_ua;
> >      uint8_t sense[SCSI_SENSE_BUF_SIZE];
> >      uint32_t sense_len;
> > +
> > +    /*
> > +     * The requests list is only accessed from the AioContext that executes
> > +     * requests or from the main loop when IOThread processing is stopped.
> > +     */
> >      QTAILQ_HEAD(, SCSIRequest) requests;
> > +
> >      uint32_t channel;
> >      uint32_t lun;
> >      int blocksize;
> > diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
> > index fc4b77fdb0..b8bfde9565 100644
> > --- a/hw/scsi/scsi-bus.c
> > +++ b/hw/scsi/scsi-bus.c
> > @@ -85,6 +85,82 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
> >      return d;
> >  }
> >  
> > +/*
> > + * Invoke @fn() for each enqueued request in device @s. Must be called from the
> > + * main loop thread while the guest is stopped. This is only suitable for
> > + * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
> > + */
> > +static void scsi_device_for_each_req_sync(SCSIDevice *s,
> > +                                          void (*fn)(SCSIRequest *, void *),
> > +                                          void *opaque)
> > +{
> > +    SCSIRequest *req;
> > +    SCSIRequest *next_req;
> > +
> > +    assert(!runstate_is_running());
> > +    assert(qemu_in_main_thread());
> > +
> > +    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
> > +        fn(req, opaque);
> > +    }
> > +}
> > +
> > +typedef struct {
> > +    SCSIDevice *s;
> > +    void (*fn)(SCSIRequest *, void *);
> > +    void *fn_opaque;
> > +} SCSIDeviceForEachReqAsyncData;
> > +
> > +static void scsi_device_for_each_req_async_bh(void *opaque)
> > +{
> > +    g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
> > +    SCSIDevice *s = data->s;
> > +    SCSIRequest *req;
> > +    SCSIRequest *next;
> > +
> > +    /*
> > +     * It is unlikely that the AioContext will change before this BH is called,
> > +     * but if it happens then ->requests must not be accessed from this
> > +     * AioContext.
> > +     */
> 
> What is the scenario where this happens? I would have expected that
> switching the AioContext of a node involves draining the node first,
> which would execute this BH before the context changes.

I don't think aio_poll() is invoked by bdrv_drained_begin() when there
are no requests in flight. In that case the BH could remain pending
across bdrv_drained_begin()/bdrv_drained_end().

> The other option I see is an empty BlockBackend, which can change its
> AioContext without polling BHs, but in that case there is no connection
> to other users, so the only change could come from virtio-scsi itself.
> If there is such a case, it would probably be helpful to be specific in
> the comment.
>
> > +    if (blk_get_aio_context(s->conf.blk) == qemu_get_current_aio_context()) {
> > +        QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
> > +            data->fn(req, data->fn_opaque);
> > +        }
> > +    }
> 
> Of course, if the situation does happen, the question is why just doing
> nothing is correct. Wouldn't that mean that the guest still sees stuck
> requests?
> 
> Would rescheduling the BH in the new context be better?

In the case where there are no requests it is correct to do nothing, but
it's not a general solution.

> > +
> > +    /* Drop the reference taken by scsi_device_for_each_req_async() */
> > +    object_unref(OBJECT(s));
> > +}
> > +
> > +/*
> > + * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
> > + * runs in the AioContext that is executing the request.
> > + */
> > +static void scsi_device_for_each_req_async(SCSIDevice *s,
> > +                                           void (*fn)(SCSIRequest *, void *),
> > +                                           void *opaque)
> 
> If we keep the behaviour above (doesn't do anything if the AioContext
> changes), then I think it needs to be documented for this function and
> callers should be explicit about why it's okay.

I think your suggestion to reschedule in the new AioContext is best.

Stefan

Re: [PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Kevin Wolf 2 years, 2 months ago

Am 04.12.2023 um 17:30 hat Stefan Hajnoczi geschrieben:
> On Fri, Dec 01, 2023 at 05:03:13PM +0100, Kevin Wolf wrote:
> > Am 23.11.2023 um 20:49 hat Stefan Hajnoczi geschrieben:
> > > Stop depending on the AioContext lock and instead access
> > > SCSIDevice->requests from only one thread at a time:
> > > - When the VM is running only the BlockBackend's AioContext may access
> > >   the requests list.
> > > - When the VM is stopped only the main loop may access the requests
> > >   list.
> > > 
> > > These constraints protect the requests list without the need for locking
> > > in the I/O code path.
> > > 
> > > Note that multiple IOThreads are not supported yet because the code
> > > assumes all SCSIRequests are executed from a single AioContext. Leave
> > > that as future work.
> > > 
> > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > ---
> > >  include/hw/scsi/scsi.h |   7 +-
> > >  hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
> > >  2 files changed, 124 insertions(+), 57 deletions(-)
> > > 
> > > diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
> > > index 3692ca82f3..10c4e8288d 100644
> > > --- a/include/hw/scsi/scsi.h
> > > +++ b/include/hw/scsi/scsi.h
> > > @@ -69,14 +69,19 @@ struct SCSIDevice
> > >  {
> > >      DeviceState qdev;
> > >      VMChangeStateEntry *vmsentry;
> > > -    QEMUBH *bh;
> > >      uint32_t id;
> > >      BlockConf conf;
> > >      SCSISense unit_attention;
> > >      bool sense_is_ua;
> > >      uint8_t sense[SCSI_SENSE_BUF_SIZE];
> > >      uint32_t sense_len;
> > > +
> > > +    /*
> > > +     * The requests list is only accessed from the AioContext that executes
> > > +     * requests or from the main loop when IOThread processing is stopped.
> > > +     */
> > >      QTAILQ_HEAD(, SCSIRequest) requests;
> > > +
> > >      uint32_t channel;
> > >      uint32_t lun;
> > >      int blocksize;
> > > diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
> > > index fc4b77fdb0..b8bfde9565 100644
> > > --- a/hw/scsi/scsi-bus.c
> > > +++ b/hw/scsi/scsi-bus.c
> > > @@ -85,6 +85,82 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
> > >      return d;
> > >  }
> > >  
> > > +/*
> > > + * Invoke @fn() for each enqueued request in device @s. Must be called from the
> > > + * main loop thread while the guest is stopped. This is only suitable for
> > > + * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
> > > + */
> > > +static void scsi_device_for_each_req_sync(SCSIDevice *s,
> > > +                                          void (*fn)(SCSIRequest *, void *),
> > > +                                          void *opaque)
> > > +{
> > > +    SCSIRequest *req;
> > > +    SCSIRequest *next_req;
> > > +
> > > +    assert(!runstate_is_running());
> > > +    assert(qemu_in_main_thread());
> > > +
> > > +    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
> > > +        fn(req, opaque);
> > > +    }
> > > +}
> > > +
> > > +typedef struct {
> > > +    SCSIDevice *s;
> > > +    void (*fn)(SCSIRequest *, void *);
> > > +    void *fn_opaque;
> > > +} SCSIDeviceForEachReqAsyncData;
> > > +
> > > +static void scsi_device_for_each_req_async_bh(void *opaque)
> > > +{
> > > +    g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
> > > +    SCSIDevice *s = data->s;
> > > +    SCSIRequest *req;
> > > +    SCSIRequest *next;
> > > +
> > > +    /*
> > > +     * It is unlikely that the AioContext will change before this BH is called,
> > > +     * but if it happens then ->requests must not be accessed from this
> > > +     * AioContext.
> > > +     */
> > 
> > What is the scenario where this happens? I would have expected that
> > switching the AioContext of a node involves draining the node first,
> > which would execute this BH before the context changes.
> 
> I don't think aio_poll() is invoked by bdrv_drained_begin() when there
> are no requests in flight. In that case the BH could remain pending
> across bdrv_drained_begin()/bdrv_drained_end().

Hm, I wonder if that is actually a bug. Without executing pending BHs,
you can't be sure that nothing touches the node any more.

Before commit 5e8ac217, we always polled at least once, though I think
that was an unintentional side effect.

> > The other option I see is an empty BlockBackend, which can change its
> > AioContext without polling BHs, but in that case there is no connection
> > to other users, so the only change could come from virtio-scsi itself.
> > If there is such a case, it would probably be helpful to be specific in
> > the comment.
> >
> > > +    if (blk_get_aio_context(s->conf.blk) == qemu_get_current_aio_context()) {
> > > +        QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
> > > +            data->fn(req, data->fn_opaque);
> > > +        }
> > > +    }
> > 
> > Of course, if the situation does happen, the question is why just doing
> > nothing is correct. Wouldn't that mean that the guest still sees stuck
> > requests?
> > 
> > Would rescheduling the BH in the new context be better?
> 
> In the case where there are no requests it is correct to do nothing,
> but it's not a general solution.

Why is it correct when there are no requests? I can see this for
scsi_device_purge_requests() because it only cancels in-flight requests,
but scsi_dma_restart_cb() is about requests queued at the device level
that are not in flight in the block layer. Not restarting them if there
aren't any other requests in flight seems wrong.

> > > +
> > > +    /* Drop the reference taken by scsi_device_for_each_req_async() */
> > > +    object_unref(OBJECT(s));
> > > +}
> > > +
> > > +/*
> > > + * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
> > > + * runs in the AioContext that is executing the request.
> > > + */
> > > +static void scsi_device_for_each_req_async(SCSIDevice *s,
> > > +                                           void (*fn)(SCSIRequest *, void *),
> > > +                                           void *opaque)
> > 
> > If we keep the behaviour above (doesn't do anything if the AioContext
> > changes), then I think it needs to be documented for this function and
> > callers should be explicit about why it's okay.
> 
> I think your suggestion to reschedule in the new AioContext is best.

Ok, then the answer for the above is less important.

Kevin

Re: [PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Stefan Hajnoczi 2 years, 2 months ago

On Tue, 5 Dec 2023 at 05:01, Kevin Wolf <kwolf@redhat.com> wrote:
>
> Am 04.12.2023 um 17:30 hat Stefan Hajnoczi geschrieben:
> > On Fri, Dec 01, 2023 at 05:03:13PM +0100, Kevin Wolf wrote:
> > > Am 23.11.2023 um 20:49 hat Stefan Hajnoczi geschrieben:
> > > > Stop depending on the AioContext lock and instead access
> > > > SCSIDevice->requests from only one thread at a time:
> > > > - When the VM is running only the BlockBackend's AioContext may access
> > > >   the requests list.
> > > > - When the VM is stopped only the main loop may access the requests
> > > >   list.
> > > >
> > > > These constraints protect the requests list without the need for locking
> > > > in the I/O code path.
> > > >
> > > > Note that multiple IOThreads are not supported yet because the code
> > > > assumes all SCSIRequests are executed from a single AioContext. Leave
> > > > that as future work.
> > > >
> > > > Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> > > > ---
> > > >  include/hw/scsi/scsi.h |   7 +-
> > > >  hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
> > > >  2 files changed, 124 insertions(+), 57 deletions(-)
> > > >
> > > > diff --git a/include/hw/scsi/scsi.h b/include/hw/scsi/scsi.h
> > > > index 3692ca82f3..10c4e8288d 100644
> > > > --- a/include/hw/scsi/scsi.h
> > > > +++ b/include/hw/scsi/scsi.h
> > > > @@ -69,14 +69,19 @@ struct SCSIDevice
> > > >  {
> > > >      DeviceState qdev;
> > > >      VMChangeStateEntry *vmsentry;
> > > > -    QEMUBH *bh;
> > > >      uint32_t id;
> > > >      BlockConf conf;
> > > >      SCSISense unit_attention;
> > > >      bool sense_is_ua;
> > > >      uint8_t sense[SCSI_SENSE_BUF_SIZE];
> > > >      uint32_t sense_len;
> > > > +
> > > > +    /*
> > > > +     * The requests list is only accessed from the AioContext that executes
> > > > +     * requests or from the main loop when IOThread processing is stopped.
> > > > +     */
> > > >      QTAILQ_HEAD(, SCSIRequest) requests;
> > > > +
> > > >      uint32_t channel;
> > > >      uint32_t lun;
> > > >      int blocksize;
> > > > diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
> > > > index fc4b77fdb0..b8bfde9565 100644
> > > > --- a/hw/scsi/scsi-bus.c
> > > > +++ b/hw/scsi/scsi-bus.c
> > > > @@ -85,6 +85,82 @@ SCSIDevice *scsi_device_get(SCSIBus *bus, int channel, int id, int lun)
> > > >      return d;
> > > >  }
> > > >
> > > > +/*
> > > > + * Invoke @fn() for each enqueued request in device @s. Must be called from the
> > > > + * main loop thread while the guest is stopped. This is only suitable for
> > > > + * vmstate ->put(), use scsi_device_for_each_req_async() for other cases.
> > > > + */
> > > > +static void scsi_device_for_each_req_sync(SCSIDevice *s,
> > > > +                                          void (*fn)(SCSIRequest *, void *),
> > > > +                                          void *opaque)
> > > > +{
> > > > +    SCSIRequest *req;
> > > > +    SCSIRequest *next_req;
> > > > +
> > > > +    assert(!runstate_is_running());
> > > > +    assert(qemu_in_main_thread());
> > > > +
> > > > +    QTAILQ_FOREACH_SAFE(req, &s->requests, next, next_req) {
> > > > +        fn(req, opaque);
> > > > +    }
> > > > +}
> > > > +
> > > > +typedef struct {
> > > > +    SCSIDevice *s;
> > > > +    void (*fn)(SCSIRequest *, void *);
> > > > +    void *fn_opaque;
> > > > +} SCSIDeviceForEachReqAsyncData;
> > > > +
> > > > +static void scsi_device_for_each_req_async_bh(void *opaque)
> > > > +{
> > > > +    g_autofree SCSIDeviceForEachReqAsyncData *data = opaque;
> > > > +    SCSIDevice *s = data->s;
> > > > +    SCSIRequest *req;
> > > > +    SCSIRequest *next;
> > > > +
> > > > +    /*
> > > > +     * It is unlikely that the AioContext will change before this BH is called,
> > > > +     * but if it happens then ->requests must not be accessed from this
> > > > +     * AioContext.
> > > > +     */
> > >
> > > What is the scenario where this happens? I would have expected that
> > > switching the AioContext of a node involves draining the node first,
> > > which would execute this BH before the context changes.
> >
> > I don't think aio_poll() is invoked by bdrv_drained_begin() when there
> > are no requests in flight. In that case the BH could remain pending
> > across bdrv_drained_begin()/bdrv_drained_end().
>
> Hm, I wonder if that is actually a bug. Without executing pending BHs,
> you can't be sure that nothing touches the node any more.
>
> Before commit 5e8ac217, we always polled at least once, though I think
> that was an unintentional side effect.

It makes the programming model easier and safer if aio_bh_poll() is
guaranteed to be called by bdrv_drained_begin().

Then I could convert this conditional into an assertion and assume it
never happens.

> > > The other option I see is an empty BlockBackend, which can change its
> > > AioContext without polling BHs, but in that case there is no connection
> > > to other users, so the only change could come from virtio-scsi itself.
> > > If there is such a case, it would probably be helpful to be specific in
> > > the comment.
> > >
> > > > +    if (blk_get_aio_context(s->conf.blk) == qemu_get_current_aio_context()) {
> > > > +        QTAILQ_FOREACH_SAFE(req, &s->requests, next, next) {
> > > > +            data->fn(req, data->fn_opaque);
> > > > +        }
> > > > +    }
> > >
> > > Of course, if the situation does happen, the question is why just doing
> > > nothing is correct. Wouldn't that mean that the guest still sees stuck
> > > requests?
> > >
> > > Would rescheduling the BH in the new context be better?
> >
> > In the case where there are no requests it is correct to do nothing,
> > but it's not a general solution.
>
> Why is it correct when there are no requests? I can see this for
> scsi_device_purge_requests() because it only cancels in-flight requests,
> but scsi_dma_restart_cb() is about requests queued at the device level
> that are not in flight in the block layer. Not restarting them if there
> aren't any other requests in flight seems wrong.

You're right!

> > > > +
> > > > +    /* Drop the reference taken by scsi_device_for_each_req_async() */
> > > > +    object_unref(OBJECT(s));
> > > > +}
> > > > +
> > > > +/*
> > > > + * Schedule @fn() to be invoked for each enqueued request in device @s. @fn()
> > > > + * runs in the AioContext that is executing the request.
> > > > + */
> > > > +static void scsi_device_for_each_req_async(SCSIDevice *s,
> > > > +                                           void (*fn)(SCSIRequest *, void *),
> > > > +                                           void *opaque)
> > >
> > > If we keep the behaviour above (doesn't do anything if the AioContext
> > > changes), then I think it needs to be documented for this function and
> > > callers should be explicit about why it's okay.
> >
> > I think your suggestion to reschedule in the new AioContext is best.
>
> Ok, then the answer for the above is less important.

I already sent v2. Do you think bdrv_drained_begin() should always call
aio_poll()? If so, I will send a v3.

Stefan

Re: [PATCH 1/4] scsi: only access SCSIDevice->requests from one thread

Posted by Eric Blake 2 years, 2 months ago

On Thu, Nov 23, 2023 at 02:49:28PM -0500, Stefan Hajnoczi wrote:
> Stop depending on the AioContext lock and instead access
> SCSIDevice->requests from only one thread at a time:
> - When the VM is running only the BlockBackend's AioContext may access
>   the requests list.
> - When the VM is stopped only the main loop may access the requests
>   list.
> 
> These constraints protect the requests list without the need for locking
> in the I/O code path.
> 
> Note that multiple IOThreads are not supported yet because the code
> assumes all SCSIRequests are executed from a single AioContext. Leave
> that as future work.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
>  include/hw/scsi/scsi.h |   7 +-
>  hw/scsi/scsi-bus.c     | 174 ++++++++++++++++++++++++++++-------------
>  2 files changed, 124 insertions(+), 57 deletions(-)
>

Reviewed-by: Eric Blake <eblake@redhat.com>

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org

[PATCH 1/4] scsi: only access SCSIDevice->requests from one thread
[PATCH 2/4] virtio-scsi: don't lock AioContext around virtio_queue_aio_attach_host_notifier()
[PATCH 3/4] scsi: don't lock AioContext in I/O code path
[PATCH 4/4] dma-helpers: don't lock AioContext in dma_blk_cb()