[Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread

Stefan Hajnoczi posted 1 patch 4 years, 9 months ago
Test s390x passed
Test checkpatch failed
Test asan passed
Test docker-mingw@fedora passed
Test docker-clang@ubuntu passed
Test FreeBSD passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20190612120421.20336-1-stefanha@redhat.com
Maintainers: "Michael S. Tsirkin" <mst@redhat.com>, Fam Zheng <fam@euphon.net>, Paolo Bonzini <pbonzini@redhat.com>
There is a newer version of this series
include/sysemu/sysemu.h | 10 ++++++++++
hw/scsi/scsi-bus.c      |  6 ++++--
hw/virtio/virtio.c      |  6 ++++--
vl.c                    | 44 +++++++++++++++++++++++++++++++----------
4 files changed, 52 insertions(+), 14 deletions(-)
[Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Stefan Hajnoczi 4 years, 9 months ago
When the 'cont' command resumes guest execution the vm change state
handlers are invoked.  Unfortunately there is no explicit ordering
between vm change state handlers.  When two layers of code both use vm
change state handlers, we don't control which handler runs first.

virtio-scsi with iothreads hits a deadlock when a failed SCSI command is
restarted and completes before the iothread is re-initialized.

This patch introduces priorities for VM change state handlers so the
IOThread is guaranteed to be initialized before DMA requests are
restarted.

Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
---
v4:
Paolo and Michael were interested in a priorities system.  Kevin wasn't
convinced.  Here is a patch implementing the priorities approach so you
can decide whether you prefer this or not.

The customer has now confirmed that the deadlock is fixed.  I have
changed the Subject line from RFC to PATCH.

 include/sysemu/sysemu.h | 10 ++++++++++
 hw/scsi/scsi-bus.c      |  6 ++++--
 hw/virtio/virtio.c      |  6 ++++--
 vl.c                    | 44 +++++++++++++++++++++++++++++++----------
 4 files changed, 52 insertions(+), 14 deletions(-)

diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 61579ae71e..1a4db092c7 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -27,8 +27,18 @@ bool runstate_store(char *str, size_t size);
 typedef struct vm_change_state_entry VMChangeStateEntry;
 typedef void VMChangeStateHandler(void *opaque, int running, RunState state);
 
+enum {
+    /* Low priorities run first when the VM starts */
+    VM_CHANGE_STATE_HANDLER_PRIO_UNDEFINED = 0,
+    VM_CHANGE_STATE_HANDLER_PRIO_IOTHREAD = 100,
+    VM_CHANGE_STATE_HANDLER_PRIO_DEVICE = 200,
+    /* High priorities run first when the VM stops */
+};
+
 VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
                                                      void *opaque);
+VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
+        VMChangeStateHandler *cb, void *opaque, int priority);
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
 void vm_state_notify(int running, RunState state);
 
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index c480553083..eda5b9a19e 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -206,8 +206,10 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
         error_propagate(errp, local_err);
         return;
     }
-    dev->vmsentry = qemu_add_vm_change_state_handler(scsi_dma_restart_cb,
-                                                     dev);
+    dev->vmsentry = qemu_add_vm_change_state_handler_prio(
+            scsi_dma_restart_cb,
+            dev,
+            VM_CHANGE_STATE_HANDLER_PRIO_DEVICE);
 }
 
 static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 07f4a64b48..9256af587a 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -2354,8 +2354,10 @@ void virtio_init(VirtIODevice *vdev, const char *name,
     } else {
         vdev->config = NULL;
     }
-    vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
-                                                     vdev);
+    vdev->vmstate = qemu_add_vm_change_state_handler_prio(
+            virtio_vmstate_change,
+            vdev,
+            VM_CHANGE_STATE_HANDLER_PRIO_IOTHREAD);
     vdev->device_endian = virtio_default_endian();
     vdev->use_guest_notifier_mask = true;
 }
diff --git a/vl.c b/vl.c
index cd1fbc4cdc..26c82063d2 100644
--- a/vl.c
+++ b/vl.c
@@ -1469,27 +1469,45 @@ static int machine_help_func(QemuOpts *opts, MachineState *machine)
 struct vm_change_state_entry {
     VMChangeStateHandler *cb;
     void *opaque;
-    QLIST_ENTRY (vm_change_state_entry) entries;
+    QTAILQ_ENTRY (vm_change_state_entry) entries;
+    int priority;
 };
 
-static QLIST_HEAD(, vm_change_state_entry) vm_change_state_head;
+static QTAILQ_HEAD(, vm_change_state_entry) vm_change_state_head;
 
-VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
-                                                     void *opaque)
+VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
+        VMChangeStateHandler *cb, void *opaque, int priority)
 {
     VMChangeStateEntry *e;
+    VMChangeStateEntry *other;
 
     e = g_malloc0(sizeof (*e));
-
     e->cb = cb;
     e->opaque = opaque;
-    QLIST_INSERT_HEAD(&vm_change_state_head, e, entries);
+    e->priority = priority;
+
+    /* Keep list sorted in ascending priority order */
+    QTAILQ_FOREACH(other, &vm_change_state_head, entries) {
+        if (priority < other->priority) {
+            QTAILQ_INSERT_BEFORE(other, e, entries);
+            return e;
+        }
+    }
+
+    QTAILQ_INSERT_TAIL(&vm_change_state_head, e, entries);
     return e;
 }
 
+VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
+                                                     void *opaque)
+{
+    return qemu_add_vm_change_state_handler_prio(cb, opaque,
+            VM_CHANGE_STATE_HANDLER_PRIO_UNDEFINED);
+}
+
 void qemu_del_vm_change_state_handler(VMChangeStateEntry *e)
 {
-    QLIST_REMOVE (e, entries);
+    QTAILQ_REMOVE (&vm_change_state_head, e, entries);
     g_free (e);
 }
 
@@ -1499,8 +1517,14 @@ void vm_state_notify(int running, RunState state)
 
     trace_vm_state_notify(running, state, RunState_str(state));
 
-    QLIST_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
-        e->cb(e->opaque, running, state);
+    if (running) {
+        QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
+            e->cb(e->opaque, running, state);
+        }
+    } else {
+        QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
+            e->cb(e->opaque, running, state);
+        }
     }
 }
 
@@ -3009,7 +3033,7 @@ int main(int argc, char **argv, char **envp)
         exit(1);
     }
 
-    QLIST_INIT (&vm_change_state_head);
+    QTAILQ_INIT (&vm_change_state_head);
     os_setup_early_signal_handling();
 
     cpu_option = NULL;
-- 
2.21.0


Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Paolo Bonzini 4 years, 9 months ago
On 12/06/19 14:04, Stefan Hajnoczi wrote:
> When the 'cont' command resumes guest execution the vm change state
> handlers are invoked.  Unfortunately there is no explicit ordering
> between vm change state handlers.  When two layers of code both use vm
> change state handlers, we don't control which handler runs first.
> 
> virtio-scsi with iothreads hits a deadlock when a failed SCSI command is
> restarted and completes before the iothread is re-initialized.
> 
> This patch introduces priorities for VM change state handlers so the
> IOThread is guaranteed to be initialized before DMA requests are
> restarted.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> v4:
> Paolo and Michael were interested in a priorities system.  Kevin wasn't
> convinced.  Here is a patch implementing the priorities approach so you
> can decide whether you prefer this or not.

I like this better than the others, but I do agree with Kevin that the
names aren't great and PRIO_IOTHREAD has nothing to do with iothreads
really.

Maybe the priority should be simply the "depth" of the device's bus, so
2 for scsi because we know it always has a controller between the device
and the machine and 1 for everything else.  Maybe who knows, in the
future the vmstate_change handler could be moved in DeviceClass and
propagated across buses[1]

So I vote for this patch but with VM_CHANGE_STATE_HANDLER_PRIO_IOTHREAD
renamed to VM_CHANGE_STATE_HANDLER_PRIO_DEVICE and
VM_CHANGE_STATE_HANDLER_PRIO_DEVICE renamed to
VM_CHANGE_STATE_HANDLER_PRIO_DISK (which is consistent with the naming
of scsi-disk, though not with ide-drive...).

Paolo

[1] With care, because currently the initialization order for stop is
virtio-device -> virtio-pci -> scsi-bus (and the opposite for start).  I
am not sure that the order for virtio-pci and virtio-device could be
exchanged, as would be the case if you followed the bus order
pci->virtio->scsi.

> The customer has now confirmed that the deadlock is fixed.  I have
> changed the Subject line from RFC to PATCH.
> 
>  include/sysemu/sysemu.h | 10 ++++++++++
>  hw/scsi/scsi-bus.c      |  6 ++++--
>  hw/virtio/virtio.c      |  6 ++++--
>  vl.c                    | 44 +++++++++++++++++++++++++++++++----------
>  4 files changed, 52 insertions(+), 14 deletions(-)
> 
> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
> index 61579ae71e..1a4db092c7 100644
> --- a/include/sysemu/sysemu.h
> +++ b/include/sysemu/sysemu.h
> @@ -27,8 +27,18 @@ bool runstate_store(char *str, size_t size);
>  typedef struct vm_change_state_entry VMChangeStateEntry;
>  typedef void VMChangeStateHandler(void *opaque, int running, RunState state);
>  
> +enum {
> +    /* Low priorities run first when the VM starts */
> +    VM_CHANGE_STATE_HANDLER_PRIO_UNDEFINED = 0,
> +    VM_CHANGE_STATE_HANDLER_PRIO_IOTHREAD = 100,
> +    VM_CHANGE_STATE_HANDLER_PRIO_DEVICE = 200,
> +    /* High priorities run first when the VM stops */
> +};
> +
>  VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
>                                                       void *opaque);
> +VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
> +        VMChangeStateHandler *cb, void *opaque, int priority);
>  void qemu_del_vm_change_state_handler(VMChangeStateEntry *e);
>  void vm_state_notify(int running, RunState state);
>  
> diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
> index c480553083..eda5b9a19e 100644
> --- a/hw/scsi/scsi-bus.c
> +++ b/hw/scsi/scsi-bus.c
> @@ -206,8 +206,10 @@ static void scsi_qdev_realize(DeviceState *qdev, Error **errp)
>          error_propagate(errp, local_err);
>          return;
>      }
> -    dev->vmsentry = qemu_add_vm_change_state_handler(scsi_dma_restart_cb,
> -                                                     dev);
> +    dev->vmsentry = qemu_add_vm_change_state_handler_prio(
> +            scsi_dma_restart_cb,
> +            dev,
> +            VM_CHANGE_STATE_HANDLER_PRIO_DEVICE);
>  }
>  
>  static void scsi_qdev_unrealize(DeviceState *qdev, Error **errp)
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 07f4a64b48..9256af587a 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -2354,8 +2354,10 @@ void virtio_init(VirtIODevice *vdev, const char *name,
>      } else {
>          vdev->config = NULL;
>      }
> -    vdev->vmstate = qemu_add_vm_change_state_handler(virtio_vmstate_change,
> -                                                     vdev);
> +    vdev->vmstate = qemu_add_vm_change_state_handler_prio(
> +            virtio_vmstate_change,
> +            vdev,
> +            VM_CHANGE_STATE_HANDLER_PRIO_IOTHREAD);
>      vdev->device_endian = virtio_default_endian();
>      vdev->use_guest_notifier_mask = true;
>  }
> diff --git a/vl.c b/vl.c
> index cd1fbc4cdc..26c82063d2 100644
> --- a/vl.c
> +++ b/vl.c
> @@ -1469,27 +1469,45 @@ static int machine_help_func(QemuOpts *opts, MachineState *machine)
>  struct vm_change_state_entry {
>      VMChangeStateHandler *cb;
>      void *opaque;
> -    QLIST_ENTRY (vm_change_state_entry) entries;
> +    QTAILQ_ENTRY (vm_change_state_entry) entries;
> +    int priority;
>  };
>  
> -static QLIST_HEAD(, vm_change_state_entry) vm_change_state_head;
> +static QTAILQ_HEAD(, vm_change_state_entry) vm_change_state_head;
>  
> -VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
> -                                                     void *opaque)
> +VMChangeStateEntry *qemu_add_vm_change_state_handler_prio(
> +        VMChangeStateHandler *cb, void *opaque, int priority)
>  {
>      VMChangeStateEntry *e;
> +    VMChangeStateEntry *other;
>  
>      e = g_malloc0(sizeof (*e));
> -
>      e->cb = cb;
>      e->opaque = opaque;
> -    QLIST_INSERT_HEAD(&vm_change_state_head, e, entries);
> +    e->priority = priority;
> +
> +    /* Keep list sorted in ascending priority order */
> +    QTAILQ_FOREACH(other, &vm_change_state_head, entries) {
> +        if (priority < other->priority) {
> +            QTAILQ_INSERT_BEFORE(other, e, entries);
> +            return e;
> +        }
> +    }
> +
> +    QTAILQ_INSERT_TAIL(&vm_change_state_head, e, entries);
>      return e;
>  }
>  
> +VMChangeStateEntry *qemu_add_vm_change_state_handler(VMChangeStateHandler *cb,
> +                                                     void *opaque)
> +{
> +    return qemu_add_vm_change_state_handler_prio(cb, opaque,
> +            VM_CHANGE_STATE_HANDLER_PRIO_UNDEFINED);
> +}
> +
>  void qemu_del_vm_change_state_handler(VMChangeStateEntry *e)
>  {
> -    QLIST_REMOVE (e, entries);
> +    QTAILQ_REMOVE (&vm_change_state_head, e, entries);
>      g_free (e);
>  }
>  
> @@ -1499,8 +1517,14 @@ void vm_state_notify(int running, RunState state)
>  
>      trace_vm_state_notify(running, state, RunState_str(state));
>  
> -    QLIST_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
> -        e->cb(e->opaque, running, state);
> +    if (running) {
> +        QTAILQ_FOREACH_SAFE(e, &vm_change_state_head, entries, next) {
> +            e->cb(e->opaque, running, state);
> +        }
> +    } else {
> +        QTAILQ_FOREACH_REVERSE_SAFE(e, &vm_change_state_head, entries, next) {
> +            e->cb(e->opaque, running, state);
> +        }
>      }
>  }
>  
> @@ -3009,7 +3033,7 @@ int main(int argc, char **argv, char **envp)
>          exit(1);
>      }
>  
> -    QLIST_INIT (&vm_change_state_head);
> +    QTAILQ_INIT (&vm_change_state_head);
>      os_setup_early_signal_handling();
>  
>      cpu_option = NULL;
> 


Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Kevin Wolf 4 years, 9 months ago
Am 12.06.2019 um 14:04 hat Stefan Hajnoczi geschrieben:
> When the 'cont' command resumes guest execution the vm change state
> handlers are invoked.  Unfortunately there is no explicit ordering
> between vm change state handlers.  When two layers of code both use vm
> change state handlers, we don't control which handler runs first.
> 
> virtio-scsi with iothreads hits a deadlock when a failed SCSI command is
> restarted and completes before the iothread is re-initialized.
> 
> This patch introduces priorities for VM change state handlers so the
> IOThread is guaranteed to be initialized before DMA requests are
> restarted.
> 
> Signed-off-by: Stefan Hajnoczi <stefanha@redhat.com>
> ---
> v4:
> Paolo and Michael were interested in a priorities system.  Kevin wasn't
> convinced.  Here is a patch implementing the priorities approach so you
> can decide whether you prefer this or not.

I still prefer the v3 approach that reflects the actual dependencies in
the code. With the priorities approach, we try to represent a tree of
devices (a structure we already have) with an additional flat list of
numbers. While these numbers can actually provide all the information
that we need in practice in this specific case, it still feels a bit
hacky and might not be extensible to future cases.

But in the end, if Paolo feels strongly that for whatever reason
propagating events through the real device tree isn't good, let's get
the bug fixed with whatever hack it takes.

Kevin

Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Paolo Bonzini 4 years, 9 months ago
On 17/06/19 14:29, Kevin Wolf wrote:
> But in the end, if Paolo feels strongly that for whatever reason
> propagating events through the real device tree isn't good, let's get
> the bug fixed with whatever hack it takes.

It is actually good, but the implementation in hw/scsi is ugly because
it singles out virtio-scsi - whereas the rule should simply be that the
HBA is stopped after the disks and started before.  Having the HBA do
something special if it cares about the order is the part that I didn't
like.

Paolo

Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Kevin Wolf 4 years, 9 months ago
Am 17.06.2019 um 19:23 hat Paolo Bonzini geschrieben:
> On 17/06/19 14:29, Kevin Wolf wrote:
> > But in the end, if Paolo feels strongly that for whatever reason
> > propagating events through the real device tree isn't good, let's get
> > the bug fixed with whatever hack it takes.
> 
> It is actually good, but the implementation in hw/scsi is ugly because
> it singles out virtio-scsi - whereas the rule should simply be that the
> HBA is stopped after the disks and started before.  Having the HBA do
> something special if it cares about the order is the part that I didn't
> like.

The alternative would be to add a VM state change handler to all other
HBAs, too, so that scsi-bus never has to register its own handler. I
don't think we have that many HBAs, so maybe it's actually workable.

Of course, this makes me think that maybe for the actual proper
solution, VM state change handlers should really be a feature that qdev
provides for devices. Then if a HBA doesn't have anything else to do,
the qdev core would just recurse into the children right away; if it has
something to do, it would disable the device after its children, and
re-enable it before the children.

However, I'm afraid we're talking about a major feature then and not
about a simple fix any more. :-(

Kevin

Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Paolo Bonzini 4 years, 9 months ago
On 17/06/19 19:58, Kevin Wolf wrote:
> Of course, this makes me think that maybe for the actual proper
> solution, VM state change handlers should really be a feature that qdev
> provides for devices. Then if a HBA doesn't have anything else to do,
> the qdev core would just recurse into the children right away; if it has
> something to do, it would disable the device after its children, and
> re-enable it before the children.

This was more or less my reply to this version of the patch. :)

Paolo

Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by Stefan Hajnoczi 4 years, 9 months ago
On Mon, Jun 17, 2019 at 08:27:04PM +0200, Paolo Bonzini wrote:
> On 17/06/19 19:58, Kevin Wolf wrote:
> > Of course, this makes me think that maybe for the actual proper
> > solution, VM state change handlers should really be a feature that qdev
> > provides for devices. Then if a HBA doesn't have anything else to do,
> > the qdev core would just recurse into the children right away; if it has
> > something to do, it would disable the device after its children, and
> > re-enable it before the children.
> 
> This was more or less my reply to this version of the patch. :)

I'll send a new patch that is a combination of these ideas.

Stefan
Re: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Posted by no-reply@patchew.org 4 years, 9 months ago
Patchew URL: https://patchew.org/QEMU/20190612120421.20336-1-stefanha@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Subject: [Qemu-devel] [PATCH v4] virtio-scsi: restart DMA after iothread
Type: series
Message-id: 20190612120421.20336-1-stefanha@redhat.com

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

From https://github.com/patchew-project/qemu
   219dca61eb..59c58f96b2  master     -> master
From https://github.com/patchew-project/qemu
 * [new tag]               patchew/20190612120421.20336-1-stefanha@redhat.com -> patchew/20190612120421.20336-1-stefanha@redhat.com
Switched to a new branch 'test'
d1d49071ff virtio-scsi: restart DMA after iothread

=== OUTPUT BEGIN ===
ERROR: space prohibited between function name and open parenthesis '('
#88: FILE: vl.c:1472:
+    QTAILQ_ENTRY (vm_change_state_entry) entries;

ERROR: space prohibited between function name and open parenthesis '('
#101: FILE: vl.c:1484:
+    e = g_malloc0(sizeof (*e));

ERROR: space prohibited between function name and open parenthesis '('
#136: FILE: vl.c:1510:
+    QTAILQ_REMOVE (&vm_change_state_head, e, entries);

ERROR: space prohibited between function name and open parenthesis '('
#162: FILE: vl.c:3036:
+    QTAILQ_INIT (&vm_change_state_head);

total: 4 errors, 0 warnings, 122 lines checked

Commit d1d49071ff6e (virtio-scsi: restart DMA after iothread) has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20190612120421.20336-1-stefanha@redhat.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com