[PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Michael Roth 4 years, 4 months ago
Currently the SLOF firmware for pseries guests will disable/re-enable
a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
register after the initial probe/feature negotiation, as it tends to
work with a single device at a time at various stages like probing
and running block/network bootloaders without doing a full reset
in-between.

In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
corresponding IOMMU memory region, so DMA accesses (including to vring
fields like idx/flags) will no longer undergo the necessary
translation. Normally we wouldn't expect this to happen since it would
be misbehavior on the driver side to continue driving DMA requests.
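
For reference, the bus-master handling lives in QEMU's generic PCI
config-write path; roughly (paraphrased from pci_default_write_config()
in hw/pci/pci.c, details may vary between versions):

  if (range_covers_byte(addr, l, PCI_COMMAND)) {
      ...
      /* DMA (and thus vring) accesses are routed through this region */
      memory_region_set_enabled(&d->bus_master_enable_region,
                                pci_get_word(d->config + PCI_COMMAND)
                                & PCI_COMMAND_MASTER);
  }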

However, in the case of pseries, with iommu_platform=on, we trigger the
following sequence when tearing down the virtio-blk dataplane ioeventfd
in response to the guest unsetting PCI_COMMAND_MASTER:

  #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
  #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
  #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
  #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
  #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
  #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
  #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
  #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:520
  #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:607
  #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
      at /home/mdroth/w/qemu.git/util/aio-posix.c:639
  #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
      at /home/mdroth/w/qemu.git/util/aio-wait.c:71
  #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
  #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
  #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
  #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
  #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
      at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613

I.e. the calling code is only scheduling a one-shot BH for
virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
an additional virtqueue entry before we get there. This is likely due
to the following check in virtio_queue_host_notifier_aio_poll:

  static bool virtio_queue_host_notifier_aio_poll(void *opaque)
  {
      EventNotifier *n = opaque;
      VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
      bool progress;

      if (!vq->vring.desc || virtio_queue_empty(vq)) {
          return false;
      }

      progress = virtio_queue_notify_aio_vq(vq);

namely the call to virtio_queue_empty(). In this case, since no new
requests have actually been issued, shadow_avail_idx == last_avail_idx,
so we actually try to access the vring via vring_avail_idx() to get
the latest non-shadowed idx:

  int virtio_queue_empty(VirtQueue *vq)
  {
      bool empty;
      ...

      if (vq->shadow_avail_idx != vq->last_avail_idx) {
          return 0;
      }

      rcu_read_lock();
      empty = vring_avail_idx(vq) == vq->last_avail_idx;
      rcu_read_unlock();
      return empty;

but since the IOMMU region has been disabled we get a bogus value (0
usually), which causes virtio_queue_empty() to falsely report that
there are entries to be processed, which causes errors such as:

  "virtio: zero sized buffers are not allowed"

or

  "virtio-blk missing headers"

and puts the device in an error state.
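
For context, vring_avail_idx() reads that index through the per-queue
memory region cache, roughly (paraphrased from hw/virtio/virtio.c,
details may differ):

  static inline uint16_t vring_avail_idx(VirtQueue *vq)
  {
      VRingMemoryRegionCaches *caches = vring_get_region_caches(vq);
      hwaddr pa = offsetof(VRingAvail, idx);

      vq->shadow_avail_idx = virtio_lduw_phys_cached(vq->vdev, &caches->avail, pa);
      return vq->shadow_avail_idx;
  }

so once the IOMMU region backing that cache is disabled, the load no
longer reaches guest memory and returns a bogus value (typically 0).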

This patch works around the issue by introducing virtio_set_disabled(),
which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
when bus-mastering is disabled. Since we'd check this flag at all the
same sites as vdev->broken, we replace those checks with an inline
function which checks for either vdev->broken or vdev->disabled.

The 'disabled' flag is only migrated when set, which should be fairly
rare, but to maintain migration compatibility we disable its use for
older machine types. Users requiring the use of the flag in conjunction
with older machine types can set it explicitly as a virtio-device
option.
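
For example (untested; using the property name introduced by this patch),
the flag can be enabled on a 4.1 machine type either per device or
globally:

  -device virtio-blk-pci,use-disabled-flag=on,...
  -global virtio-device.use-disabled-flag=on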

NOTES:

 - This leaves some other oddities in play, like the fact that
   DRIVER_OK also gets unset in response to bus-mastering being
   disabled, but not restored (however the device seems to continue
   working)
 - Similarly, we disable the host notifier via
   virtio_bus_stop_ioeventfd(), which seems to move the handling out
   of virtio-blk dataplane and back into the main IO thread, and it
   ends up staying there till a reset (but otherwise continues working
   normally)

Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
Cc: "Michael S. Tsirkin" <mst@redhat.com>
Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
---
v2:
 - add migration support and only default to using 'disabled' flag
   for newer machines via virtio-device compat option (MST)
 - use inline functions to decouple checks from specific fields (MST)
 - rebased on master
---
 hw/core/machine.c          |  1 +
 hw/virtio/virtio-pci.c     | 12 ++++++++----
 hw/virtio/virtio.c         | 35 ++++++++++++++++++++++++++++-------
 include/hw/virtio/virtio.h | 15 +++++++++++++++
 4 files changed, 52 insertions(+), 11 deletions(-)

diff --git a/hw/core/machine.c b/hw/core/machine.c
index 1689ad3bf8..9f3073b23b 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -29,6 +29,7 @@
 
 GlobalProperty hw_compat_4_1[] = {
     { "virtio-pci", "x-pcie-flr-init", "off" },
+    { "virtio-device", "use-disabled-flag", "false" },
 };
 const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1);
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index c6b47a9c73..394d409fb9 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
         pcie_cap_flr_write_config(pci_dev, address, val, len);
     }
 
-    if (range_covers_byte(address, len, PCI_COMMAND) &&
-        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
-        virtio_pci_stop_ioeventfd(proxy);
-        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
+    if (range_covers_byte(address, len, PCI_COMMAND)) {
+        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
+            virtio_set_disabled(vdev, true);
+            virtio_pci_stop_ioeventfd(proxy);
+            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
+        } else {
+            virtio_set_disabled(vdev, false);
+        }
     }
 
     if (proxy->config_cap &&
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 04716b5f6c..3cb603a466 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -546,7 +546,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter)
  * Called within rcu_read_lock().  */
 static int virtio_queue_empty_rcu(VirtQueue *vq)
 {
-    if (unlikely(vq->vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         return 1;
     }
 
@@ -565,7 +565,7 @@ static int virtio_queue_split_empty(VirtQueue *vq)
 {
     bool empty;
 
-    if (unlikely(vq->vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         return 1;
     }
 
@@ -783,7 +783,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
 
     virtqueue_unmap_sg(vq, elem, len);
 
-    if (unlikely(vq->vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         return;
     }
 
@@ -839,7 +839,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
 
 void virtqueue_flush(VirtQueue *vq, unsigned int count)
 {
-    if (unlikely(vq->vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         vq->inuse -= count;
         return;
     }
@@ -1602,7 +1602,7 @@ err_undo_map:
 
 void *virtqueue_pop(VirtQueue *vq, size_t sz)
 {
-    if (unlikely(vq->vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         return NULL;
     }
 
@@ -1698,7 +1698,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq)
 {
     struct VirtIODevice *vdev = vq->vdev;
 
-    if (unlikely(vdev->broken)) {
+    if (virtio_device_disabled(vq->vdev)) {
         return 0;
     }
 
@@ -1816,7 +1816,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
     BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
     VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
 
-    if (unlikely(vdev->broken)) {
+    if (virtio_device_disabled(vdev)) {
         return;
     }
 
@@ -1920,6 +1920,7 @@ void virtio_reset(void *opaque)
     vdev->guest_features = 0;
     vdev->queue_sel = 0;
     vdev->status = 0;
+    vdev->disabled = false;
     atomic_set(&vdev->isr, 0);
     vdev->config_vector = VIRTIO_NO_VECTOR;
     virtio_notify_vector(vdev, vdev->config_vector);
@@ -2553,6 +2554,13 @@ static bool virtio_started_needed(void *opaque)
     return vdev->started;
 }
 
+static bool virtio_disabled_needed(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+
+    return vdev->disabled;
+}
+
 static const VMStateDescription vmstate_virtqueue = {
     .name = "virtqueue_state",
     .version_id = 1,
@@ -2718,6 +2726,17 @@ static const VMStateDescription vmstate_virtio_started = {
     }
 };
 
+static const VMStateDescription vmstate_virtio_disabled = {
+    .name = "virtio/disabled",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_disabled_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_BOOL(disabled, VirtIODevice),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio = {
     .name = "virtio",
     .version_id = 1,
@@ -2735,6 +2754,7 @@ static const VMStateDescription vmstate_virtio = {
         &vmstate_virtio_extra_state,
         &vmstate_virtio_started,
         &vmstate_virtio_packed_virtqueues,
+        &vmstate_virtio_disabled,
         NULL
     }
 };
@@ -3569,6 +3589,7 @@ static void virtio_device_instance_finalize(Object *obj)
 static Property virtio_properties[] = {
     DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
     DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
+    DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index c32a815303..f23d2efbc0 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -100,6 +100,8 @@ struct VirtIODevice
     uint16_t device_id;
     bool vm_running;
     bool broken; /* device in invalid state, needs reset */
+    bool use_disabled_flag; /* allow use of 'disable' flag when needed */
+    bool disabled; /* device in temporarily disabled state */
     bool use_started;
     bool started;
     bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
@@ -378,4 +380,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started)
         vdev->started = started;
     }
 }
+
+static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable)
+{
+    if (vdev->use_disabled_flag) {
+        vdev->disabled = disable;
+    }
+}
+
+static inline bool virtio_device_disabled(VirtIODevice *vdev)
+{
+    return unlikely(vdev->disabled || vdev->broken);
+}
+
 #endif
-- 
2.17.1


Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by no-reply@patchew.org 4 years, 4 months ago
Patchew URL: https://patchew.org/QEMU/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

  TEST    check-unit: tests/test-thread-pool
wait_for_migration_fail: unexpected status status=wait-unplug allow_active=1
**
ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)
ERROR - Bail out! ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)
make: *** [check-qtest-aarch64] Error 1
make: *** Waiting for unfinished jobs....
  TEST    check-unit: tests/test-hbitmap
  TEST    check-unit: tests/test-bdrv-drain
---
    raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de', '-u', '1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-wg70rgpu/src/docker-src.2019-11-20-01.02.57.12412:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2.
filter=--filter=label=com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-wg70rgpu/src'
make: *** [docker-run-test-quick@centos7] Error 2

real    9m13.236s
user    0m8.131s


The full log is available at
http://patchew.org/logs/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-devel@redhat.com
Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Michael Roth 4 years, 4 months ago
Quoting no-reply@patchew.org (2019-11-20 00:12:11)
> Patchew URL: https://patchew.org/QEMU/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/
> 
> 
> 
> Hi,
> 
> This series failed the docker-quick@centos7 build test. Please find the testing commands and
> their output below. If you have Docker installed, you can probably reproduce it
> locally.
> 
> === TEST SCRIPT BEGIN ===
> #!/bin/bash
> make docker-image-centos7 V=1 NETWORK=1
> time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
> === TEST SCRIPT END ===
> 
>   TEST    check-unit: tests/test-thread-pool
> wait_for_migration_fail: unexpected status status=wait-unplug allow_active=1
> **
> ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)
> ERROR - Bail out! ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)

Seems to be an unrelated issue noted in this thread:

  https://lists.gnu.org/archive/html/qemu-devel/2019-11/msg01326.html

I'm running the centos docker test in a loop but haven't been able to reproduce
it so far after 7 attempts.

> make: *** [check-qtest-aarch64] Error 1
> make: *** Waiting for unfinished jobs....
>   TEST    check-unit: tests/test-hbitmap
>   TEST    check-unit: tests/test-bdrv-drain
> ---
>     raise CalledProcessError(retcode, cmd)
> subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de', '-u', '1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-wg70rgpu/src/docker-src.2019-11-20-01.02.57.12412:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2.
> filter=--filter=label=com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de
> make[1]: *** [docker-run] Error 1
> make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-wg70rgpu/src'
> make: *** [docker-run-test-quick@centos7] Error 2
> 
> real    9m13.236s
> user    0m8.131s
> 
> 
> The full log is available at
> http://patchew.org/logs/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/testing.docker-quick@centos7/?type=message.
> ---
> Email generated automatically by Patchew [https://patchew.org/].
> Please send your feedback to patchew-devel@redhat.com

Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Dr. David Alan Gilbert 4 years, 4 months ago
* Michael Roth (mdroth@linux.vnet.ibm.com) wrote:
> Quoting no-reply@patchew.org (2019-11-20 00:12:11)
> > Patchew URL: https://patchew.org/QEMU/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/
> > 
> > 
> > 
> > Hi,
> > 
> > This series failed the docker-quick@centos7 build test. Please find the testing commands and
> > their output below. If you have Docker installed, you can probably reproduce it
> > locally.
> > 
> > === TEST SCRIPT BEGIN ===
> > #!/bin/bash
> > make docker-image-centos7 V=1 NETWORK=1
> > time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
> > === TEST SCRIPT END ===
> > 
> >   TEST    check-unit: tests/test-thread-pool
> > wait_for_migration_fail: unexpected status status=wait-unplug allow_active=1
> > **
> > ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)
> > ERROR - Bail out! ERROR:/tmp/qemu-test/src/tests/migration-test.c:908:wait_for_migration_fail: assertion failed: (result)
> 
> Seems to be an unrelated issue noted in this thread:
> 
>   https://lists.gnu.org/archive/html/qemu-devel/2019-11/msg01326.html

Yeh, Jens has a patch series that's fixing that.

Dave

> I'm running the centos docker test in a loop but haven't been able to reproduce
> so far after 7 attempts
> 
> > make: *** [check-qtest-aarch64] Error 1
> > make: *** Waiting for unfinished jobs....
> >   TEST    check-unit: tests/test-hbitmap
> >   TEST    check-unit: tests/test-bdrv-drain
> > ---
> >     raise CalledProcessError(retcode, cmd)
> > subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', '--label', 'com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de', '-u', '1001', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', '-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', '/home/patchew/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', '/var/tmp/patchew-tester-tmp-wg70rgpu/src/docker-src.2019-11-20-01.02.57.12412:/var/tmp/qemu:z,ro', 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit status 2.
> > filter=--filter=label=com.qemu.instance.uuid=c863e15882a747a88c290575505cc1de
> > make[1]: *** [docker-run] Error 1
> > make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-wg70rgpu/src'
> > make: *** [docker-run-test-quick@centos7] Error 2
> > 
> > real    9m13.236s
> > user    0m8.131s
> > 
> > 
> > The full log is available at
> > http://patchew.org/logs/20191120005003.27035-1-mdroth@linux.vnet.ibm.com/testing.docker-quick@centos7/?type=message.
> > ---
> > Email generated automatically by Patchew [https://patchew.org/].
> > Please send your feedback to patchew-devel@redhat.com
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK


Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Halil Pasic 4 years, 4 months ago
On Tue, 19 Nov 2019 18:50:03 -0600
Michael Roth <mdroth@linux.vnet.ibm.com> wrote:

[..]
> I.e. the calling code is only scheduling a one-shot BH for
> virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> an additional virtqueue entry before we get there. This is likely due
> to the following check in virtio_queue_host_notifier_aio_poll:
> 
>   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
>   {
>       EventNotifier *n = opaque;
>       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
>       bool progress;
> 
>       if (!vq->vring.desc || virtio_queue_empty(vq)) {
>           return false;
>       }
> 
>       progress = virtio_queue_notify_aio_vq(vq);
> 
> namely the call to virtio_queue_empty(). In this case, since no new
> requests have actually been issued, shadow_avail_idx == last_avail_idx,
> so we actually try to access the vring via vring_avail_idx() to get
> the latest non-shadowed idx:
> 
>   int virtio_queue_empty(VirtQueue *vq)
>   {
>       bool empty;
>       ...
> 
>       if (vq->shadow_avail_idx != vq->last_avail_idx) {
>           return 0;
>       }
> 
>       rcu_read_lock();
>       empty = vring_avail_idx(vq) == vq->last_avail_idx;
>       rcu_read_unlock();
>       return empty;
> 
> but since the IOMMU region has been disabled we get a bogus value (0
> usually), which causes virtio_queue_empty() to falsely report that
> there are entries to be processed, which causes errors such as:
> 
>   "virtio: zero sized buffers are not allowed"
> 
> or
> 
>   "virtio-blk missing headers"
> 
> and puts the device in an error state.
> 

I've seen something similar on s390x with virtio-ccw-blk under
protected virtualization, that made me wonder about how virtio-blk in
particular but also virtio in general handles shutdown and reset.

This makes me wonder if bus-mastering being disabled is the only scenario
where something like vdev->disabled should be used.

In particular I have the following mechanism in mind 

qemu_system_reset() --> ... --> qemu_devices_reset() --> ... -->
virtio_[transport]_reset() --> ... --> virtio_bus_stop_ioeventfd() -->
virtio_blk_data_plane_stop()

which in turn triggers the following cascade:

virtio_blk_data_plane_stop_bh --> virtio_queue_aio_set_host_notifier_handler() -->
virtio_queue_host_notifier_aio_read(), which however calls
virtio_queue_notify_aio_vq() if the notifier tests as positive.

Since we still have vq->handle_aio_output, that means we may
call virtqueue_pop() during the reset procedure.
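
For reference, the read path in question looks roughly like this
(paraphrased from hw/virtio/virtio.c, exact form may differ):

  static void virtio_queue_host_notifier_aio_read(EventNotifier *n)
  {
      VirtQueue *vq = container_of(n, VirtQueue, host_notifier);

      if (event_notifier_test_and_clear(n)) {
          virtio_queue_notify_aio_vq(vq);
      }
  }

i.e. a notifier that is still pending at this point gets processed anyway.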

This was a problem for us, because (due to a bug) the shared pages that
constitute the virtio ring weren't shared any more. And thus we got
the infamous  
virtio_error(vdev, "virtio: zero sized buffers are not allowed").

Now the bug is no more, and we can tolerate that somewhat late access
to the virtio ring.

But it keeps nagging me: is it really OK for the device to access the
virtio ring during reset? My intuition tells me that the device should
not look for new requests after it has been told to reset.

Opinions? (Michael, Connie)

Regards,
Halil

> This patch works around the issue by introducing virtio_set_disabled(),
> which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
> when bus-mastering is disabled. Since we'd check this flag at all the
> same sites as vdev->broken, we replace those checks with an inline
> function which checks for either vdev->broken or vdev->disabled.
> 
> The 'disabled' flag is only migrated when set, which should be fairly
> rare, but to maintain migration compatibility we disable it's use for
> older machine types. Users requiring the use of the flag in conjunction
> with older machine types can set it explicitly as a virtio-device
> option.
> 


Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Michael S. Tsirkin 4 years, 4 months ago
On Thu, Nov 28, 2019 at 05:48:00PM +0100, Halil Pasic wrote:
> On Tue, 19 Nov 2019 18:50:03 -0600
> Michael Roth <mdroth@linux.vnet.ibm.com> wrote:
> 
> [..]
> > I.e. the calling code is only scheduling a one-shot BH for
> > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> > an additional virtqueue entry before we get there. This is likely due
> > to the following check in virtio_queue_host_notifier_aio_poll:
> > 
> >   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
> >   {
> >       EventNotifier *n = opaque;
> >       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
> >       bool progress;
> > 
> >       if (!vq->vring.desc || virtio_queue_empty(vq)) {
> >           return false;
> >       }
> > 
> >       progress = virtio_queue_notify_aio_vq(vq);
> > 
> > namely the call to virtio_queue_empty(). In this case, since no new
> > requests have actually been issued, shadow_avail_idx == last_avail_idx,
> > so we actually try to access the vring via vring_avail_idx() to get
> > the latest non-shadowed idx:
> > 
> >   int virtio_queue_empty(VirtQueue *vq)
> >   {
> >       bool empty;
> >       ...
> > 
> >       if (vq->shadow_avail_idx != vq->last_avail_idx) {
> >           return 0;
> >       }
> > 
> >       rcu_read_lock();
> >       empty = vring_avail_idx(vq) == vq->last_avail_idx;
> >       rcu_read_unlock();
> >       return empty;
> > 
> > but since the IOMMU region has been disabled we get a bogus value (0
> > usually), which causes virtio_queue_empty() to falsely report that
> > there are entries to be processed, which causes errors such as:
> > 
> >   "virtio: zero sized buffers are not allowed"
> > 
> > or
> > 
> >   "virtio-blk missing headers"
> > 
> > and puts the device in an error state.
> > 
> 
> I've seen something similar on s390x with virtio-ccw-blk under
> protected virtualization, that made me wonder about how virtio-blk in
> particular but also virtio in general handles shutdown and reset.
> 
> This makes me wonder if bus-mastering disabled is the only scenario when
> a something like vdev->disabled should be used.
> 
> In particular I have the following mechanism in mind 
> 
> qemu_system_reset() --> ... --> qemu_devices_reset() --> ... --> 
> --> virtio_[transport]_reset() --> ... --> virtio_bus_stop_ioeventfd()
> --> virtio_blk_data_plane_stop()
> 
> which in turn triggesrs the following cascade:
> virtio_blk_data_plane_stop_bh --> virtio_queue_aio_set_host_notifier_handler() -->
> --> virtio_queue_host_notifier_aio_read() 
> which however calls 
> virtio_queue_notify_aio_vq() if the notifier tests as
> positive. 
> 
> Since we still have vq->handle_aio_output that means we may
> call virtqueue_pop() during the reset procedure.
> 
> This was a problem for us, because (due to a bug) the shared pages that
> constitute the virtio ring weren't shared any more. And thus we got
> the infamous  
> virtio_error(vdev, "virtio: zero sized buffers are not allowed").
> 
> Now the bug is no more, and we can tolerate that somewhat late access
> to the virtio ring.
> 
> But it keeps nagging me, is it really OK for the device to access the
> virtio ring during reset? My intuition tells me that the device should
> not look for new requests after it has been told to reset.


Well it's after it was told to reset but it's not after
it completed reset. So I think it's fine ...

> Opinions? (Michael, Connie)
> 
> Regards,
> Halil
> 
> > This patch works around the issue by introducing virtio_set_disabled(),
> > which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
> > when bus-mastering is disabled. Since we'd check this flag at all the
> > same sites as vdev->broken, we replace those checks with an inline
> > function which checks for either vdev->broken or vdev->disabled.
> > 
> > The 'disabled' flag is only migrated when set, which should be fairly
> > rare, but to maintain migration compatibility we disable it's use for
> > older machine types. Users requiring the use of the flag in conjunction
> > with older machine types can set it explicitly as a virtio-device
> > option.
> > 


Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Halil Pasic 4 years, 3 months ago
On Thu, 28 Nov 2019 12:03:01 -0500
"Michael S. Tsirkin" <mst@redhat.com> wrote:

[..]

> > 
> > But it keeps nagging me, is it really OK for the device to access the
> > virtio ring during reset? My intuition tells me that the device should
> > not look for new requests after it has been told to reset.
> 
> 
> Well it's after it was told to reset but it's not after
> it completed reset. So I think it's fine ...

Thanks Michael! I agree and we are covered by the specification. Namely
3.3.1 Driver Requirements: Device Cleanup says "Thus a driver MUST
ensure a virtqueue isn’t live (by device reset) before removing exposed
buffers.". Draining the available buffers from the queue is not wrong -- 
although possibly unnecessary.

So I guess for externally initiated resets (ones not initiated by the
driver) we just have to make sure that the virtio structures are intact
until the virtio device is reset.

Regards,
Halil 

[..]


Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Alexey Kardashevskiy 4 years, 3 months ago
Hi,

I was wondering if this is going anywhere, or if SLOF is still expected
to get fixed; and if it is SLOF, what exactly in SLOF's behaviour is
incorrect and requires fixing? I am a bit lost here. Thanks,




On 20/11/2019 11:50, Michael Roth wrote:
> Currently the SLOF firmware for pseries guests will disable/re-enable
> a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> register after the initial probe/feature negotiation, as it tends to
> work with a single device at a time at various stages like probing
> and running block/network bootloaders without doing a full reset
> in-between.
> 
> In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> corresponding IOMMU memory region, so DMA accesses (including to vring
> fields like idx/flags) will no longer undergo the necessary
> translation. Normally we wouldn't expect this to happen since it would
> be misbehavior on the driver side to continue driving DMA requests.
> 
> However, in the case of pseries, with iommu_platform=on, we trigger the
> following sequence when tearing down the virtio-blk dataplane ioeventfd
> in response to the guest unsetting PCI_COMMAND_MASTER:
> 
>   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
>   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
>   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
>   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
>   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
>   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
>   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
>   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
>   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
>   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
>   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
>       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
>   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
>   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
>   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
>   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
>   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> 
> I.e. the calling code is only scheduling a one-shot BH for
> virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> an additional virtqueue entry before we get there. This is likely due
> to the following check in virtio_queue_host_notifier_aio_poll:
> 
>   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
>   {
>       EventNotifier *n = opaque;
>       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
>       bool progress;
> 
>       if (!vq->vring.desc || virtio_queue_empty(vq)) {
>           return false;
>       }
> 
>       progress = virtio_queue_notify_aio_vq(vq);
> 
> namely the call to virtio_queue_empty(). In this case, since no new
> requests have actually been issued, shadow_avail_idx == last_avail_idx,
> so we actually try to access the vring via vring_avail_idx() to get
> the latest non-shadowed idx:
> 
>   int virtio_queue_empty(VirtQueue *vq)
>   {
>       bool empty;
>       ...
> 
>       if (vq->shadow_avail_idx != vq->last_avail_idx) {
>           return 0;
>       }
> 
>       rcu_read_lock();
>       empty = vring_avail_idx(vq) == vq->last_avail_idx;
>       rcu_read_unlock();
>       return empty;
> 
> but since the IOMMU region has been disabled we get a bogus value (0
> usually), which causes virtio_queue_empty() to falsely report that
> there are entries to be processed, which causes errors such as:
> 
>   "virtio: zero sized buffers are not allowed"
> 
> or
> 
>   "virtio-blk missing headers"
> 
> and puts the device in an error state.
> 
> This patch works around the issue by introducing virtio_set_disabled(),
> which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
> when bus-mastering is disabled. Since we'd check this flag at all the
> same sites as vdev->broken, we replace those checks with an inline
> function which checks for either vdev->broken or vdev->disabled.
> 
> The 'disabled' flag is only migrated when set, which should be fairly
> rare, but to maintain migration compatibility we disable it's use for
> older machine types. Users requiring the use of the flag in conjunction
> with older machine types can set it explicitly as a virtio-device
> option.
> 
> NOTES:
> 
>  - This leaves some other oddities in play, like the fact that
>    DRIVER_OK also gets unset in response to bus-mastering being
>    disabled, but not restored (however the device seems to continue
>    working)
>  - Similarly, we disable the host notifier via
>    virtio_bus_stop_ioeventfd(), which seems to move the handling out
>    of virtio-blk dataplane and back into the main IO thread, and it
>    ends up staying there till a reset (but otherwise continues working
>    normally)
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>,
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
> v2:
>  - add migration support and only default to using 'disabled' flag
>    for newer machines via virtio-device compat option (MST)
>  - use inline functions to decouple checks from specific fields (MST)
>  - rebased on master
> ---
>  hw/core/machine.c          |  1 +
>  hw/virtio/virtio-pci.c     | 12 ++++++++----
>  hw/virtio/virtio.c         | 35 ++++++++++++++++++++++++++++-------
>  include/hw/virtio/virtio.h | 15 +++++++++++++++
>  4 files changed, 52 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 1689ad3bf8..9f3073b23b 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -29,6 +29,7 @@
>  
>  GlobalProperty hw_compat_4_1[] = {
>      { "virtio-pci", "x-pcie-flr-init", "off" },
> +    { "virtio-device", "use-disabled-flag", "false" },
>  };
>  const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1);
>  
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index c6b47a9c73..394d409fb9 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>          pcie_cap_flr_write_config(pci_dev, address, val, len);
>      }
>  
> -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> -        virtio_pci_stop_ioeventfd(proxy);
> -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> +            virtio_set_disabled(vdev, true);
> +            virtio_pci_stop_ioeventfd(proxy);
> +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +        } else {
> +            virtio_set_disabled(vdev, false);
> +        }
>      }
>  
>      if (proxy->config_cap &&
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 04716b5f6c..3cb603a466 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -546,7 +546,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter)
>   * Called within rcu_read_lock().  */
>  static int virtio_queue_empty_rcu(VirtQueue *vq)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 1;
>      }
>  
> @@ -565,7 +565,7 @@ static int virtio_queue_split_empty(VirtQueue *vq)
>  {
>      bool empty;
>  
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 1;
>      }
>  
> @@ -783,7 +783,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
>  
>      virtqueue_unmap_sg(vq, elem, len);
>  
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return;
>      }
>  
> @@ -839,7 +839,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
>  
>  void virtqueue_flush(VirtQueue *vq, unsigned int count)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          vq->inuse -= count;
>          return;
>      }
> @@ -1602,7 +1602,7 @@ err_undo_map:
>  
>  void *virtqueue_pop(VirtQueue *vq, size_t sz)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return NULL;
>      }
>  
> @@ -1698,7 +1698,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq)
>  {
>      struct VirtIODevice *vdev = vq->vdev;
>  
> -    if (unlikely(vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 0;
>      }
>  
> @@ -1816,7 +1816,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
>      BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
>      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>  
> -    if (unlikely(vdev->broken)) {
> +    if (virtio_device_disabled(vdev)) {
>          return;
>      }
>  
> @@ -1920,6 +1920,7 @@ void virtio_reset(void *opaque)
>      vdev->guest_features = 0;
>      vdev->queue_sel = 0;
>      vdev->status = 0;
> +    vdev->disabled = false;
>      atomic_set(&vdev->isr, 0);
>      vdev->config_vector = VIRTIO_NO_VECTOR;
>      virtio_notify_vector(vdev, vdev->config_vector);
> @@ -2553,6 +2554,13 @@ static bool virtio_started_needed(void *opaque)
>      return vdev->started;
>  }
>  
> +static bool virtio_disabled_needed(void *opaque)
> +{
> +    VirtIODevice *vdev = opaque;
> +
> +    return vdev->disabled;
> +}
> +
>  static const VMStateDescription vmstate_virtqueue = {
>      .name = "virtqueue_state",
>      .version_id = 1,
> @@ -2718,6 +2726,17 @@ static const VMStateDescription vmstate_virtio_started = {
>      }
>  };
>  
> +static const VMStateDescription vmstate_virtio_disabled = {
> +    .name = "virtio/disabled",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = &virtio_disabled_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(disabled, VirtIODevice),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_virtio = {
>      .name = "virtio",
>      .version_id = 1,
> @@ -2735,6 +2754,7 @@ static const VMStateDescription vmstate_virtio = {
>          &vmstate_virtio_extra_state,
>          &vmstate_virtio_started,
>          &vmstate_virtio_packed_virtqueues,
> +        &vmstate_virtio_disabled,
>          NULL
>      }
>  };
> @@ -3569,6 +3589,7 @@ static void virtio_device_instance_finalize(Object *obj)
>  static Property virtio_properties[] = {
>      DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
>      DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
> +    DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index c32a815303..f23d2efbc0 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -100,6 +100,8 @@ struct VirtIODevice
>      uint16_t device_id;
>      bool vm_running;
>      bool broken; /* device in invalid state, needs reset */
> +    bool use_disabled_flag; /* allow use of 'disable' flag when needed */
> +    bool disabled; /* device in temporarily disabled state */
>      bool use_started;
>      bool started;
>      bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
> @@ -378,4 +380,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started)
>          vdev->started = started;
>      }
>  }
> +
> +static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> +{
> +    if (vdev->use_disabled_flag) {
> +        vdev->disabled = disable;
> +    }
> +}
> +
> +static inline bool virtio_device_disabled(VirtIODevice *vdev)
> +{
> +    return unlikely(vdev->disabled || vdev->broken);
> +}
> +
>  #endif
> 

-- 
Alexey

Re: [PATCH v2] virtio-pci: disable vring processing when bus-mastering is disabled
Posted by Michael S. Tsirkin 4 years, 3 months ago
On Tue, Nov 19, 2019 at 06:50:03PM -0600, Michael Roth wrote:
> Currently the SLOF firmware for pseries guests will disable/re-enable
> a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND
> register after the initial probe/feature negotiation, as it tends to
> work with a single device at a time at various stages like probing
> and running block/network bootloaders without doing a full reset
> in-between.
> 
> In QEMU, when PCI_COMMAND_MASTER is disabled we disable the
> corresponding IOMMU memory region, so DMA accesses (including to vring
> fields like idx/flags) will no longer undergo the necessary
> translation. Normally we wouldn't expect this to happen since it would
> be misbehavior on the driver side to continue driving DMA requests.
> 
> However, in the case of pseries, with iommu_platform=on, we trigger the
> following sequence when tearing down the virtio-blk dataplane ioeventfd
> in response to the guest unsetting PCI_COMMAND_MASTER:
> 
>   #2  0x0000555555922651 in virtqueue_map_desc (vdev=vdev@entry=0x555556dbcfb0, p_num_sg=p_num_sg@entry=0x7fffe657e1a8, addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, pa=0, sz=0)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757
>   #3  0x0000555555922a89 in virtqueue_pop (vq=vq@entry=0x555556dc8660, sz=sz@entry=184)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950
>   #4  0x00005555558d3eca in virtio_blk_get_request (vq=0x555556dc8660, s=0x555556dbcfb0)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255
>   #5  0x00005555558d3eca in virtio_blk_handle_vq (s=0x555556dbcfb0, vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776
>   #6  0x000055555591dd66 in virtio_queue_notify_aio_vq (vq=vq@entry=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550
>   #7  0x000055555591ecef in virtio_queue_notify_aio_vq (vq=0x555556dc8660)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546
>   #8  0x000055555591ecef in virtio_queue_host_notifier_aio_poll (opaque=0x555556dc86c8)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527
>   #9  0x0000555555d02164 in run_poll_handlers_once (ctx=ctx@entry=0x55555688bfc0, timeout=timeout@entry=0x7fffe65844a8)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:520
>   #10 0x0000555555d02d1b in try_poll_mode (timeout=0x7fffe65844a8, ctx=0x55555688bfc0)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:607
>   #11 0x0000555555d02d1b in aio_poll (ctx=ctx@entry=0x55555688bfc0, blocking=blocking@entry=true)
>       at /home/mdroth/w/qemu.git/util/aio-posix.c:639
>   #12 0x0000555555d0004d in aio_wait_bh_oneshot (ctx=0x55555688bfc0, cb=cb@entry=0x5555558d5130 <virtio_blk_data_plane_stop_bh>, opaque=opaque@entry=0x555556de86f0)
>       at /home/mdroth/w/qemu.git/util/aio-wait.c:71
>   #13 0x00005555558d59bf in virtio_blk_data_plane_stop (vdev=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288
>   #14 0x0000555555b906a1 in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245
>   #15 0x0000555555b90dbb in virtio_bus_stop_ioeventfd (bus=bus@entry=0x555556dbcf38)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237
>   #16 0x0000555555b92a8e in virtio_pci_stop_ioeventfd (proxy=0x555556db4e40)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292
>   #17 0x0000555555b92a8e in virtio_write_config (pci_dev=0x555556db4e40, address=<optimized out>, val=1048832, len=<optimized out>)
>       at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613
> 
> I.e. the calling code is only scheduling a one-shot BH for
> virtio_blk_data_plane_stop_bh, but somehow we end up trying to process
> an additional virtqueue entry before we get there. This is likely due
> to the following check in virtio_queue_host_notifier_aio_poll:
> 
>   static bool virtio_queue_host_notifier_aio_poll(void *opaque)
>   {
>       EventNotifier *n = opaque;
>       VirtQueue *vq = container_of(n, VirtQueue, host_notifier);
>       bool progress;
> 
>       if (!vq->vring.desc || virtio_queue_empty(vq)) {
>           return false;
>       }
> 
>       progress = virtio_queue_notify_aio_vq(vq);
> 
> namely the call to virtio_queue_empty(). In this case, since no new
> requests have actually been issued, shadow_avail_idx == last_avail_idx,
> so we actually try to access the vring via vring_avail_idx() to get
> the latest non-shadowed idx:
> 
>   int virtio_queue_empty(VirtQueue *vq)
>   {
>       bool empty;
>       ...
> 
>       if (vq->shadow_avail_idx != vq->last_avail_idx) {
>           return 0;
>       }
> 
>       rcu_read_lock();
>       empty = vring_avail_idx(vq) == vq->last_avail_idx;
>       rcu_read_unlock();
>       return empty;
> 
> but since the IOMMU region has been disabled we get a bogus value (0
> usually), which causes virtio_queue_empty() to falsely report that
> there are entries to be processed, which causes errors such as:
> 
>   "virtio: zero sized buffers are not allowed"
> 
> or
> 
>   "virtio-blk missing headers"
> 
> and puts the device in an error state.
> 
> This patch works around the issue by introducing virtio_set_disabled(),
> which sets a 'disabled' flag to bypass checks like virtio_queue_empty()
> when bus-mastering is disabled. Since we'd check this flag at all the
> same sites as vdev->broken, we replace those checks with an inline
> function which checks for either vdev->broken or vdev->disabled.
> 
> The 'disabled' flag is only migrated when set, which should be fairly
> rare, but to maintain migration compatibility we disable it's use for
> older machine types. Users requiring the use of the flag in conjunction
> with older machine types can set it explicitly as a virtio-device
> option.
> 
> NOTES:
> 
>  - This leaves some other oddities in play, like the fact that
>    DRIVER_OK also gets unset in response to bus-mastering being
>    disabled, but not restored (however the device seems to continue
>    working)
>  - Similarly, we disable the host notifier via
>    virtio_bus_stop_ioeventfd(), which seems to move the handling out
>    of virtio-blk dataplane and back into the main IO thread, and it
>    ends up staying there till a reset (but otherwise continues working
>    normally)
> 
> Cc: David Gibson <david@gibson.dropbear.id.au>,
> Cc: Alexey Kardashevskiy <aik@ozlabs.ru>
> Cc: "Michael S. Tsirkin" <mst@redhat.com>
> Signed-off-by: Michael Roth <mdroth@linux.vnet.ibm.com>
> ---
> v2:
>  - add migration support and only default to using 'disabled' flag
>    for newer machines via virtio-device compat option (MST)
>  - use inline functions to decouple checks from specific fields (MST)
>  - rebased on master

So the only nit is: let's not make this part of the
stable API; prefix the property with "x-".

Otherwise looks good.
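
I.e. keep the internal field as is and only rename the exposed property,
something like (untested sketch on top of this patch):

  GlobalProperty hw_compat_4_1[] = {
      { "virtio-pci", "x-pcie-flr-init", "off" },
      { "virtio-device", "x-use-disabled-flag", "false" },
  };

  ...

      DEFINE_PROP_BOOL("x-use-disabled-flag", VirtIODevice, use_disabled_flag, true),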

> ---
>  hw/core/machine.c          |  1 +
>  hw/virtio/virtio-pci.c     | 12 ++++++++----
>  hw/virtio/virtio.c         | 35 ++++++++++++++++++++++++++++-------
>  include/hw/virtio/virtio.h | 15 +++++++++++++++
>  4 files changed, 52 insertions(+), 11 deletions(-)
> 
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 1689ad3bf8..9f3073b23b 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -29,6 +29,7 @@
>  
>  GlobalProperty hw_compat_4_1[] = {
>      { "virtio-pci", "x-pcie-flr-init", "off" },
> +    { "virtio-device", "use-disabled-flag", "false" },
>  };
>  const size_t hw_compat_4_1_len = G_N_ELEMENTS(hw_compat_4_1);
>  
> diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
> index c6b47a9c73..394d409fb9 100644
> --- a/hw/virtio/virtio-pci.c
> +++ b/hw/virtio/virtio-pci.c
> @@ -608,10 +608,14 @@ static void virtio_write_config(PCIDevice *pci_dev, uint32_t address,
>          pcie_cap_flr_write_config(pci_dev, address, val, len);
>      }
>  
> -    if (range_covers_byte(address, len, PCI_COMMAND) &&
> -        !(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> -        virtio_pci_stop_ioeventfd(proxy);
> -        virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +    if (range_covers_byte(address, len, PCI_COMMAND)) {
> +        if (!(pci_dev->config[PCI_COMMAND] & PCI_COMMAND_MASTER)) {
> +            virtio_set_disabled(vdev, true);
> +            virtio_pci_stop_ioeventfd(proxy);
> +            virtio_set_status(vdev, vdev->status & ~VIRTIO_CONFIG_S_DRIVER_OK);
> +        } else {
> +            virtio_set_disabled(vdev, false);
> +        }
>      }
>  
>      if (proxy->config_cap &&
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 04716b5f6c..3cb603a466 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -546,7 +546,7 @@ static inline bool is_desc_avail(uint16_t flags, bool wrap_counter)
>   * Called within rcu_read_lock().  */
>  static int virtio_queue_empty_rcu(VirtQueue *vq)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 1;
>      }
>  
> @@ -565,7 +565,7 @@ static int virtio_queue_split_empty(VirtQueue *vq)
>  {
>      bool empty;
>  
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 1;
>      }
>  
> @@ -783,7 +783,7 @@ void virtqueue_fill(VirtQueue *vq, const VirtQueueElement *elem,
>  
>      virtqueue_unmap_sg(vq, elem, len);
>  
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return;
>      }
>  
> @@ -839,7 +839,7 @@ static void virtqueue_packed_flush(VirtQueue *vq, unsigned int count)
>  
>  void virtqueue_flush(VirtQueue *vq, unsigned int count)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          vq->inuse -= count;
>          return;
>      }
> @@ -1602,7 +1602,7 @@ err_undo_map:
>  
>  void *virtqueue_pop(VirtQueue *vq, size_t sz)
>  {
> -    if (unlikely(vq->vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return NULL;
>      }
>  
> @@ -1698,7 +1698,7 @@ unsigned int virtqueue_drop_all(VirtQueue *vq)
>  {
>      struct VirtIODevice *vdev = vq->vdev;
>  
> -    if (unlikely(vdev->broken)) {
> +    if (virtio_device_disabled(vq->vdev)) {
>          return 0;
>      }
>  
> @@ -1816,7 +1816,7 @@ static void virtio_notify_vector(VirtIODevice *vdev, uint16_t vector)
>      BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
>      VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
>  
> -    if (unlikely(vdev->broken)) {
> +    if (virtio_device_disabled(vdev)) {
>          return;
>      }
>  
> @@ -1920,6 +1920,7 @@ void virtio_reset(void *opaque)
>      vdev->guest_features = 0;
>      vdev->queue_sel = 0;
>      vdev->status = 0;
> +    vdev->disabled = false;
>      atomic_set(&vdev->isr, 0);
>      vdev->config_vector = VIRTIO_NO_VECTOR;
>      virtio_notify_vector(vdev, vdev->config_vector);
> @@ -2553,6 +2554,13 @@ static bool virtio_started_needed(void *opaque)
>      return vdev->started;
>  }
>  
> +static bool virtio_disabled_needed(void *opaque)
> +{
> +    VirtIODevice *vdev = opaque;
> +
> +    return vdev->disabled;
> +}
> +
>  static const VMStateDescription vmstate_virtqueue = {
>      .name = "virtqueue_state",
>      .version_id = 1,
> @@ -2718,6 +2726,17 @@ static const VMStateDescription vmstate_virtio_started = {
>      }
>  };
>  
> +static const VMStateDescription vmstate_virtio_disabled = {
> +    .name = "virtio/disabled",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = &virtio_disabled_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_BOOL(disabled, VirtIODevice),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static const VMStateDescription vmstate_virtio = {
>      .name = "virtio",
>      .version_id = 1,
> @@ -2735,6 +2754,7 @@ static const VMStateDescription vmstate_virtio = {
>          &vmstate_virtio_extra_state,
>          &vmstate_virtio_started,
>          &vmstate_virtio_packed_virtqueues,
> +        &vmstate_virtio_disabled,
>          NULL
>      }
>  };
> @@ -3569,6 +3589,7 @@ static void virtio_device_instance_finalize(Object *obj)
>  static Property virtio_properties[] = {
>      DEFINE_VIRTIO_COMMON_FEATURES(VirtIODevice, host_features),
>      DEFINE_PROP_BOOL("use-started", VirtIODevice, use_started, true),
> +    DEFINE_PROP_BOOL("use-disabled-flag", VirtIODevice, use_disabled_flag, true),
>      DEFINE_PROP_END_OF_LIST(),
>  };
>  
> diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
> index c32a815303..f23d2efbc0 100644
> --- a/include/hw/virtio/virtio.h
> +++ b/include/hw/virtio/virtio.h
> @@ -100,6 +100,8 @@ struct VirtIODevice
>      uint16_t device_id;
>      bool vm_running;
>      bool broken; /* device in invalid state, needs reset */
> +    bool use_disabled_flag; /* allow use of 'disable' flag when needed */
> +    bool disabled; /* device in temporarily disabled state */
>      bool use_started;
>      bool started;
>      bool start_on_kick; /* when virtio 1.0 feature has not been negotiated */
> @@ -378,4 +380,17 @@ static inline void virtio_set_started(VirtIODevice *vdev, bool started)
>          vdev->started = started;
>      }
>  }
> +
> +static inline void virtio_set_disabled(VirtIODevice *vdev, bool disable)
> +{
> +    if (vdev->use_disabled_flag) {
> +        vdev->disabled = disable;
> +    }
> +}
> +
> +static inline bool virtio_device_disabled(VirtIODevice *vdev)
> +{
> +    return unlikely(vdev->disabled || vdev->broken);
> +}
> +
>  #endif
> -- 
> 2.17.1