1 | The following changes since commit 6157b0e19721aadb4c7fdcfe57b2924af6144b14: | 1 | The following changes since commit d9ccf33f9479201e5add8db0af68ca9ca8da358b: |
---|---|---|---|
2 | 2 | ||
3 | Merge remote-tracking branch 'remotes/vivier2/tags/linux-user-for-6.0-pull-request' into staging (2021-03-14 17:47:49 +0000) | 3 | Merge remote-tracking branch 'remotes/lvivier-gitlab/tags/linux-user-for-7.0-pull-request' into staging (2022-03-09 20:01:17 +0000) |
4 | 4 | ||
5 | are available in the git repository at: | 5 | are available in the git repository at: |
6 | 6 | ||
7 | https://github.com/jasowang/qemu.git tags/net-pull-request | 7 | https://github.com/jasowang/qemu.git tags/net-pull-request |
8 | 8 | ||
9 | for you to fetch changes up to f2e8319d456724c3d8514d943dc4607e2f08e88a: | 9 | for you to fetch changes up to eea40402ecf895ed345f8e8eb07dbb484f4542c5: |
10 | 10 | ||
11 | net: Do not fill legacy info_str for backends (2021-03-15 16:41:22 +0800) | 11 | vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-10 10:26:32 +0800) |
12 | 12 | ||
13 | ---------------------------------------------------------------- | 13 | ---------------------------------------------------------------- |
14 | 14 | ||
15 | Changes since V1: | 15 | ---------------------------------------------------------------- |
16 | - drop the workaound of "-nic" id and fix the merge | 16 | Eugenio Pérez (14): |
17 | - add the series of query-netdev | 17 | vhost: Add VhostShadowVirtqueue |
18 | vhost: Add Shadow VirtQueue kick forwarding capabilities | ||
19 | vhost: Add Shadow VirtQueue call forwarding capabilities | ||
20 | vhost: Add vhost_svq_valid_features to shadow vq | ||
21 | virtio: Add vhost_svq_get_vring_addr | ||
22 | vdpa: adapt vhost_ops callbacks to svq | ||
23 | vhost: Shadow virtqueue buffers forwarding | ||
24 | util: Add iova_tree_alloc_map | ||
25 | util: add iova_tree_find_iova | ||
26 | vhost: Add VhostIOVATree | ||
27 | vdpa: Add custom IOTLB translations to SVQ | ||
28 | vdpa: Adapt vhost_vdpa_get_vring_base to SVQ | ||
29 | vdpa: Never set log_base addr if SVQ is enabled | ||
30 | vdpa: Expose VHOST_F_LOG_ALL on SVQ | ||
18 | 31 | ||
19 | ---------------------------------------------------------------- | 32 | Jason Wang (1): |
20 | Alexander Bulekov (4): | 33 | virtio-net: fix map leaking on error during receive |
21 | rtl8139: switch to use qemu_receive_packet() for loopback | ||
22 | pcnet: switch to use qemu_receive_packet() for loopback | ||
23 | cadence_gem: switch to use qemu_receive_packet() for loopback | ||
24 | lan9118: switch to use qemu_receive_packet() for loopback | ||
25 | 34 | ||
26 | Alexey Kirillov (5): | 35 | hw/net/virtio-net.c | 1 + |
27 | qapi: net: Add query-netdev command | 36 | hw/virtio/meson.build | 2 +- |
28 | tests: Add tests for query-netdev command | 37 | hw/virtio/vhost-iova-tree.c | 110 +++++++ |
29 | net: Move NetClientState.info_str to dynamic allocations | 38 | hw/virtio/vhost-iova-tree.h | 27 ++ |
30 | hmp: Use QAPI NetdevInfo in hmp_info_network | 39 | hw/virtio/vhost-shadow-virtqueue.c | 638 +++++++++++++++++++++++++++++++++++++ |
31 | net: Do not fill legacy info_str for backends | 40 | hw/virtio/vhost-shadow-virtqueue.h | 87 +++++ |
32 | 41 | hw/virtio/vhost-vdpa.c | 525 +++++++++++++++++++++++++++++- | |
33 | Bin Meng (1): | 42 | include/hw/virtio/vhost-vdpa.h | 8 + |
34 | net: Fix build error when DEBUG_NET is on | 43 | include/qemu/iova-tree.h | 38 ++- |
35 | 44 | util/iova-tree.c | 169 ++++++++++ | |
36 | Cornelia Huck (1): | 45 | 10 files changed, 1588 insertions(+), 17 deletions(-) |
37 | pvrdma: wean code off pvrdma_ring.h kernel header | 46 | create mode 100644 hw/virtio/vhost-iova-tree.c |
38 | 47 | create mode 100644 hw/virtio/vhost-iova-tree.h | |
39 | Jason Wang (8): | 48 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c |
40 | virtio-net: calculating proper msix vectors on init | 49 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h |
41 | e1000: fail early for evil descriptor | ||
42 | net: introduce qemu_receive_packet() | ||
43 | e1000: switch to use qemu_receive_packet() for loopback | ||
44 | dp8393x: switch to use qemu_receive_packet() for loopback packet | ||
45 | msf2-mac: switch to use qemu_receive_packet() for loopback | ||
46 | sungem: switch to use qemu_receive_packet() for loopback | ||
47 | tx_pkt: switch to use qemu_receive_packet_iov() for loopback | ||
48 | |||
49 | Paolo Bonzini (1): | ||
50 | net: validate that ids are well formed | ||
51 | |||
52 | hw/core/machine.c | 1 + | ||
53 | hw/net/cadence_gem.c | 4 +- | ||
54 | hw/net/dp8393x.c | 2 +- | ||
55 | hw/net/e1000.c | 6 +- | ||
56 | hw/net/lan9118.c | 2 +- | ||
57 | hw/net/msf2-emac.c | 2 +- | ||
58 | hw/net/net_tx_pkt.c | 2 +- | ||
59 | hw/net/pcnet.c | 2 +- | ||
60 | hw/net/rtl8139.c | 2 +- | ||
61 | hw/net/sungem.c | 2 +- | ||
62 | hw/net/xen_nic.c | 5 +- | ||
63 | hw/rdma/vmw/pvrdma.h | 5 +- | ||
64 | hw/rdma/vmw/pvrdma_cmd.c | 6 +- | ||
65 | hw/rdma/vmw/pvrdma_dev_ring.c | 41 +++-- | ||
66 | hw/rdma/vmw/pvrdma_dev_ring.h | 9 +- | ||
67 | hw/rdma/vmw/pvrdma_main.c | 4 +- | ||
68 | hw/virtio/virtio-net-pci.c | 10 +- | ||
69 | include/net/net.h | 10 +- | ||
70 | include/net/queue.h | 8 + | ||
71 | include/qapi/hmp-output-visitor.h | 30 ++++ | ||
72 | .../drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | 114 ------------ | ||
73 | net/l2tpv3.c | 8 +- | ||
74 | net/net.c | 117 +++++++++++-- | ||
75 | net/netmap.c | 7 + | ||
76 | net/queue.c | 22 +++ | ||
77 | net/slirp.c | 124 ++++++++++++- | ||
78 | net/socket.c | 92 +++++++--- | ||
79 | net/tap-win32.c | 10 +- | ||
80 | net/tap.c | 107 ++++++++++-- | ||
81 | net/vde.c | 25 ++- | ||
82 | net/vhost-user.c | 20 ++- | ||
83 | net/vhost-vdpa.c | 15 +- | ||
84 | qapi/hmp-output-visitor.c | 193 +++++++++++++++++++++ | ||
85 | qapi/meson.build | 1 + | ||
86 | qapi/net.json | 80 +++++++++ | ||
87 | scripts/update-linux-headers.sh | 3 +- | ||
88 | tests/qtest/meson.build | 3 + | ||
89 | tests/qtest/test-query-netdev.c | 120 +++++++++++++ | ||
90 | 38 files changed, 990 insertions(+), 224 deletions(-) | ||
91 | create mode 100644 include/qapi/hmp-output-visitor.h | ||
92 | delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | ||
93 | create mode 100644 qapi/hmp-output-visitor.c | ||
94 | create mode 100644 tests/qtest/test-query-netdev.c | ||
95 | 50 | ||
96 | 51 | ||
97 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Currently, the default msix vectors for virtio-net-pci is 3 which is | ||
2 | obvious not suitable for multiqueue guest, so we depends on the user | ||
3 | or management tools to pass a correct vectors parameter. In fact, we | ||
4 | can simplifying this by calculating the number of vectors on realize. | ||
5 | 1 | ||
6 | Consider we have N queues, the number of vectors needed is 2*N + 2 | ||
7 | (#queue pairs + plus one config interrupt and control vq). We didn't | ||
8 | check whether or not host support control vq because it was added | ||
9 | unconditionally by qemu to avoid breaking legacy guests such as Minix. | ||
10 | |||
11 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com | ||
12 | Reviewed-by: Stefano Garzarella <sgarzare@redhat.com> | ||
13 | Reviewed-by: Stefan Hajnoczi <stefanha@redhat.com> | ||
14 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
15 | --- | ||
16 | hw/core/machine.c | 1 + | ||
17 | hw/virtio/virtio-net-pci.c | 10 +++++++++- | ||
18 | 2 files changed, 10 insertions(+), 1 deletion(-) | ||
19 | |||
20 | diff --git a/hw/core/machine.c b/hw/core/machine.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/hw/core/machine.c | ||
23 | +++ b/hw/core/machine.c | ||
24 | @@ -XXX,XX +XXX,XX @@ | ||
25 | GlobalProperty hw_compat_5_2[] = { | ||
26 | { "ICH9-LPC", "smm-compat", "on"}, | ||
27 | { "PIIX4_PM", "smm-compat", "on"}, | ||
28 | + { "virtio-net-pci", "vectors", "3"}, | ||
29 | }; | ||
30 | const size_t hw_compat_5_2_len = G_N_ELEMENTS(hw_compat_5_2); | ||
31 | |||
32 | diff --git a/hw/virtio/virtio-net-pci.c b/hw/virtio/virtio-net-pci.c | ||
33 | index XXXXXXX..XXXXXXX 100644 | ||
34 | --- a/hw/virtio/virtio-net-pci.c | ||
35 | +++ b/hw/virtio/virtio-net-pci.c | ||
36 | @@ -XXX,XX +XXX,XX @@ struct VirtIONetPCI { | ||
37 | static Property virtio_net_properties[] = { | ||
38 | DEFINE_PROP_BIT("ioeventfd", VirtIOPCIProxy, flags, | ||
39 | VIRTIO_PCI_FLAG_USE_IOEVENTFD_BIT, true), | ||
40 | - DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, 3), | ||
41 | + DEFINE_PROP_UINT32("vectors", VirtIOPCIProxy, nvectors, | ||
42 | + DEV_NVECTORS_UNSPECIFIED), | ||
43 | DEFINE_PROP_END_OF_LIST(), | ||
44 | }; | ||
45 | |||
46 | @@ -XXX,XX +XXX,XX @@ static void virtio_net_pci_realize(VirtIOPCIProxy *vpci_dev, Error **errp) | ||
47 | DeviceState *qdev = DEVICE(vpci_dev); | ||
48 | VirtIONetPCI *dev = VIRTIO_NET_PCI(vpci_dev); | ||
49 | DeviceState *vdev = DEVICE(&dev->vdev); | ||
50 | + VirtIONet *net = VIRTIO_NET(vdev); | ||
51 | + | ||
52 | + if (vpci_dev->nvectors == DEV_NVECTORS_UNSPECIFIED) { | ||
53 | + vpci_dev->nvectors = 2 * MAX(net->nic_conf.peers.queues, 1) | ||
54 | + + 1 /* Config interrupt */ | ||
55 | + + 1 /* Control vq */; | ||
56 | + } | ||
57 | |||
58 | virtio_net_set_netclient_name(&dev->vdev, qdev->id, | ||
59 | object_get_typename(OBJECT(qdev))); | ||
60 | -- | ||
61 | 2.7.4 | ||
62 | |||
63 | diff view generated by jsdifflib |
1 | From: Bin Meng <bin.meng@windriver.com> | 1 | Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") |
---|---|---|---|
2 | tries to fix the use after free of the sg by caching the virtqueue | ||
3 | elements in an array and unmap them at once after receiving the | ||
4 | packets, But it forgot to unmap the cached elements on error which | ||
5 | will lead to leaking of mapping and other unexpected results. | ||
2 | 6 | ||
3 | "qemu-common.h" should be included to provide the forward declaration | 7 | Fixing this by detaching the cached elements on error. This addresses |
4 | of qemu_hexdump() when DEBUG_NET is on. | 8 | CVE-2022-26353. |
5 | 9 | ||
6 | Signed-off-by: Bin Meng <bin.meng@windriver.com> | 10 | Reported-by: Victor Tom <vv474172261@gmail.com> |
7 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 11 | Cc: qemu-stable@nongnu.org |
12 | Fixes: CVE-2022-26353 | ||
13 | Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg") | ||
14 | Reviewed-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 16 | --- |
10 | net/net.c | 1 + | 17 | hw/net/virtio-net.c | 1 + |
11 | 1 file changed, 1 insertion(+) | 18 | 1 file changed, 1 insertion(+) |
12 | 19 | ||
13 | diff --git a/net/net.c b/net/net.c | 20 | diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c |
14 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/net/net.c | 22 | --- a/hw/net/virtio-net.c |
16 | +++ b/net/net.c | 23 | +++ b/hw/net/virtio-net.c |
17 | @@ -XXX,XX +XXX,XX @@ | 24 | @@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, |
18 | */ | 25 | |
19 | 26 | err: | |
20 | #include "qemu/osdep.h" | 27 | for (j = 0; j < i; j++) { |
21 | +#include "qemu-common.h" | 28 | + virtqueue_detach_element(q->rx_vq, elems[j], lens[j]); |
22 | 29 | g_free(elems[j]); | |
23 | #include "net/net.h" | 30 | } |
24 | #include "clients.h" | 31 | |
25 | -- | 32 | -- |
26 | 2.7.4 | 33 | 2.7.4 |
27 | |||
28 | diff view generated by jsdifflib |
1 | From: Alexey Kirillov <lekiravi@yandex-team.ru> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Replace usage of legacy field info_str of NetClientState for backend | 3 | Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue |
4 | network devices with QAPI NetdevInfo stored_config that already used | 4 | notifications and buffers, allowing qemu to track them. While qemu is |
5 | in QMP query-netdev. | 5 | forwarding the buffers and virtqueue changes, it is able to commit the |
6 | memory it's being dirtied, the same way regular qemu's VirtIO devices | ||
7 | do. | ||
6 | 8 | ||
7 | This change increases the detail of the "info network" output and takes | 9 | This commit only exposes basic SVQ allocation and free. Next patches of |
8 | a more general approach to composing the output. | 10 | the series add functionality like notifications and buffers forwarding. |
9 | 11 | ||
10 | NIC and hubports still use legacy info_str field. | 12 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | |
12 | Signed-off-by: Alexey Kirillov <lekiravi@yandex-team.ru> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 15 | --- |
15 | include/qapi/hmp-output-visitor.h | 30 ++++++ | 16 | hw/virtio/meson.build | 2 +- |
16 | net/net.c | 31 +++++- | 17 | hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++ |
17 | qapi/hmp-output-visitor.c | 193 ++++++++++++++++++++++++++++++++++++++ | 18 | hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++ |
18 | qapi/meson.build | 1 + | 19 | 3 files changed, 91 insertions(+), 1 deletion(-) |
19 | 4 files changed, 254 insertions(+), 1 deletion(-) | 20 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c |
20 | create mode 100644 include/qapi/hmp-output-visitor.h | 21 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h |
21 | create mode 100644 qapi/hmp-output-visitor.c | ||
22 | 22 | ||
23 | diff --git a/include/qapi/hmp-output-visitor.h b/include/qapi/hmp-output-visitor.h | 23 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
24 | index XXXXXXX..XXXXXXX 100644 | ||
25 | --- a/hw/virtio/meson.build | ||
26 | +++ b/hw/virtio/meson.build | ||
27 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) | ||
28 | |||
29 | virtio_ss = ss.source_set() | ||
30 | virtio_ss.add(files('virtio.c')) | ||
31 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) | ||
32 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) | ||
33 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) | ||
34 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) | ||
35 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) | ||
36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
24 | new file mode 100644 | 37 | new file mode 100644 |
25 | index XXXXXXX..XXXXXXX | 38 | index XXXXXXX..XXXXXXX |
26 | --- /dev/null | 39 | --- /dev/null |
27 | +++ b/include/qapi/hmp-output-visitor.h | 40 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
28 | @@ -XXX,XX +XXX,XX @@ | 41 | @@ -XXX,XX +XXX,XX @@ |
29 | +/* | 42 | +/* |
30 | + * HMP string output Visitor | 43 | + * vhost shadow virtqueue |
31 | + * | 44 | + * |
32 | + * Copyright Yandex N.V., 2021 | 45 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
46 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
33 | + * | 47 | + * |
34 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | 48 | + * SPDX-License-Identifier: GPL-2.0-or-later |
35 | + * See the COPYING file in the top-level directory. | ||
36 | + * | ||
37 | + */ | 49 | + */ |
38 | + | 50 | + |
39 | +#ifndef HMP_OUTPUT_VISITOR_H | 51 | +#include "qemu/osdep.h" |
40 | +#define HMP_OUTPUT_VISITOR_H | 52 | +#include "hw/virtio/vhost-shadow-virtqueue.h" |
41 | + | 53 | + |
42 | +#include "qapi/visitor.h" | 54 | +#include "qemu/error-report.h" |
43 | + | ||
44 | +typedef struct HMPOutputVisitor HMPOutputVisitor; | ||
45 | + | 55 | + |
46 | +/** | 56 | +/** |
47 | + * Create a HMP string output visitor for @obj | 57 | + * Creates vhost shadow virtqueue, and instructs the vhost device to use the |
58 | + * shadow methods and file descriptors. | ||
48 | + * | 59 | + * |
49 | + * Flattens dicts/structures, only shows arrays borders. | 60 | + * Returns the new virtqueue or NULL. |
50 | + * | 61 | + * |
51 | + * Errors are not expected to happen. | 62 | + * In case of error, reason is reported through error_report. |
52 | + * | ||
53 | + * The caller is responsible for freeing the visitor with | ||
54 | + * visit_free(). | ||
55 | + */ | 63 | + */ |
56 | +Visitor *hmp_output_visitor_new(char **result); | 64 | +VhostShadowVirtqueue *vhost_svq_new(void) |
65 | +{ | ||
66 | + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
67 | + int r; | ||
57 | + | 68 | + |
58 | +#endif | 69 | + r = event_notifier_init(&svq->hdev_kick, 0); |
59 | diff --git a/net/net.c b/net/net.c | 70 | + if (r != 0) { |
60 | index XXXXXXX..XXXXXXX 100644 | 71 | + error_report("Couldn't create kick event notifier: %s (%d)", |
61 | --- a/net/net.c | 72 | + g_strerror(errno), errno); |
62 | +++ b/net/net.c | 73 | + goto err_init_hdev_kick; |
63 | @@ -XXX,XX +XXX,XX @@ | ||
64 | #include "sysemu/sysemu.h" | ||
65 | #include "net/filter.h" | ||
66 | #include "qapi/string-output-visitor.h" | ||
67 | +#include "qapi/hmp-output-visitor.h" | ||
68 | |||
69 | /* Net bridge is currently not supported for W32. */ | ||
70 | #if !defined(_WIN32) | ||
71 | @@ -XXX,XX +XXX,XX @@ static void netfilter_print_info(Monitor *mon, NetFilterState *nf) | ||
72 | monitor_printf(mon, "\n"); | ||
73 | } | ||
74 | |||
75 | +static char *generate_info_str(NetClientState *nc) | ||
76 | +{ | ||
77 | + NetdevInfo *ni = nc->stored_config; | ||
78 | + char *ret_out = NULL; | ||
79 | + Visitor *v; | ||
80 | + | ||
81 | + /* Use legacy field info_str for NIC and hubports */ | ||
82 | + if ((nc->info->type == NET_CLIENT_DRIVER_NIC) || | ||
83 | + (nc->info->type == NET_CLIENT_DRIVER_HUBPORT)) { | ||
84 | + return g_strdup(nc->info_str ? nc->info_str : ""); | ||
85 | + } | 74 | + } |
86 | + | 75 | + |
87 | + if (!ni) { | 76 | + r = event_notifier_init(&svq->hdev_call, 0); |
88 | + return g_malloc0(1); | 77 | + if (r != 0) { |
78 | + error_report("Couldn't create call event notifier: %s (%d)", | ||
79 | + g_strerror(errno), errno); | ||
80 | + goto err_init_hdev_call; | ||
89 | + } | 81 | + } |
90 | + | 82 | + |
91 | + v = hmp_output_visitor_new(&ret_out); | 83 | + return g_steal_pointer(&svq); |
92 | + if (visit_type_NetdevInfo(v, "", &ni, NULL)) { | ||
93 | + visit_complete(v, &ret_out); | ||
94 | + } | ||
95 | + visit_free(v); | ||
96 | + | 84 | + |
97 | + return ret_out; | 85 | +err_init_hdev_call: |
86 | + event_notifier_cleanup(&svq->hdev_kick); | ||
87 | + | ||
88 | +err_init_hdev_kick: | ||
89 | + return NULL; | ||
98 | +} | 90 | +} |
99 | + | 91 | + |
100 | void print_net_client(Monitor *mon, NetClientState *nc) | 92 | +/** |
101 | { | 93 | + * Free the resources of the shadow virtqueue. |
102 | NetFilterState *nf; | 94 | + * |
103 | + char *info_str = generate_info_str(nc); | 95 | + * @pvq: gpointer to SVQ so it can be used by autofree functions. |
104 | 96 | + */ | |
105 | monitor_printf(mon, "%s: index=%d,type=%s,%s\n", nc->name, | 97 | +void vhost_svq_free(gpointer pvq) |
106 | nc->queue_index, | 98 | +{ |
107 | NetClientDriver_str(nc->info->type), | 99 | + VhostShadowVirtqueue *vq = pvq; |
108 | - nc->info_str ? nc->info_str : ""); | 100 | + event_notifier_cleanup(&vq->hdev_kick); |
109 | + info_str); | 101 | + event_notifier_cleanup(&vq->hdev_call); |
110 | + g_free(info_str); | 102 | + g_free(vq); |
111 | + | 103 | +} |
112 | if (!QTAILQ_EMPTY(&nc->filters)) { | 104 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
113 | monitor_printf(mon, "filters:\n"); | ||
114 | } | ||
115 | diff --git a/qapi/hmp-output-visitor.c b/qapi/hmp-output-visitor.c | ||
116 | new file mode 100644 | 105 | new file mode 100644 |
117 | index XXXXXXX..XXXXXXX | 106 | index XXXXXXX..XXXXXXX |
118 | --- /dev/null | 107 | --- /dev/null |
119 | +++ b/qapi/hmp-output-visitor.c | 108 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
120 | @@ -XXX,XX +XXX,XX @@ | 109 | @@ -XXX,XX +XXX,XX @@ |
121 | +/* | 110 | +/* |
122 | + * HMP string output Visitor | 111 | + * vhost shadow virtqueue |
123 | + * | 112 | + * |
124 | + * Copyright Yandex N.V., 2021 | 113 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
114 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
125 | + * | 115 | + * |
126 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | 116 | + * SPDX-License-Identifier: GPL-2.0-or-later |
127 | + * See the COPYING file in the top-level directory. | ||
128 | + * | ||
129 | + */ | 117 | + */ |
130 | + | 118 | + |
131 | +#include "qemu/osdep.h" | 119 | +#ifndef VHOST_SHADOW_VIRTQUEUE_H |
132 | +#include "qemu/cutils.h" | 120 | +#define VHOST_SHADOW_VIRTQUEUE_H |
133 | +#include "qapi/hmp-output-visitor.h" | ||
134 | +#include "qapi/visitor-impl.h" | ||
135 | + | 121 | + |
136 | +struct HMPOutputVisitor { | 122 | +#include "qemu/event_notifier.h" |
137 | + Visitor visitor; | ||
138 | + char **result; | ||
139 | + GString *buffer; | ||
140 | + bool is_continue; | ||
141 | +}; | ||
142 | + | 123 | + |
143 | +static HMPOutputVisitor *to_hov(Visitor *v) | 124 | +/* Shadow virtqueue to relay notifications */ |
144 | +{ | 125 | +typedef struct VhostShadowVirtqueue { |
145 | + return container_of(v, HMPOutputVisitor, visitor); | 126 | + /* Shadow kick notifier, sent to vhost */ |
146 | +} | 127 | + EventNotifier hdev_kick; |
128 | + /* Shadow call notifier, sent to vhost */ | ||
129 | + EventNotifier hdev_call; | ||
130 | +} VhostShadowVirtqueue; | ||
147 | + | 131 | + |
148 | +static void hmp_output_append_formatted(Visitor *v, const char *fmt, ...) | 132 | +VhostShadowVirtqueue *vhost_svq_new(void); |
149 | +{ | ||
150 | + HMPOutputVisitor *ov = to_hov(v); | ||
151 | + va_list args; | ||
152 | + | 133 | + |
153 | + if (ov->is_continue) { | 134 | +void vhost_svq_free(gpointer vq); |
154 | + g_string_append(ov->buffer, ","); | 135 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); |
155 | + } else { | ||
156 | + ov->is_continue = true; | ||
157 | + } | ||
158 | + | 136 | + |
159 | + va_start(args, fmt); | 137 | +#endif |
160 | + g_string_append_vprintf(ov->buffer, fmt, args); | ||
161 | + va_end(args); | ||
162 | +} | ||
163 | + | ||
164 | +static void hmp_output_skip_comma(Visitor *v) | ||
165 | +{ | ||
166 | + HMPOutputVisitor *ov = to_hov(v); | ||
167 | + | ||
168 | + ov->is_continue = false; | ||
169 | +} | ||
170 | + | ||
171 | +static bool hmp_output_start_struct(Visitor *v, const char *name, | ||
172 | + void **obj, size_t unused, Error **errp) | ||
173 | +{ | ||
174 | + return true; | ||
175 | +} | ||
176 | + | ||
177 | +static void hmp_output_end_struct(Visitor *v, void **obj) {} | ||
178 | + | ||
179 | +static bool hmp_output_start_list(Visitor *v, const char *name, | ||
180 | + GenericList **listp, size_t size, | ||
181 | + Error **errp) | ||
182 | +{ | ||
183 | + hmp_output_append_formatted(v, "%s=[", name); | ||
184 | + /* First element in array without comma before it */ | ||
185 | + hmp_output_skip_comma(v); | ||
186 | + | ||
187 | + return true; | ||
188 | +} | ||
189 | + | ||
190 | +static GenericList *hmp_output_next_list(Visitor *v, GenericList *tail, | ||
191 | + size_t size) | ||
192 | +{ | ||
193 | + return tail->next; | ||
194 | +} | ||
195 | + | ||
196 | +static void hmp_output_end_list(Visitor *v, void **obj) | ||
197 | +{ | ||
198 | + /* Don't need comma after last array element */ | ||
199 | + hmp_output_skip_comma(v); | ||
200 | + hmp_output_append_formatted(v, "]"); | ||
201 | +} | ||
202 | + | ||
203 | +static bool hmp_output_type_int64(Visitor *v, const char *name, | ||
204 | + int64_t *obj, Error **errp) | ||
205 | +{ | ||
206 | + hmp_output_append_formatted(v, "%s=%" PRId64, name, *obj); | ||
207 | + | ||
208 | + return true; | ||
209 | +} | ||
210 | + | ||
211 | +static bool hmp_output_type_uint64(Visitor *v, const char *name, | ||
212 | + uint64_t *obj, Error **errp) | ||
213 | +{ | ||
214 | + hmp_output_append_formatted(v, "%s=%" PRIu64, name, *obj); | ||
215 | + | ||
216 | + return true; | ||
217 | +} | ||
218 | + | ||
219 | +static bool hmp_output_type_bool(Visitor *v, const char *name, bool *obj, | ||
220 | + Error **errp) | ||
221 | +{ | ||
222 | + hmp_output_append_formatted(v, "%s=%s", name, *obj ? "true" : "false"); | ||
223 | + | ||
224 | + return true; | ||
225 | +} | ||
226 | + | ||
227 | +static bool hmp_output_type_str(Visitor *v, const char *name, char **obj, | ||
228 | + Error **errp) | ||
229 | +{ | ||
230 | + /* Skip already printed or unused fields */ | ||
231 | + if (!*obj || g_str_equal(name, "id") || g_str_equal(name, "type")) { | ||
232 | + return true; | ||
233 | + } | ||
234 | + | ||
235 | + /* Do not print stub name for StringList elements */ | ||
236 | + if (g_str_equal(name, "str")) { | ||
237 | + hmp_output_append_formatted(v, "%s", *obj); | ||
238 | + } else { | ||
239 | + hmp_output_append_formatted(v, "%s=%s", name, *obj); | ||
240 | + } | ||
241 | + | ||
242 | + return true; | ||
243 | +} | ||
244 | + | ||
245 | +static bool hmp_output_type_number(Visitor *v, const char *name, | ||
246 | + double *obj, Error **errp) | ||
247 | +{ | ||
248 | + hmp_output_append_formatted(v, "%s=%.17g", name, *obj); | ||
249 | + | ||
250 | + return true; | ||
251 | +} | ||
252 | + | ||
253 | +/* TODO: remove this function? */ | ||
254 | +static bool hmp_output_type_any(Visitor *v, const char *name, | ||
255 | + QObject **obj, Error **errp) | ||
256 | +{ | ||
257 | + return true; | ||
258 | +} | ||
259 | + | ||
260 | +static bool hmp_output_type_null(Visitor *v, const char *name, | ||
261 | + QNull **obj, Error **errp) | ||
262 | +{ | ||
263 | + hmp_output_append_formatted(v, "%s=NULL", name); | ||
264 | + | ||
265 | + return true; | ||
266 | +} | ||
267 | + | ||
268 | +static void hmp_output_complete(Visitor *v, void *opaque) | ||
269 | +{ | ||
270 | + HMPOutputVisitor *ov = to_hov(v); | ||
271 | + | ||
272 | + *ov->result = g_string_free(ov->buffer, false); | ||
273 | + ov->buffer = NULL; | ||
274 | +} | ||
275 | + | ||
276 | +static void hmp_output_free(Visitor *v) | ||
277 | +{ | ||
278 | + HMPOutputVisitor *ov = to_hov(v); | ||
279 | + | ||
280 | + if (ov->buffer) { | ||
281 | + g_string_free(ov->buffer, true); | ||
282 | + } | ||
283 | + g_free(v); | ||
284 | +} | ||
285 | + | ||
286 | +Visitor *hmp_output_visitor_new(char **result) | ||
287 | +{ | ||
288 | + HMPOutputVisitor *v; | ||
289 | + | ||
290 | + v = g_malloc0(sizeof(*v)); | ||
291 | + | ||
292 | + v->visitor.type = VISITOR_OUTPUT; | ||
293 | + v->visitor.start_struct = hmp_output_start_struct; | ||
294 | + v->visitor.end_struct = hmp_output_end_struct; | ||
295 | + v->visitor.start_list = hmp_output_start_list; | ||
296 | + v->visitor.next_list = hmp_output_next_list; | ||
297 | + v->visitor.end_list = hmp_output_end_list; | ||
298 | + v->visitor.type_int64 = hmp_output_type_int64; | ||
299 | + v->visitor.type_uint64 = hmp_output_type_uint64; | ||
300 | + v->visitor.type_bool = hmp_output_type_bool; | ||
301 | + v->visitor.type_str = hmp_output_type_str; | ||
302 | + v->visitor.type_number = hmp_output_type_number; | ||
303 | + v->visitor.type_any = hmp_output_type_any; | ||
304 | + v->visitor.type_null = hmp_output_type_null; | ||
305 | + v->visitor.complete = hmp_output_complete; | ||
306 | + v->visitor.free = hmp_output_free; | ||
307 | + | ||
308 | + v->result = result; | ||
309 | + v->buffer = g_string_new(""); | ||
310 | + v->is_continue = false; | ||
311 | + | ||
312 | + return &v->visitor; | ||
313 | +} | ||
314 | diff --git a/qapi/meson.build b/qapi/meson.build | ||
315 | index XXXXXXX..XXXXXXX 100644 | ||
316 | --- a/qapi/meson.build | ||
317 | +++ b/qapi/meson.build | ||
318 | @@ -XXX,XX +XXX,XX @@ util_ss.add(files( | ||
319 | 'qobject-output-visitor.c', | ||
320 | 'string-input-visitor.c', | ||
321 | 'string-output-visitor.c', | ||
322 | + 'hmp-output-visitor.c', | ||
323 | )) | ||
324 | if have_system or have_tools | ||
325 | util_ss.add(files( | ||
326 | -- | 138 | -- |
327 | 2.7.4 | 139 | 2.7.4 |
328 | 140 | ||
329 | 141 | diff view generated by jsdifflib |
1 | From: Alexander Bulekov <alxndr@bu.edu> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This patch switches to use qemu_receive_packet() which can detect | 3 | At this mode no buffer forwarding will be performed in SVQ mode: Qemu |
4 | reentrancy and return early. | 4 | will just forward the guest's kicks to the device. |
5 | 5 | ||
6 | This is intended to address CVE-2021-3416. | 6 | Host memory notifiers regions are left out for simplicity, and they will |
7 | 7 | not be addressed in this series. | |
8 | Cc: Prasad J Pandit <ppandit@redhat.com> | 8 | |
9 | Cc: qemu-stable@nongnu.org | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
10 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
11 | Signed-off-by: Alexander Bulekov <alxndr@bu.edu> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
13 | --- | 12 | --- |
14 | hw/net/lan9118.c | 2 +- | 13 | hw/virtio/vhost-shadow-virtqueue.c | 56 ++++++++++++++ |
15 | 1 file changed, 1 insertion(+), 1 deletion(-) | 14 | hw/virtio/vhost-shadow-virtqueue.h | 14 ++++ |
16 | 15 | hw/virtio/vhost-vdpa.c | 145 ++++++++++++++++++++++++++++++++++++- | |
17 | diff --git a/hw/net/lan9118.c b/hw/net/lan9118.c | 16 | include/hw/virtio/vhost-vdpa.h | 4 + |
17 | 4 files changed, 217 insertions(+), 2 deletions(-) | ||
18 | |||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
18 | index XXXXXXX..XXXXXXX 100644 | 20 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/hw/net/lan9118.c | 21 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
20 | +++ b/hw/net/lan9118.c | 22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
21 | @@ -XXX,XX +XXX,XX @@ static void do_tx_packet(lan9118_state *s) | 23 | @@ -XXX,XX +XXX,XX @@ |
22 | /* FIXME: Honor TX disable, and allow queueing of packets. */ | 24 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
23 | if (s->phy_control & 0x4000) { | 25 | |
24 | /* This assumes the receive routine doesn't touch the VLANClient. */ | 26 | #include "qemu/error-report.h" |
25 | - lan9118_receive(qemu_get_queue(s->nic), s->txp->data, s->txp->len); | 27 | +#include "qemu/main-loop.h" |
26 | + qemu_receive_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); | 28 | +#include "linux-headers/linux/vhost.h" |
29 | + | ||
30 | +/** | ||
31 | + * Forward guest notifications. | ||
32 | + * | ||
33 | + * @n: guest kick event notifier, the one that guest set to notify svq. | ||
34 | + */ | ||
35 | +static void vhost_handle_guest_kick(EventNotifier *n) | ||
36 | +{ | ||
37 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
38 | + svq_kick); | ||
39 | + event_notifier_test_and_clear(n); | ||
40 | + event_notifier_set(&svq->hdev_kick); | ||
41 | +} | ||
42 | + | ||
43 | +/** | ||
44 | + * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
45 | + * | ||
46 | + * @svq: The svq | ||
47 | + * @svq_kick_fd: The svq kick fd | ||
48 | + * | ||
49 | + * Note that the SVQ will never close the old file descriptor. | ||
50 | + */ | ||
51 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
52 | +{ | ||
53 | + EventNotifier *svq_kick = &svq->svq_kick; | ||
54 | + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); | ||
55 | + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; | ||
56 | + | ||
57 | + if (poll_stop) { | ||
58 | + event_notifier_set_handler(svq_kick, NULL); | ||
59 | + } | ||
60 | + | ||
61 | + /* | ||
62 | + * event_notifier_set_handler already checks for guest's notifications if | ||
63 | + * they arrive at the new file descriptor in the switch, so there is no | ||
64 | + * need to explicitly check for them. | ||
65 | + */ | ||
66 | + if (poll_start) { | ||
67 | + event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
68 | + event_notifier_set(svq_kick); | ||
69 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
70 | + } | ||
71 | +} | ||
72 | + | ||
73 | +/** | ||
74 | + * Stop the shadow virtqueue operation. | ||
75 | + * @svq: Shadow Virtqueue | ||
76 | + */ | ||
77 | +void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
78 | +{ | ||
79 | + event_notifier_set_handler(&svq->svq_kick, NULL); | ||
80 | +} | ||
81 | |||
82 | /** | ||
83 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
84 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
85 | goto err_init_hdev_call; | ||
86 | } | ||
87 | |||
88 | + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
89 | return g_steal_pointer(&svq); | ||
90 | |||
91 | err_init_hdev_call: | ||
92 | @@ -XXX,XX +XXX,XX @@ err_init_hdev_kick: | ||
93 | void vhost_svq_free(gpointer pvq) | ||
94 | { | ||
95 | VhostShadowVirtqueue *vq = pvq; | ||
96 | + vhost_svq_stop(vq); | ||
97 | event_notifier_cleanup(&vq->hdev_kick); | ||
98 | event_notifier_cleanup(&vq->hdev_call); | ||
99 | g_free(vq); | ||
100 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
101 | index XXXXXXX..XXXXXXX 100644 | ||
102 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
103 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
104 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
105 | EventNotifier hdev_kick; | ||
106 | /* Shadow call notifier, sent to vhost */ | ||
107 | EventNotifier hdev_call; | ||
108 | + | ||
109 | + /* | ||
110 | + * Borrowed virtqueue's guest to host notifier. To borrow it in this event | ||
111 | + * notifier allows to recover the VhostShadowVirtqueue from the event loop | ||
112 | + * easily. If we use the VirtQueue's one, we don't have an easy way to | ||
113 | + * retrieve VhostShadowVirtqueue. | ||
114 | + * | ||
115 | + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
116 | + */ | ||
117 | + EventNotifier svq_kick; | ||
118 | } VhostShadowVirtqueue; | ||
119 | |||
120 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
121 | + | ||
122 | +void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
123 | + | ||
124 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
125 | |||
126 | void vhost_svq_free(gpointer vq); | ||
127 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
128 | index XXXXXXX..XXXXXXX 100644 | ||
129 | --- a/hw/virtio/vhost-vdpa.c | ||
130 | +++ b/hw/virtio/vhost-vdpa.c | ||
131 | @@ -XXX,XX +XXX,XX @@ | ||
132 | #include "hw/virtio/vhost.h" | ||
133 | #include "hw/virtio/vhost-backend.h" | ||
134 | #include "hw/virtio/virtio-net.h" | ||
135 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | ||
136 | #include "hw/virtio/vhost-vdpa.h" | ||
137 | #include "exec/address-spaces.h" | ||
138 | #include "qemu/main-loop.h" | ||
139 | #include "cpu.h" | ||
140 | #include "trace.h" | ||
141 | #include "qemu-common.h" | ||
142 | +#include "qapi/error.h" | ||
143 | |||
144 | /* | ||
145 | * Return one past the end of the end of section. Be careful with uint64_t | ||
146 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) | ||
147 | return v->index != 0; | ||
148 | } | ||
149 | |||
150 | +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
151 | + Error **errp) | ||
152 | +{ | ||
153 | + g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
154 | + | ||
155 | + if (!v->shadow_vqs_enabled) { | ||
156 | + return 0; | ||
157 | + } | ||
158 | + | ||
159 | + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
160 | + for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
161 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
162 | + | ||
163 | + if (unlikely(!svq)) { | ||
164 | + error_setg(errp, "Cannot create svq %u", n); | ||
165 | + return -1; | ||
166 | + } | ||
167 | + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); | ||
168 | + } | ||
169 | + | ||
170 | + v->shadow_vqs = g_steal_pointer(&shadow_vqs); | ||
171 | + return 0; | ||
172 | +} | ||
173 | + | ||
174 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
175 | { | ||
176 | struct vhost_vdpa *v; | ||
177 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
178 | dev->opaque = opaque ; | ||
179 | v->listener = vhost_vdpa_memory_listener; | ||
180 | v->msg_type = VHOST_IOTLB_MSG_V2; | ||
181 | + ret = vhost_vdpa_init_svq(dev, v, errp); | ||
182 | + if (ret) { | ||
183 | + goto err; | ||
184 | + } | ||
185 | |||
186 | vhost_vdpa_get_iova_range(v); | ||
187 | |||
188 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
189 | VIRTIO_CONFIG_S_DRIVER); | ||
190 | |||
191 | return 0; | ||
192 | + | ||
193 | +err: | ||
194 | + ram_block_discard_disable(false); | ||
195 | + return ret; | ||
196 | } | ||
197 | |||
198 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | ||
199 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | ||
200 | |||
201 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | ||
202 | { | ||
203 | + struct vhost_vdpa *v = dev->opaque; | ||
204 | int i; | ||
205 | |||
206 | + if (v->shadow_vqs_enabled) { | ||
207 | + /* FIXME SVQ is not compatible with host notifiers mr */ | ||
208 | + return; | ||
209 | + } | ||
210 | + | ||
211 | for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | ||
212 | if (vhost_vdpa_host_notifier_init(dev, i)) { | ||
213 | goto err; | ||
214 | @@ -XXX,XX +XXX,XX @@ err: | ||
215 | return; | ||
216 | } | ||
217 | |||
218 | +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | ||
219 | +{ | ||
220 | + struct vhost_vdpa *v = dev->opaque; | ||
221 | + size_t idx; | ||
222 | + | ||
223 | + if (!v->shadow_vqs) { | ||
224 | + return; | ||
225 | + } | ||
226 | + | ||
227 | + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | ||
228 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | ||
229 | + } | ||
230 | + g_ptr_array_free(v->shadow_vqs, true); | ||
231 | +} | ||
232 | + | ||
233 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
234 | { | ||
235 | struct vhost_vdpa *v; | ||
236 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
237 | trace_vhost_vdpa_cleanup(dev, v); | ||
238 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
239 | memory_listener_unregister(&v->listener); | ||
240 | + vhost_vdpa_svq_cleanup(dev); | ||
241 | |||
242 | dev->opaque = NULL; | ||
243 | ram_block_discard_disable(false); | ||
244 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | ||
245 | return ret; | ||
246 | } | ||
247 | |||
248 | +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) | ||
249 | +{ | ||
250 | + if (!v->shadow_vqs_enabled) { | ||
251 | + return; | ||
252 | + } | ||
253 | + | ||
254 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
255 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
256 | + vhost_svq_stop(svq); | ||
257 | + } | ||
258 | +} | ||
259 | + | ||
260 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) | ||
261 | { | ||
262 | + struct vhost_vdpa *v = dev->opaque; | ||
263 | int ret; | ||
264 | uint8_t status = 0; | ||
265 | |||
266 | + vhost_vdpa_reset_svq(v); | ||
267 | + | ||
268 | ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); | ||
269 | trace_vhost_vdpa_reset_device(dev, status); | ||
270 | return ret; | ||
271 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | ||
272 | return ret; | ||
273 | } | ||
274 | |||
275 | +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
276 | + struct vhost_vring_file *file) | ||
277 | +{ | ||
278 | + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
279 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
280 | +} | ||
281 | + | ||
282 | +/** | ||
283 | + * Set the shadow virtqueue descriptors to the device | ||
284 | + * | ||
285 | + * @dev: The vhost device model | ||
286 | + * @svq: The shadow virtqueue | ||
287 | + * @idx: The index of the virtqueue in the vhost device | ||
288 | + * @errp: Error | ||
289 | + */ | ||
290 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
291 | + VhostShadowVirtqueue *svq, | ||
292 | + unsigned idx, | ||
293 | + Error **errp) | ||
294 | +{ | ||
295 | + struct vhost_vring_file file = { | ||
296 | + .index = dev->vq_index + idx, | ||
297 | + }; | ||
298 | + const EventNotifier *event_notifier = &svq->hdev_kick; | ||
299 | + int r; | ||
300 | + | ||
301 | + file.fd = event_notifier_get_fd(event_notifier); | ||
302 | + r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
303 | + if (unlikely(r != 0)) { | ||
304 | + error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
305 | + } | ||
306 | + | ||
307 | + return r == 0; | ||
308 | +} | ||
309 | + | ||
310 | +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
311 | +{ | ||
312 | + struct vhost_vdpa *v = dev->opaque; | ||
313 | + Error *err = NULL; | ||
314 | + unsigned i; | ||
315 | + | ||
316 | + if (!v->shadow_vqs) { | ||
317 | + return true; | ||
318 | + } | ||
319 | + | ||
320 | + for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
321 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
322 | + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
323 | + if (unlikely(!ok)) { | ||
324 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
325 | + return false; | ||
326 | + } | ||
327 | + } | ||
328 | + | ||
329 | + return true; | ||
330 | +} | ||
331 | + | ||
332 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
333 | { | ||
334 | struct vhost_vdpa *v = dev->opaque; | ||
335 | + bool ok; | ||
336 | trace_vhost_vdpa_dev_start(dev, started); | ||
337 | |||
338 | if (started) { | ||
339 | vhost_vdpa_host_notifiers_init(dev); | ||
340 | + ok = vhost_vdpa_svqs_start(dev); | ||
341 | + if (unlikely(!ok)) { | ||
342 | + return -1; | ||
343 | + } | ||
344 | vhost_vdpa_set_vring_ready(dev); | ||
27 | } else { | 345 | } else { |
28 | qemu_send_packet(qemu_get_queue(s->nic), s->txp->data, s->txp->len); | 346 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); |
29 | } | 347 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
348 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
349 | struct vhost_vring_file *file) | ||
350 | { | ||
351 | - trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
352 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
353 | + struct vhost_vdpa *v = dev->opaque; | ||
354 | + int vdpa_idx = file->index - dev->vq_index; | ||
355 | + | ||
356 | + if (v->shadow_vqs_enabled) { | ||
357 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
358 | + vhost_svq_set_svq_kick_fd(svq, file->fd); | ||
359 | + return 0; | ||
360 | + } else { | ||
361 | + return vhost_vdpa_set_vring_dev_kick(dev, file); | ||
362 | + } | ||
363 | } | ||
364 | |||
365 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
366 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
367 | index XXXXXXX..XXXXXXX 100644 | ||
368 | --- a/include/hw/virtio/vhost-vdpa.h | ||
369 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
370 | @@ -XXX,XX +XXX,XX @@ | ||
371 | #ifndef HW_VIRTIO_VHOST_VDPA_H | ||
372 | #define HW_VIRTIO_VHOST_VDPA_H | ||
373 | |||
374 | +#include <gmodule.h> | ||
375 | + | ||
376 | #include "hw/virtio/virtio.h" | ||
377 | #include "standard-headers/linux/vhost_types.h" | ||
378 | |||
379 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
380 | bool iotlb_batch_begin_sent; | ||
381 | MemoryListener listener; | ||
382 | struct vhost_vdpa_iova_range iova_range; | ||
383 | + bool shadow_vqs_enabled; | ||
384 | + GPtrArray *shadow_vqs; | ||
385 | struct vhost_dev *dev; | ||
386 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
387 | } VhostVDPA; | ||
30 | -- | 388 | -- |
31 | 2.7.4 | 389 | 2.7.4 |
32 | 390 | ||
33 | 391 | diff view generated by jsdifflib |
1 | Some NIC supports loopback mode and this is done by calling | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | nc->info->receive() directly which in fact suppresses the effort of | ||
3 | reentrancy check that is done in qemu_net_queue_send(). | ||
4 | 2 | ||
5 | Unfortunately we can't use qemu_net_queue_send() here since for | 3 | This will make qemu aware of the device used buffers, allowing it to |
6 | loopback there's no sender as peer, so this patch introduce a | 4 | write the guest memory with its contents if needed. |
7 | qemu_receive_packet() which is used for implementing loopback mode | ||
8 | for a NIC with this check. | ||
9 | 5 | ||
10 | NIC that supports loopback mode will be converted to this helper. | 6 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | 7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | |
12 | This is intended to address CVE-2021-3416. | ||
13 | |||
14 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
15 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
16 | Cc: qemu-stable@nongnu.org | ||
17 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
18 | --- | 9 | --- |
19 | include/net/net.h | 5 +++++ | 10 | hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++ |
20 | include/net/queue.h | 8 ++++++++ | 11 | hw/virtio/vhost-shadow-virtqueue.h | 4 ++++ |
21 | net/net.c | 38 +++++++++++++++++++++++++++++++------- | 12 | hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++-- |
22 | net/queue.c | 22 ++++++++++++++++++++++ | 13 | 3 files changed, 71 insertions(+), 2 deletions(-) |
23 | 4 files changed, 66 insertions(+), 7 deletions(-) | ||
24 | 14 | ||
25 | diff --git a/include/net/net.h b/include/net/net.h | 15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
26 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
27 | --- a/include/net/net.h | 17 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
28 | +++ b/include/net/net.h | 18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
29 | @@ -XXX,XX +XXX,XX @@ void *qemu_get_nic_opaque(NetClientState *nc); | 19 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n) |
30 | void qemu_del_net_client(NetClientState *nc); | ||
31 | typedef void (*qemu_nic_foreach)(NICState *nic, void *opaque); | ||
32 | void qemu_foreach_nic(qemu_nic_foreach func, void *opaque); | ||
33 | +int qemu_can_receive_packet(NetClientState *nc); | ||
34 | int qemu_can_send_packet(NetClientState *nc); | ||
35 | ssize_t qemu_sendv_packet(NetClientState *nc, const struct iovec *iov, | ||
36 | int iovcnt); | ||
37 | ssize_t qemu_sendv_packet_async(NetClientState *nc, const struct iovec *iov, | ||
38 | int iovcnt, NetPacketSent *sent_cb); | ||
39 | ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size); | ||
40 | +ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size); | ||
41 | +ssize_t qemu_receive_packet_iov(NetClientState *nc, | ||
42 | + const struct iovec *iov, | ||
43 | + int iovcnt); | ||
44 | ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size); | ||
45 | ssize_t qemu_send_packet_async(NetClientState *nc, const uint8_t *buf, | ||
46 | int size, NetPacketSent *sent_cb); | ||
47 | diff --git a/include/net/queue.h b/include/net/queue.h | ||
48 | index XXXXXXX..XXXXXXX 100644 | ||
49 | --- a/include/net/queue.h | ||
50 | +++ b/include/net/queue.h | ||
51 | @@ -XXX,XX +XXX,XX @@ void qemu_net_queue_append_iov(NetQueue *queue, | ||
52 | |||
53 | void qemu_del_net_queue(NetQueue *queue); | ||
54 | |||
55 | +ssize_t qemu_net_queue_receive(NetQueue *queue, | ||
56 | + const uint8_t *data, | ||
57 | + size_t size); | ||
58 | + | ||
59 | +ssize_t qemu_net_queue_receive_iov(NetQueue *queue, | ||
60 | + const struct iovec *iov, | ||
61 | + int iovcnt); | ||
62 | + | ||
63 | ssize_t qemu_net_queue_send(NetQueue *queue, | ||
64 | NetClientState *sender, | ||
65 | unsigned flags, | ||
66 | diff --git a/net/net.c b/net/net.c | ||
67 | index XXXXXXX..XXXXXXX 100644 | ||
68 | --- a/net/net.c | ||
69 | +++ b/net/net.c | ||
70 | @@ -XXX,XX +XXX,XX @@ int qemu_set_vnet_be(NetClientState *nc, bool is_be) | ||
71 | #endif | ||
72 | } | 20 | } |
73 | 21 | ||
74 | +int qemu_can_receive_packet(NetClientState *nc) | 22 | /** |
23 | + * Forward vhost notifications | ||
24 | + * | ||
25 | + * @n: hdev call event notifier, the one that device set to notify svq. | ||
26 | + */ | ||
27 | +static void vhost_svq_handle_call(EventNotifier *n) | ||
75 | +{ | 28 | +{ |
76 | + if (nc->receive_disabled) { | 29 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, |
77 | + return 0; | 30 | + hdev_call); |
78 | + } else if (nc->info->can_receive && | 31 | + event_notifier_test_and_clear(n); |
79 | + !nc->info->can_receive(nc)) { | 32 | + event_notifier_set(&svq->svq_call); |
80 | + return 0; | ||
81 | + } | ||
82 | + return 1; | ||
83 | +} | 33 | +} |
84 | + | 34 | + |
85 | int qemu_can_send_packet(NetClientState *sender) | 35 | +/** |
86 | { | 36 | + * Set the call notifier for the SVQ to call the guest |
87 | int vm_running = runstate_is_running(); | 37 | + * |
88 | @@ -XXX,XX +XXX,XX @@ int qemu_can_send_packet(NetClientState *sender) | 38 | + * @svq: Shadow virtqueue |
89 | return 1; | 39 | + * @call_fd: call notifier |
40 | + * | ||
41 | + * Called on BQL context. | ||
42 | + */ | ||
43 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
44 | +{ | ||
45 | + if (call_fd == VHOST_FILE_UNBIND) { | ||
46 | + /* | ||
47 | + * Fail event_notifier_set if called handling device call. | ||
48 | + * | ||
49 | + * SVQ still needs device notifications, since it needs to keep | ||
50 | + * forwarding used buffers even with the unbind. | ||
51 | + */ | ||
52 | + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); | ||
53 | + } else { | ||
54 | + event_notifier_init_fd(&svq->svq_call, call_fd); | ||
55 | + } | ||
56 | +} | ||
57 | + | ||
58 | +/** | ||
59 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
60 | * | ||
61 | * @svq: The svq | ||
62 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
90 | } | 63 | } |
91 | 64 | ||
92 | - if (sender->peer->receive_disabled) { | 65 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
93 | - return 0; | 66 | + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); |
94 | - } else if (sender->peer->info->can_receive && | 67 | return g_steal_pointer(&svq); |
95 | - !sender->peer->info->can_receive(sender->peer)) { | 68 | |
96 | - return 0; | 69 | err_init_hdev_call: |
97 | - } | 70 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq) |
98 | - return 1; | 71 | VhostShadowVirtqueue *vq = pvq; |
99 | + return qemu_can_receive_packet(sender->peer); | 72 | vhost_svq_stop(vq); |
73 | event_notifier_cleanup(&vq->hdev_kick); | ||
74 | + event_notifier_set_handler(&vq->hdev_call, NULL); | ||
75 | event_notifier_cleanup(&vq->hdev_call); | ||
76 | g_free(vq); | ||
100 | } | 77 | } |
101 | 78 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | |
102 | static ssize_t filter_receive_iov(NetClientState *nc, | 79 | index XXXXXXX..XXXXXXX 100644 |
103 | @@ -XXX,XX +XXX,XX @@ ssize_t qemu_send_packet(NetClientState *nc, const uint8_t *buf, int size) | 80 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
104 | return qemu_send_packet_async(nc, buf, size, NULL); | 81 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
82 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
83 | * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
84 | */ | ||
85 | EventNotifier svq_kick; | ||
86 | + | ||
87 | + /* Guest's call notifier, where the SVQ calls guest. */ | ||
88 | + EventNotifier svq_call; | ||
89 | } VhostShadowVirtqueue; | ||
90 | |||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
95 | |||
96 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
97 | index XXXXXXX..XXXXXXX 100644 | ||
98 | --- a/hw/virtio/vhost-vdpa.c | ||
99 | +++ b/hw/virtio/vhost-vdpa.c | ||
100 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
101 | return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
105 | } | 102 | } |
106 | 103 | ||
107 | +ssize_t qemu_receive_packet(NetClientState *nc, const uint8_t *buf, int size) | 104 | +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, |
105 | + struct vhost_vring_file *file) | ||
108 | +{ | 106 | +{ |
109 | + if (!qemu_can_receive_packet(nc)) { | 107 | + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); |
110 | + return 0; | 108 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); |
109 | +} | ||
110 | + | ||
111 | /** | ||
112 | * Set the shadow virtqueue descriptors to the device | ||
113 | * | ||
114 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
115 | * @svq: The shadow virtqueue | ||
116 | * @idx: The index of the virtqueue in the vhost device | ||
117 | * @errp: Error | ||
118 | + * | ||
119 | + * Note that this function does not rewind kick file descriptor if cannot set | ||
120 | + * call one. | ||
121 | */ | ||
122 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
123 | VhostShadowVirtqueue *svq, | ||
124 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
125 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
126 | if (unlikely(r != 0)) { | ||
127 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
128 | + return false; | ||
111 | + } | 129 | + } |
112 | + | 130 | + |
113 | + return qemu_net_queue_receive(nc->incoming_queue, buf, size); | 131 | + event_notifier = &svq->hdev_call; |
114 | +} | 132 | + file.fd = event_notifier_get_fd(event_notifier); |
133 | + r = vhost_vdpa_set_vring_dev_call(dev, &file); | ||
134 | + if (unlikely(r != 0)) { | ||
135 | + error_setg_errno(errp, -r, "Can't set device call fd"); | ||
136 | } | ||
137 | |||
138 | return r == 0; | ||
139 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
140 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
141 | struct vhost_vring_file *file) | ||
142 | { | ||
143 | - trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
144 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
145 | + struct vhost_vdpa *v = dev->opaque; | ||
115 | + | 146 | + |
116 | +ssize_t qemu_receive_packet_iov(NetClientState *nc, const struct iovec *iov, | 147 | + if (v->shadow_vqs_enabled) { |
117 | + int iovcnt) | 148 | + int vdpa_idx = file->index - dev->vq_index; |
118 | +{ | 149 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); |
119 | + if (!qemu_can_receive_packet(nc)) { | 150 | + |
151 | + vhost_svq_set_svq_call_fd(svq, file->fd); | ||
120 | + return 0; | 152 | + return 0; |
153 | + } else { | ||
154 | + return vhost_vdpa_set_vring_dev_call(dev, file); | ||
121 | + } | 155 | + } |
122 | + | ||
123 | + return qemu_net_queue_receive_iov(nc->incoming_queue, iov, iovcnt); | ||
124 | +} | ||
125 | + | ||
126 | ssize_t qemu_send_packet_raw(NetClientState *nc, const uint8_t *buf, int size) | ||
127 | { | ||
128 | return qemu_send_packet_async_with_flags(nc, QEMU_NET_PACKET_FLAG_RAW, | ||
129 | diff --git a/net/queue.c b/net/queue.c | ||
130 | index XXXXXXX..XXXXXXX 100644 | ||
131 | --- a/net/queue.c | ||
132 | +++ b/net/queue.c | ||
133 | @@ -XXX,XX +XXX,XX @@ static ssize_t qemu_net_queue_deliver_iov(NetQueue *queue, | ||
134 | return ret; | ||
135 | } | 156 | } |
136 | 157 | ||
137 | +ssize_t qemu_net_queue_receive(NetQueue *queue, | 158 | static int vhost_vdpa_get_features(struct vhost_dev *dev, |
138 | + const uint8_t *data, | ||
139 | + size_t size) | ||
140 | +{ | ||
141 | + if (queue->delivering) { | ||
142 | + return 0; | ||
143 | + } | ||
144 | + | ||
145 | + return qemu_net_queue_deliver(queue, NULL, 0, data, size); | ||
146 | +} | ||
147 | + | ||
148 | +ssize_t qemu_net_queue_receive_iov(NetQueue *queue, | ||
149 | + const struct iovec *iov, | ||
150 | + int iovcnt) | ||
151 | +{ | ||
152 | + if (queue->delivering) { | ||
153 | + return 0; | ||
154 | + } | ||
155 | + | ||
156 | + return qemu_net_queue_deliver_iov(queue, NULL, 0, iov, iovcnt); | ||
157 | +} | ||
158 | + | ||
159 | ssize_t qemu_net_queue_send(NetQueue *queue, | ||
160 | NetClientState *sender, | ||
161 | unsigned flags, | ||
162 | -- | 159 | -- |
163 | 2.7.4 | 160 | 2.7.4 |
164 | 161 | ||
165 | 162 | diff view generated by jsdifflib |
1 | From: Alexey Kirillov <lekiravi@yandex-team.ru> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The query-netdev command is used to get the configuration of the current | 3 | This allows SVQ to negotiate features with the guest and the device. For |
4 | network device backends (netdevs). | 4 | the device, SVQ is a driver. While this function bypasses all |
5 | This is the QMP analog of the HMP command "info network" but only for | 5 | non-transport features, it needs to disable the features that SVQ does |
6 | netdevs (i.e. excluding NIC and hubports). | 6 | not support when forwarding buffers. This includes packed vq layout, |
7 | indirect descriptors or event idx. | ||
7 | 8 | ||
8 | The query-netdev command returns an array of objects of the NetdevInfo | 9 | Future changes can add support to offer more features to the guest, |
9 | type, which are an extension of Netdev type. It means that response can | 10 | since the use of VirtQueue gives this for free. This is left out at the |
10 | be used for netdev-add after small modification. This can be useful for | 11 | moment for simplicity. |
11 | recreate the same netdev configuration. | ||
12 | 12 | ||
13 | Information about the network device is filled in when it is created or | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | modified and is available through the NetClientState->stored_config. | 14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
15 | |||
16 | Signed-off-by: Alexey Kirillov <lekiravi@yandex-team.ru> | ||
17 | Acked-by: Markus Armbruster <armbru@redhat.com> | ||
18 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
19 | --- | 16 | --- |
20 | include/net/net.h | 3 ++ | 17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ |
21 | net/l2tpv3.c | 7 ++++ | 18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ |
22 | net/net.c | 30 +++++++++++++- | 19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ |
23 | net/netmap.c | 7 ++++ | 20 | 3 files changed, 61 insertions(+) |
24 | net/slirp.c | 122 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ | ||
25 | net/socket.c | 71 +++++++++++++++++++++++++++++++ | ||
26 | net/tap-win32.c | 9 ++++ | ||
27 | net/tap.c | 103 ++++++++++++++++++++++++++++++++++++++++++--- | ||
28 | net/vde.c | 22 ++++++++++ | ||
29 | net/vhost-user.c | 18 ++++++-- | ||
30 | net/vhost-vdpa.c | 14 +++++++ | ||
31 | qapi/net.json | 80 +++++++++++++++++++++++++++++++++++ | ||
32 | 12 files changed, 477 insertions(+), 9 deletions(-) | ||
33 | 21 | ||
34 | diff --git a/include/net/net.h b/include/net/net.h | 22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
35 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
36 | --- a/include/net/net.h | 24 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
37 | +++ b/include/net/net.h | 25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
38 | @@ -XXX,XX +XXX,XX @@ | 26 | @@ -XXX,XX +XXX,XX @@ |
39 | #include "qapi/qapi-types-net.h" | 27 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
40 | #include "net/queue.h" | 28 | |
41 | #include "hw/qdev-properties-system.h" | 29 | #include "qemu/error-report.h" |
42 | +#include "qapi/clone-visitor.h" | 30 | +#include "qapi/error.h" |
43 | +#include "qapi/qapi-visit-net.h" | 31 | #include "qemu/main-loop.h" |
44 | 32 | #include "linux-headers/linux/vhost.h" | |
45 | #define MAC_FMT "%02X:%02X:%02X:%02X:%02X:%02X" | 33 | |
46 | #define MAC_ARG(x) ((uint8_t *)(x))[0], ((uint8_t *)(x))[1], \ | 34 | /** |
47 | @@ -XXX,XX +XXX,XX @@ struct NetClientState { | 35 | + * Validate the transport device features that both guests can use with the SVQ |
48 | char *model; | 36 | + * and SVQs can use with the device. |
49 | char *name; | 37 | + * |
50 | char info_str[256]; | 38 | + * @dev_features: The features |
51 | + NetdevInfo *stored_config; | 39 | + * @errp: Error pointer |
52 | unsigned receive_disabled : 1; | 40 | + */ |
53 | NetClientDestructor *destructor; | 41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) |
54 | unsigned int queue_index; | 42 | +{ |
55 | diff --git a/net/l2tpv3.c b/net/l2tpv3.c | 43 | + bool ok = true; |
56 | index XXXXXXX..XXXXXXX 100644 | 44 | + uint64_t svq_features = features; |
57 | --- a/net/l2tpv3.c | ||
58 | +++ b/net/l2tpv3.c | ||
59 | @@ -XXX,XX +XXX,XX @@ int net_init_l2tpv3(const Netdev *netdev, | ||
60 | |||
61 | l2tpv3_read_poll(s, true); | ||
62 | |||
63 | + /* Store startup parameters */ | ||
64 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
65 | + nc->stored_config->type = NET_BACKEND_L2TPV3; | ||
66 | + | 45 | + |
67 | + QAPI_CLONE_MEMBERS(NetdevL2TPv3Options, | 46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; |
68 | + &nc->stored_config->u.l2tpv3, l2tpv3); | 47 | + ++b) { |
48 | + switch (b) { | ||
49 | + case VIRTIO_F_ANY_LAYOUT: | ||
50 | + continue; | ||
69 | + | 51 | + |
70 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | 52 | + case VIRTIO_F_ACCESS_PLATFORM: |
71 | "l2tpv3: connected"); | 53 | + /* SVQ trust in the host's IOMMU to translate addresses */ |
72 | return 0; | 54 | + case VIRTIO_F_VERSION_1: |
73 | diff --git a/net/net.c b/net/net.c | 55 | + /* SVQ trust that the guest vring is little endian */ |
74 | index XXXXXXX..XXXXXXX 100644 | 56 | + if (!(svq_features & BIT_ULL(b))) { |
75 | --- a/net/net.c | 57 | + set_bit(b, &svq_features); |
76 | +++ b/net/net.c | 58 | + ok = false; |
77 | @@ -XXX,XX +XXX,XX @@ | 59 | + } |
78 | #include "monitor/monitor.h" | 60 | + continue; |
79 | #include "qemu/help_option.h" | ||
80 | #include "qapi/qapi-commands-net.h" | ||
81 | -#include "qapi/qapi-visit-net.h" | ||
82 | #include "qapi/qmp/qdict.h" | ||
83 | #include "qapi/qmp/qerror.h" | ||
84 | #include "qemu/error-report.h" | ||
85 | @@ -XXX,XX +XXX,XX @@ static void qemu_free_net_client(NetClientState *nc) | ||
86 | } | ||
87 | g_free(nc->name); | ||
88 | g_free(nc->model); | ||
89 | + qapi_free_NetdevInfo(nc->stored_config); | ||
90 | if (nc->destructor) { | ||
91 | nc->destructor(nc); | ||
92 | } | ||
93 | @@ -XXX,XX +XXX,XX @@ RxFilterInfoList *qmp_query_rx_filter(bool has_name, const char *name, | ||
94 | return filter_list; | ||
95 | } | ||
96 | |||
97 | +NetdevInfoList *qmp_query_netdev(Error **errp) | ||
98 | +{ | ||
99 | + NetdevInfoList *list = NULL; | ||
100 | + NetClientState *nc; | ||
101 | + | 61 | + |
102 | + QTAILQ_FOREACH(nc, &net_clients, next) { | 62 | + default: |
103 | + /* | 63 | + if (svq_features & BIT_ULL(b)) { |
104 | + * Only look at netdevs (backend network devices), not for each queue | 64 | + clear_bit(b, &svq_features); |
105 | + * or NIC / hubport | 65 | + ok = false; |
106 | + */ | ||
107 | + if (nc->stored_config) { | ||
108 | + NetdevInfo *element = QAPI_CLONE(NetdevInfo, nc->stored_config); | ||
109 | + | ||
110 | + g_free(element->id); /* Need to dealloc empty id after clone */ | ||
111 | + element->id = g_strdup(nc->name); | ||
112 | + | ||
113 | + element->has_peer_id = nc->peer != NULL; | ||
114 | + if (element->has_peer_id) { | ||
115 | + element->peer_id = g_strdup(nc->peer->name); | ||
116 | + } | 66 | + } |
117 | + | ||
118 | + QAPI_LIST_PREPEND(list, element); | ||
119 | + } | 67 | + } |
120 | + } | 68 | + } |
121 | + | 69 | + |
122 | + return list; | 70 | + if (!ok) { |
71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 | ||
72 | + ", ok: 0x%"PRIx64, features, svq_features); | ||
73 | + } | ||
74 | + return ok; | ||
123 | +} | 75 | +} |
124 | + | 76 | + |
125 | void hmp_info_network(Monitor *mon, const QDict *qdict) | 77 | +/** |
78 | * Forward guest notifications. | ||
79 | * | ||
80 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
86 | EventNotifier svq_call; | ||
87 | } VhostShadowVirtqueue; | ||
88 | |||
89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
90 | + | ||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
126 | { | 100 | { |
127 | NetClientState *nc, *peer; | 101 | g_autoptr(GPtrArray) shadow_vqs = NULL; |
128 | diff --git a/net/netmap.c b/net/netmap.c | 102 | + uint64_t dev_features, svq_features; |
129 | index XXXXXXX..XXXXXXX 100644 | 103 | + int r; |
130 | --- a/net/netmap.c | 104 | + bool ok; |
131 | +++ b/net/netmap.c | 105 | |
132 | @@ -XXX,XX +XXX,XX @@ int net_init_netmap(const Netdev *netdev, | 106 | if (!v->shadow_vqs_enabled) { |
133 | pstrcpy(s->ifname, sizeof(s->ifname), netmap_opts->ifname); | 107 | return 0; |
134 | netmap_read_poll(s, true); /* Initially only poll for reads. */ | 108 | } |
135 | 109 | ||
136 | + /* Store startup parameters */ | 110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); |
137 | + nc->stored_config = g_new0(NetdevInfo, 1); | 111 | + if (r != 0) { |
138 | + nc->stored_config->type = NET_BACKEND_NETMAP; | 112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); |
139 | + | 113 | + return r; |
140 | + QAPI_CLONE_MEMBERS(NetdevNetmapOptions, | ||
141 | + &nc->stored_config->u.netmap, netmap_opts); | ||
142 | + | ||
143 | return 0; | ||
144 | } | ||
145 | |||
146 | diff --git a/net/slirp.c b/net/slirp.c | ||
147 | index XXXXXXX..XXXXXXX 100644 | ||
148 | --- a/net/slirp.c | ||
149 | +++ b/net/slirp.c | ||
150 | @@ -XXX,XX +XXX,XX @@ static int net_slirp_init(NetClientState *peer, const char *model, | ||
151 | int shift; | ||
152 | char *end; | ||
153 | struct slirp_config_str *config; | ||
154 | + NetdevUserOptions *stored; | ||
155 | + StringList **stored_hostfwd; | ||
156 | + StringList **stored_guestfwd; | ||
157 | |||
158 | if (!ipv4 && (vnetwork || vhost || vnameserver)) { | ||
159 | error_setg(errp, "IPv4 disabled but netmask/host/dns provided"); | ||
160 | @@ -XXX,XX +XXX,XX @@ static int net_slirp_init(NetClientState *peer, const char *model, | ||
161 | |||
162 | nc = qemu_new_net_client(&net_slirp_info, peer, model, name); | ||
163 | |||
164 | + /* Store startup parameters */ | ||
165 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
166 | + nc->stored_config->type = NET_BACKEND_USER; | ||
167 | + stored = &nc->stored_config->u.user; | ||
168 | + | ||
169 | + if (vhostname) { | ||
170 | + stored->has_hostname = true; | ||
171 | + stored->hostname = g_strdup(vhostname); | ||
172 | + } | 114 | + } |
173 | + | 115 | + |
174 | + stored->has_q_restrict = true; | 116 | + svq_features = dev_features; |
175 | + stored->q_restrict = restricted; | 117 | + ok = vhost_svq_valid_features(svq_features, errp); |
176 | + | 118 | + if (unlikely(!ok)) { |
177 | + stored->has_ipv4 = true; | 119 | + return -1; |
178 | + stored->ipv4 = ipv4; | ||
179 | + | ||
180 | + stored->has_ipv6 = true; | ||
181 | + stored->ipv6 = ipv6; | ||
182 | + | ||
183 | + if (ipv4) { | ||
184 | + uint8_t *net_bytes = (uint8_t *)&net; | ||
185 | + uint8_t *mask_bytes = (uint8_t *)&mask; | ||
186 | + | ||
187 | + stored->has_net = true; | ||
188 | + stored->net = g_strdup_printf("%d.%d.%d.%d/%d.%d.%d.%d", | ||
189 | + net_bytes[0], net_bytes[1], | ||
190 | + net_bytes[2], net_bytes[3], | ||
191 | + mask_bytes[0], mask_bytes[1], | ||
192 | + mask_bytes[2], mask_bytes[3]); | ||
193 | + | ||
194 | + stored->has_host = true; | ||
195 | + stored->host = g_strdup(inet_ntoa(host)); | ||
196 | + } | 120 | + } |
197 | + | 121 | + |
198 | + if (tftp_export) { | 122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); |
199 | + stored->has_tftp = true; | 123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { |
200 | + stored->tftp = g_strdup(tftp_export); | 124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); |
201 | + } | ||
202 | + | ||
203 | + if (bootfile) { | ||
204 | + stored->has_bootfile = true; | ||
205 | + stored->bootfile = g_strdup(bootfile); | ||
206 | + } | ||
207 | + | ||
208 | + if (vdhcp_start) { | ||
209 | + stored->has_dhcpstart = true; | ||
210 | + stored->dhcpstart = g_strdup(vdhcp_start); | ||
211 | + } | ||
212 | + | ||
213 | + if (ipv4) { | ||
214 | + stored->has_dns = true; | ||
215 | + stored->dns = g_strdup(inet_ntoa(dns)); | ||
216 | + } | ||
217 | + | ||
218 | + if (dnssearch) { | ||
219 | + stored->has_dnssearch = true; | ||
220 | + StringList **stored_list = &stored->dnssearch; | ||
221 | + | ||
222 | + for (int i = 0; dnssearch[i]; i++) { | ||
223 | + String *element = g_new0(String, 1); | ||
224 | + | ||
225 | + element->str = g_strdup(dnssearch[i]); | ||
226 | + QAPI_LIST_APPEND(stored_list, element); | ||
227 | + } | ||
228 | + } | ||
229 | + | ||
230 | + if (vdomainname) { | ||
231 | + stored->has_domainname = true; | ||
232 | + stored->domainname = g_strdup(vdomainname); | ||
233 | + } | ||
234 | + | ||
235 | + if (ipv6) { | ||
236 | + char addrstr[INET6_ADDRSTRLEN]; | ||
237 | + const char *res; | ||
238 | + | ||
239 | + stored->has_ipv6_prefix = true; | ||
240 | + stored->ipv6_prefix = g_strdup(vprefix6); | ||
241 | + | ||
242 | + stored->has_ipv6_prefixlen = true; | ||
243 | + stored->ipv6_prefixlen = vprefix6_len; | ||
244 | + | ||
245 | + res = inet_ntop(AF_INET6, &ip6_host, | ||
246 | + addrstr, sizeof(addrstr)); | ||
247 | + | ||
248 | + stored->has_ipv6_host = true; | ||
249 | + stored->ipv6_host = g_strdup(res); | ||
250 | + | ||
251 | + res = inet_ntop(AF_INET6, &ip6_dns, | ||
252 | + addrstr, sizeof(addrstr)); | ||
253 | + | ||
254 | + stored->has_ipv6_dns = true; | ||
255 | + stored->ipv6_dns = g_strdup(res); | ||
256 | + } | ||
257 | + | ||
258 | + if (smb_export) { | ||
259 | + stored->has_smb = true; | ||
260 | + stored->smb = g_strdup(smb_export); | ||
261 | + } | ||
262 | + | ||
263 | + if (vsmbserver) { | ||
264 | + stored->has_smbserver = true; | ||
265 | + stored->smbserver = g_strdup(vsmbserver); | ||
266 | + } | ||
267 | + | ||
268 | + if (tftp_server_name) { | ||
269 | + stored->has_tftp_server_name = true; | ||
270 | + stored->tftp_server_name = g_strdup(tftp_server_name); | ||
271 | + } | ||
272 | + | ||
273 | snprintf(nc->info_str, sizeof(nc->info_str), | ||
274 | "net=%s,restrict=%s", inet_ntoa(net), | ||
275 | restricted ? "on" : "off"); | ||
276 | @@ -XXX,XX +XXX,XX @@ static int net_slirp_init(NetClientState *peer, const char *model, | ||
277 | s->poll_notifier.notify = net_slirp_poll_notify; | ||
278 | main_loop_poll_add_notifier(&s->poll_notifier); | ||
279 | |||
280 | + stored_hostfwd = &stored->hostfwd; | ||
281 | + stored_guestfwd = &stored->guestfwd; | ||
282 | + | ||
283 | for (config = slirp_configs; config; config = config->next) { | ||
284 | + String *element = g_new0(String, 1); | ||
285 | + | ||
286 | + element->str = g_strdup(config->str); | ||
287 | if (config->flags & SLIRP_CFG_HOSTFWD) { | ||
288 | if (slirp_hostfwd(s, config->str, errp) < 0) { | ||
289 | goto error; | ||
290 | } | ||
291 | + stored->has_hostfwd = true; | ||
292 | + QAPI_LIST_APPEND(stored_hostfwd, element); | ||
293 | } else { | ||
294 | if (slirp_guestfwd(s, config->str, errp) < 0) { | ||
295 | goto error; | ||
296 | } | ||
297 | + stored->has_guestfwd = true; | ||
298 | + QAPI_LIST_APPEND(stored_guestfwd, element); | ||
299 | } | ||
300 | } | ||
301 | #ifndef _WIN32 | ||
302 | diff --git a/net/socket.c b/net/socket.c | ||
303 | index XXXXXXX..XXXXXXX 100644 | ||
304 | --- a/net/socket.c | ||
305 | +++ b/net/socket.c | ||
306 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, | ||
307 | NetSocketState *s; | ||
308 | SocketAddress *sa; | ||
309 | SocketAddressType sa_type; | ||
310 | + NetdevSocketOptions *stored; | ||
311 | |||
312 | sa = socket_local_address(fd, errp); | ||
313 | if (!sa) { | ||
314 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, | ||
315 | net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); | ||
316 | net_socket_read_poll(s, true); | ||
317 | |||
318 | + /* Store startup parameters */ | ||
319 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
320 | + nc->stored_config->type = NET_BACKEND_SOCKET; | ||
321 | + stored = &nc->stored_config->u.socket; | ||
322 | + | ||
323 | + stored->has_fd = true; | ||
324 | + stored->fd = g_strdup_printf("%d", fd); | ||
325 | + | ||
326 | /* mcast: save bound address as dst */ | ||
327 | if (is_connected && mcast != NULL) { | ||
328 | + stored->has_mcast = true; | ||
329 | + stored->mcast = g_strdup(mcast); | ||
330 | + | ||
331 | s->dgram_dst = saddr; | ||
332 | snprintf(nc->info_str, sizeof(nc->info_str), | ||
333 | "socket: fd=%d (cloned mcast=%s:%d)", | ||
334 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, | ||
335 | { | ||
336 | NetClientState *nc; | ||
337 | NetSocketState *s; | ||
338 | + NetdevSocketOptions *stored; | ||
339 | |||
340 | nc = qemu_new_net_client(&net_socket_info, peer, model, name); | ||
341 | |||
342 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, | ||
343 | } else { | ||
344 | qemu_set_fd_handler(s->fd, NULL, net_socket_connect, s); | ||
345 | } | ||
346 | + | ||
347 | + /* Store startup parameters */ | ||
348 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
349 | + nc->stored_config->type = NET_BACKEND_SOCKET; | ||
350 | + stored = &nc->stored_config->u.socket; | ||
351 | + | ||
352 | + stored->has_fd = true; | ||
353 | + stored->fd = g_strdup_printf("%d", fd); | ||
354 | + | ||
355 | return s; | ||
356 | } | ||
357 | |||
358 | @@ -XXX,XX +XXX,XX @@ static void net_socket_accept(void *opaque) | ||
359 | struct sockaddr_in saddr; | ||
360 | socklen_t len; | ||
361 | int fd; | ||
362 | + NetdevSocketOptions *stored; | ||
363 | |||
364 | for(;;) { | ||
365 | len = sizeof(saddr); | ||
366 | @@ -XXX,XX +XXX,XX @@ static void net_socket_accept(void *opaque) | ||
367 | s->fd = fd; | ||
368 | s->nc.link_down = false; | ||
369 | net_socket_connect(s); | ||
370 | + | ||
371 | + /* Store additional startup parameters (extend net_socket_listen_init) */ | ||
372 | + stored = &s->nc.stored_config->u.socket; | ||
373 | + | ||
374 | + stored->has_fd = true; | ||
375 | + stored->fd = g_strdup_printf("%d", fd); | ||
376 | + | ||
377 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
378 | "socket: connection from %s:%d", | ||
379 | inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
380 | @@ -XXX,XX +XXX,XX @@ static int net_socket_listen_init(NetClientState *peer, | ||
381 | NetSocketState *s; | ||
382 | struct sockaddr_in saddr; | ||
383 | int fd, ret; | ||
384 | + NetdevSocketOptions *stored; | ||
385 | |||
386 | if (parse_host_port(&saddr, host_str, errp) < 0) { | ||
387 | return -1; | ||
388 | @@ -XXX,XX +XXX,XX @@ static int net_socket_listen_init(NetClientState *peer, | ||
389 | net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); | ||
390 | |||
391 | qemu_set_fd_handler(s->listen_fd, net_socket_accept, NULL, s); | ||
392 | + | ||
393 | + /* Store startup parameters */ | ||
394 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
395 | + nc->stored_config->type = NET_BACKEND_SOCKET; | ||
396 | + stored = &nc->stored_config->u.socket; | ||
397 | + | ||
398 | + stored->has_listen = true; | ||
399 | + stored->listen = g_strdup(host_str); | ||
400 | + | ||
401 | return 0; | ||
402 | } | ||
403 | |||
404 | @@ -XXX,XX +XXX,XX @@ static int net_socket_connect_init(NetClientState *peer, | ||
405 | NetSocketState *s; | ||
406 | int fd, connected, ret; | ||
407 | struct sockaddr_in saddr; | ||
408 | + NetdevSocketOptions *stored; | ||
409 | |||
410 | if (parse_host_port(&saddr, host_str, errp) < 0) { | ||
411 | return -1; | ||
412 | @@ -XXX,XX +XXX,XX @@ static int net_socket_connect_init(NetClientState *peer, | ||
413 | return -1; | ||
414 | } | ||
415 | |||
416 | + /* Store additional startup parameters (extend net_socket_fd_init) */ | ||
417 | + stored = &s->nc.stored_config->u.socket; | ||
418 | + | ||
419 | + stored->has_connect = true; | ||
420 | + stored->connect = g_strdup(host_str); | ||
421 | + | ||
422 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
423 | "socket: connect to %s:%d", | ||
424 | inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
425 | @@ -XXX,XX +XXX,XX @@ static int net_socket_mcast_init(NetClientState *peer, | ||
426 | int fd; | ||
427 | struct sockaddr_in saddr; | ||
428 | struct in_addr localaddr, *param_localaddr; | ||
429 | + NetdevSocketOptions *stored; | ||
430 | |||
431 | if (parse_host_port(&saddr, host_str, errp) < 0) { | ||
432 | return -1; | ||
433 | @@ -XXX,XX +XXX,XX @@ static int net_socket_mcast_init(NetClientState *peer, | ||
434 | |||
435 | s->dgram_dst = saddr; | ||
436 | |||
437 | + /* Store additional startup parameters (extend net_socket_fd_init) */ | ||
438 | + stored = &s->nc.stored_config->u.socket; | ||
439 | + | ||
440 | + if (!stored->has_mcast) { | ||
441 | + stored->has_mcast = true; | ||
442 | + stored->mcast = g_strdup(host_str); | ||
443 | + } | ||
444 | + | ||
445 | + if (localaddr_str) { | ||
446 | + stored->has_localaddr = true; | ||
447 | + stored->localaddr = g_strdup(localaddr_str); | ||
448 | + } | ||
449 | + | ||
450 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
451 | "socket: mcast=%s:%d", | ||
452 | inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
453 | @@ -XXX,XX +XXX,XX @@ static int net_socket_udp_init(NetClientState *peer, | ||
454 | NetSocketState *s; | ||
455 | int fd, ret; | ||
456 | struct sockaddr_in laddr, raddr; | ||
457 | + NetdevSocketOptions *stored; | ||
458 | |||
459 | if (parse_host_port(&laddr, lhost, errp) < 0) { | ||
460 | return -1; | ||
461 | @@ -XXX,XX +XXX,XX @@ static int net_socket_udp_init(NetClientState *peer, | ||
462 | |||
463 | s->dgram_dst = raddr; | ||
464 | |||
465 | + /* Store additional startup parameters (extend net_socket_fd_init) */ | ||
466 | + stored = &s->nc.stored_config->u.socket; | ||
467 | + | ||
468 | + stored->has_localaddr = true; | ||
469 | + stored->localaddr = g_strdup(lhost); | ||
470 | + | ||
471 | + stored->has_udp = true; | ||
472 | + stored->udp = g_strdup(rhost); | ||
473 | + | ||
474 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
475 | "socket: udp=%s:%d", | ||
476 | inet_ntoa(raddr.sin_addr), ntohs(raddr.sin_port)); | ||
477 | diff --git a/net/tap-win32.c b/net/tap-win32.c | ||
478 | index XXXXXXX..XXXXXXX 100644 | ||
479 | --- a/net/tap-win32.c | ||
480 | +++ b/net/tap-win32.c | ||
481 | @@ -XXX,XX +XXX,XX @@ static int tap_win32_init(NetClientState *peer, const char *model, | ||
482 | NetClientState *nc; | ||
483 | TAPState *s; | ||
484 | tap_win32_overlapped_t *handle; | ||
485 | + NetdevTapOptions *stored; | ||
486 | |||
487 | if (tap_win32_open(&handle, ifname) < 0) { | ||
488 | printf("tap: Could not open '%s'\n", ifname); | ||
489 | @@ -XXX,XX +XXX,XX @@ static int tap_win32_init(NetClientState *peer, const char *model, | ||
490 | |||
491 | s = DO_UPCAST(TAPState, nc, nc); | ||
492 | |||
493 | + /* Store startup parameters */ | ||
494 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
495 | + nc->stored_config->type = NET_BACKEND_TAP; | ||
496 | + stored = &nc->stored_config->u.tap; | ||
497 | + | ||
498 | + stored->has_ifname = true; | ||
499 | + stored->ifname = g_strdup(ifname); | ||
500 | + | ||
501 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
502 | "tap: ifname=%s", ifname); | ||
503 | |||
504 | diff --git a/net/tap.c b/net/tap.c | ||
505 | index XXXXXXX..XXXXXXX 100644 | ||
506 | --- a/net/tap.c | ||
507 | +++ b/net/tap.c | ||
508 | @@ -XXX,XX +XXX,XX @@ int net_init_bridge(const Netdev *netdev, const char *name, | ||
509 | const char *helper, *br; | ||
510 | TAPState *s; | ||
511 | int fd, vnet_hdr; | ||
512 | + NetdevBridgeOptions *stored; | ||
513 | |||
514 | assert(netdev->type == NET_CLIENT_DRIVER_BRIDGE); | ||
515 | bridge = &netdev->u.bridge; | ||
516 | @@ -XXX,XX +XXX,XX @@ int net_init_bridge(const Netdev *netdev, const char *name, | ||
517 | } | ||
518 | s = net_tap_fd_init(peer, "bridge", name, fd, vnet_hdr); | ||
519 | |||
520 | + /* Store startup parameters */ | ||
521 | + s->nc.stored_config = g_new0(NetdevInfo, 1); | ||
522 | + s->nc.stored_config->type = NET_BACKEND_BRIDGE; | ||
523 | + stored = &s->nc.stored_config->u.bridge; | ||
524 | + | ||
525 | + if (br) { | ||
526 | + stored->has_br = true; | ||
527 | + stored->br = g_strdup(br); | ||
528 | + } | ||
529 | + | ||
530 | + if (helper) { | ||
531 | + stored->has_helper = true; | ||
532 | + stored->helper = g_strdup(helper); | ||
533 | + } | ||
534 | + | ||
535 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper, | ||
536 | br); | ||
537 | |||
538 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
539 | const char *model, const char *name, | ||
540 | const char *ifname, const char *script, | ||
541 | const char *downscript, const char *vhostfdname, | ||
542 | - int vnet_hdr, int fd, Error **errp) | ||
543 | + int vnet_hdr, int fd, NetdevInfo **common_stored, | ||
544 | + Error **errp) | ||
545 | { | ||
546 | Error *err = NULL; | ||
547 | TAPState *s = net_tap_fd_init(peer, model, name, fd, vnet_hdr); | ||
548 | int vhostfd; | ||
549 | + NetdevTapOptions *stored; | ||
550 | |||
551 | tap_set_sndbuf(s->fd, tap, &err); | ||
552 | if (err) { | ||
553 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
554 | return; | ||
555 | } | ||
556 | |||
557 | + /* Store startup parameters */ | ||
558 | + if (!*common_stored) { | ||
559 | + *common_stored = g_new0(NetdevInfo, 1); | ||
560 | + (*common_stored)->type = NET_BACKEND_TAP; | ||
561 | + s->nc.stored_config = *common_stored; | ||
562 | + } | ||
563 | + stored = &(*common_stored)->u.tap; | ||
564 | + | ||
565 | + if (tap->has_sndbuf && !stored->has_sndbuf) { | ||
566 | + stored->has_sndbuf = true; | ||
567 | + stored->sndbuf = tap->sndbuf; | ||
568 | + } | ||
569 | + | ||
570 | + if (vnet_hdr && !stored->has_vnet_hdr) { | ||
571 | + stored->has_vnet_hdr = true; | ||
572 | + stored->vnet_hdr = true; | ||
573 | + } | ||
574 | + | ||
575 | if (tap->has_fd || tap->has_fds) { | ||
576 | + if (!stored->has_fds) { | ||
577 | + stored->has_fds = true; | ||
578 | + stored->fds = g_strdup_printf("%d", fd); | ||
579 | + } else { | ||
580 | + char *tmp_s = stored->fds; | ||
581 | + stored->fds = g_strdup_printf("%s:%d", stored->fds, fd); | ||
582 | + g_free(tmp_s); | ||
583 | + } | ||
584 | + | ||
585 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), "fd=%d", fd); | ||
586 | } else if (tap->has_helper) { | ||
587 | + if (!stored->has_helper) { | ||
588 | + stored->has_helper = true; | ||
589 | + stored->helper = g_strdup(tap->helper); | ||
590 | + } | ||
591 | + | ||
592 | + if (!stored->has_br) { | ||
593 | + stored->has_br = true; | ||
594 | + stored->br = tap->has_br ? g_strdup(tap->br) : | ||
595 | + g_strdup(DEFAULT_BRIDGE_INTERFACE); | ||
596 | + } | ||
597 | + | ||
598 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s", | ||
599 | tap->helper); | ||
600 | } else { | ||
601 | + if (ifname && !stored->has_ifname) { | ||
602 | + stored->has_ifname = true; | ||
603 | + stored->ifname = g_strdup(ifname); | ||
604 | + } | ||
605 | + | ||
606 | + if (script && !stored->has_script) { | ||
607 | + stored->has_script = true; | ||
608 | + stored->script = g_strdup(script); | ||
609 | + } | ||
610 | + | ||
611 | + if (downscript && !stored->has_downscript) { | ||
612 | + stored->has_downscript = true; | ||
613 | + stored->downscript = g_strdup(downscript); | ||
614 | + } | ||
615 | + | ||
616 | snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
617 | "ifname=%s,script=%s,downscript=%s", ifname, script, | ||
618 | downscript); | ||
619 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
620 | vhostfdname || (tap->has_vhostforce && tap->vhostforce)) { | ||
621 | VhostNetOptions options; | ||
622 | |||
623 | + stored->has_vhost = true; | ||
624 | + stored->vhost = true; | ||
625 | + | ||
626 | + if (tap->has_vhostforce && tap->vhostforce) { | ||
627 | + stored->has_vhostforce = true; | ||
628 | + stored->vhostforce = true; | ||
629 | + } | ||
630 | + | ||
631 | options.backend_type = VHOST_BACKEND_TYPE_KERNEL; | ||
632 | options.net_backend = &s->nc; | ||
633 | if (tap->has_poll_us) { | ||
634 | + stored->has_poll_us = true; | ||
635 | + stored->poll_us = tap->poll_us; | ||
636 | + | ||
637 | options.busyloop_timeout = tap->poll_us; | ||
638 | } else { | ||
639 | options.busyloop_timeout = 0; | ||
640 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
641 | } | ||
642 | options.opaque = (void *)(uintptr_t)vhostfd; | ||
643 | |||
644 | + if (!stored->has_vhostfds) { | ||
645 | + stored->has_vhostfds = true; | ||
646 | + stored->vhostfds = g_strdup_printf("%d", vhostfd); | ||
647 | + } else { | ||
648 | + char *tmp_s = stored->vhostfds; | ||
649 | + stored->vhostfds = g_strdup_printf("%s:%d", stored->fds, vhostfd); | ||
650 | + g_free(tmp_s); | ||
651 | + } | ||
652 | + | ||
653 | s->vhost_net = vhost_net_init(&options); | ||
654 | if (!s->vhost_net) { | ||
655 | if (tap->has_vhostforce && tap->vhostforce) { | ||
656 | @@ -XXX,XX +XXX,XX @@ int net_init_tap(const Netdev *netdev, const char *name, | ||
657 | const char *vhostfdname; | ||
658 | char ifname[128]; | ||
659 | int ret = 0; | ||
660 | + NetdevInfo *common_stored = NULL; /* will store configuration */ | ||
661 | |||
662 | assert(netdev->type == NET_CLIENT_DRIVER_TAP); | ||
663 | tap = &netdev->u.tap; | ||
664 | @@ -XXX,XX +XXX,XX @@ int net_init_tap(const Netdev *netdev, const char *name, | ||
665 | |||
666 | net_init_tap_one(tap, peer, "tap", name, NULL, | ||
667 | script, downscript, | ||
668 | - vhostfdname, vnet_hdr, fd, &err); | ||
669 | + vhostfdname, vnet_hdr, fd, &common_stored, &err); | ||
670 | if (err) { | ||
671 | error_propagate(errp, err); | ||
672 | close(fd); | ||
673 | @@ -XXX,XX +XXX,XX @@ int net_init_tap(const Netdev *netdev, const char *name, | ||
674 | net_init_tap_one(tap, peer, "tap", name, ifname, | ||
675 | script, downscript, | ||
676 | tap->has_vhostfds ? vhost_fds[i] : NULL, | ||
677 | - vnet_hdr, fd, &err); | ||
678 | + vnet_hdr, fd, &common_stored, &err); | ||
679 | if (err) { | ||
680 | error_propagate(errp, err); | ||
681 | ret = -1; | ||
682 | @@ -XXX,XX +XXX,XX @@ free_fail: | ||
683 | |||
684 | net_init_tap_one(tap, peer, "bridge", name, ifname, | ||
685 | script, downscript, vhostfdname, | ||
686 | - vnet_hdr, fd, &err); | ||
687 | + vnet_hdr, fd, &common_stored, &err); | ||
688 | if (err) { | ||
689 | error_propagate(errp, err); | ||
690 | close(fd); | ||
691 | @@ -XXX,XX +XXX,XX @@ free_fail: | ||
692 | net_init_tap_one(tap, peer, "tap", name, ifname, | ||
693 | i >= 1 ? "no" : script, | ||
694 | i >= 1 ? "no" : downscript, | ||
695 | - vhostfdname, vnet_hdr, fd, &err); | ||
696 | + vhostfdname, vnet_hdr, fd, | ||
697 | + &common_stored, &err); | ||
698 | if (err) { | ||
699 | error_propagate(errp, err); | ||
700 | close(fd); | ||
701 | diff --git a/net/vde.c b/net/vde.c | ||
702 | index XXXXXXX..XXXXXXX 100644 | ||
703 | --- a/net/vde.c | ||
704 | +++ b/net/vde.c | ||
705 | @@ -XXX,XX +XXX,XX @@ static int net_vde_init(NetClientState *peer, const char *model, | ||
706 | VDECONN *vde; | ||
707 | char *init_group = (char *)group; | ||
708 | char *init_sock = (char *)sock; | ||
709 | + NetdevVdeOptions *stored; | ||
710 | |||
711 | struct vde_open_args args = { | ||
712 | .port = port, | ||
713 | @@ -XXX,XX +XXX,XX @@ static int net_vde_init(NetClientState *peer, const char *model, | ||
714 | |||
715 | qemu_set_fd_handler(vde_datafd(s->vde), vde_to_qemu, NULL, s); | ||
716 | |||
717 | + /* Store startup parameters */ | ||
718 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
719 | + nc->stored_config->type = NET_BACKEND_VDE; | ||
720 | + stored = &nc->stored_config->u.vde; | ||
721 | + | ||
722 | + if (sock) { | ||
723 | + stored->has_sock = true; | ||
724 | + stored->sock = g_strdup(sock); | ||
725 | + } | ||
726 | + | ||
727 | + stored->has_port = true; | ||
728 | + stored->port = port; | ||
729 | + | ||
730 | + if (group) { | ||
731 | + stored->has_group = true; | ||
732 | + stored->group = g_strdup(group); | ||
733 | + } | ||
734 | + | ||
735 | + stored->has_mode = true; | ||
736 | + stored->mode = mode; | ||
737 | + | ||
738 | return 0; | ||
739 | } | ||
740 | |||
741 | diff --git a/net/vhost-user.c b/net/vhost-user.c | ||
742 | index XXXXXXX..XXXXXXX 100644 | ||
743 | --- a/net/vhost-user.c | ||
744 | +++ b/net/vhost-user.c | ||
745 | @@ -XXX,XX +XXX,XX @@ static void net_vhost_user_event(void *opaque, QEMUChrEvent event) | ||
746 | } | ||
747 | |||
748 | static int net_vhost_user_init(NetClientState *peer, const char *device, | ||
749 | - const char *name, Chardev *chr, | ||
750 | - int queues) | ||
751 | + const char *name, const char *chardev, | ||
752 | + Chardev *chr, int queues) | ||
753 | { | ||
754 | Error *err = NULL; | ||
755 | NetClientState *nc, *nc0 = NULL; | ||
756 | NetVhostUserState *s = NULL; | ||
757 | VhostUserState *user; | ||
758 | int i; | ||
759 | + NetdevVhostUserOptions *stored; | ||
760 | |||
761 | assert(name); | ||
762 | assert(queues > 0); | ||
763 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_user_init(NetClientState *peer, const char *device, | ||
764 | |||
765 | assert(s->vhost_net); | ||
766 | |||
767 | + /* Store startup parameters */ | ||
768 | + nc0->stored_config = g_new0(NetdevInfo, 1); | ||
769 | + nc0->stored_config->type = NET_BACKEND_VHOST_USER; | ||
770 | + stored = &nc0->stored_config->u.vhost_user; | ||
771 | + | ||
772 | + stored->chardev = g_strdup(chardev); | ||
773 | + | ||
774 | + stored->has_queues = true; | ||
775 | + stored->queues = queues; | ||
776 | + | ||
777 | return 0; | ||
778 | |||
779 | err: | ||
780 | @@ -XXX,XX +XXX,XX @@ int net_init_vhost_user(const Netdev *netdev, const char *name, | ||
781 | return -1; | ||
782 | } | ||
783 | |||
784 | - return net_vhost_user_init(peer, "vhost_user", name, chr, queues); | ||
785 | + return net_vhost_user_init(peer, "vhost_user", name, | ||
786 | + vhost_user_opts->chardev, chr, queues); | ||
787 | } | ||
788 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | ||
789 | index XXXXXXX..XXXXXXX 100644 | ||
790 | --- a/net/vhost-vdpa.c | ||
791 | +++ b/net/vhost-vdpa.c | ||
792 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_vdpa_init(NetClientState *peer, const char *device, | ||
793 | VhostVDPAState *s; | ||
794 | int vdpa_device_fd = -1; | ||
795 | int ret = 0; | ||
796 | + NetdevVhostVDPAOptions *stored; | ||
797 | + | ||
798 | assert(name); | ||
799 | nc = qemu_new_net_client(&net_vhost_vdpa_info, peer, device, name); | ||
800 | + | ||
801 | + /* Store startup parameters */ | ||
802 | + nc->stored_config = g_new0(NetdevInfo, 1); | ||
803 | + nc->stored_config->type = NET_BACKEND_VHOST_VDPA; | ||
804 | + stored = &nc->stored_config->u.vhost_vdpa; | ||
805 | + | ||
806 | + stored->has_vhostdev = true; | ||
807 | + stored->vhostdev = g_strdup(vhostdev); | ||
808 | + | ||
809 | + stored->has_queues = true; | ||
810 | + stored->queues = 1; /* TODO: change when support multiqueue */ | ||
811 | + | ||
812 | snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA); | ||
813 | nc->queue_index = 0; | ||
814 | s = DO_UPCAST(VhostVDPAState, nc, nc); | ||
815 | diff --git a/qapi/net.json b/qapi/net.json | ||
816 | index XXXXXXX..XXXXXXX 100644 | ||
817 | --- a/qapi/net.json | ||
818 | +++ b/qapi/net.json | ||
819 | @@ -XXX,XX +XXX,XX @@ | ||
820 | ## | ||
821 | { 'event': 'FAILOVER_NEGOTIATED', | ||
822 | 'data': {'device-id': 'str'} } | ||
823 | + | ||
824 | +## | ||
825 | +# @NetBackend: | ||
826 | +# | ||
827 | +# Available netdev backend drivers. | ||
828 | +# | ||
829 | +# Since: 6.0 | ||
830 | +## | ||
831 | +{ 'enum': 'NetBackend', | ||
832 | + 'data': [ 'bridge', 'l2tpv3', 'netmap', 'socket', 'tap', 'user', 'vde', | ||
833 | + 'vhost-user', 'vhost-vdpa' ] } | ||
834 | + | ||
835 | +## | ||
836 | +# @NetdevInfo: | ||
837 | +# | ||
838 | +# Configuration of a network backend device (netdev). | ||
839 | +# | ||
840 | +# @id: Device identifier. | ||
841 | +# | ||
842 | +# @type: Specify the driver used for interpreting remaining arguments. | ||
843 | +# | ||
844 | +# @peer-id: The connected frontend network device name (absent if no frontend | ||
845 | +# is connected). | ||
846 | +# | ||
847 | +# Since: 6.0 | ||
848 | +## | ||
849 | +{ 'union': 'NetdevInfo', | ||
850 | + 'base': { 'id': 'str', | ||
851 | + 'type': 'NetBackend', | ||
852 | + '*peer-id': 'str' }, | ||
853 | + 'discriminator': 'type', | ||
854 | + 'data': { | ||
855 | + 'bridge': 'NetdevBridgeOptions', | ||
856 | + 'l2tpv3': 'NetdevL2TPv3Options', | ||
857 | + 'netmap': 'NetdevNetmapOptions', | ||
858 | + 'socket': 'NetdevSocketOptions', | ||
859 | + 'tap': 'NetdevTapOptions', | ||
860 | + 'user': 'NetdevUserOptions', | ||
861 | + 'vde': 'NetdevVdeOptions', | ||
862 | + 'vhost-user': 'NetdevVhostUserOptions', | ||
863 | + 'vhost-vdpa': 'NetdevVhostVDPAOptions' } } | ||
864 | + | ||
865 | +## | ||
866 | +# @query-netdev: | ||
867 | +# | ||
868 | +# Get a list of @NetdevInfo for all virtual network backend devices (netdevs). | ||
869 | +# | ||
870 | +# Returns: a list of @NetdevInfo describing each netdev. | ||
871 | +# | ||
872 | +# Since: 6.0 | ||
873 | +# | ||
874 | +# Example: | ||
875 | +# | ||
876 | +# -> { "execute": "query-netdev" } | ||
877 | +# <- { "return": [ | ||
878 | +# { | ||
879 | +# "ipv6": true, | ||
880 | +# "ipv4": true, | ||
881 | +# "host": "10.0.2.2", | ||
882 | +# "ipv6-dns": "fec0::3", | ||
883 | +# "ipv6-prefix": "fec0::", | ||
884 | +# "net": "10.0.2.0/255.255.255.0", | ||
885 | +# "ipv6-host": "fec0::2", | ||
886 | +# "type": "user", | ||
887 | +# "peer-id": "net0", | ||
888 | +# "dns": "10.0.2.3", | ||
889 | +# "hostfwd": [ | ||
890 | +# { | ||
891 | +# "str": "tcp::20004-:22" | ||
892 | +# } | ||
893 | +# ], | ||
894 | +# "ipv6-prefixlen": 64, | ||
895 | +# "id": "netdev0", | ||
896 | +# "restrict": false | ||
897 | +# } | ||
898 | +# ] | ||
899 | +# } | ||
900 | +# | ||
901 | +## | ||
902 | +{ 'command': 'query-netdev', 'returns': ['NetdevInfo'] } | ||
903 | -- | 125 | -- |
904 | 2.7.4 | 126 | 2.7.4 |
905 | 127 | ||
906 | 128 | diff view generated by jsdifflib |
1 | From: Alexander Bulekov <alxndr@bu.edu> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This patch switches to use qemu_receive_packet() which can detect | 3 | It reports the shadow virtqueue address from qemu virtual address space. |
4 | reentrancy and return early. | ||
5 | 4 | ||
6 | This is intended to address CVE-2021-3416. | 5 | Since this will be different from the guest's vaddr, but the device can |
6 | access it, SVQ takes special care about its alignment & lack of garbage | ||
7 | data. It assumes that IOMMU will work in host_page_size ranges for that. | ||
7 | 8 | ||
8 | Cc: Prasad J Pandit <ppandit@redhat.com> | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | Cc: qemu-stable@nongnu.org | 10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
10 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
11 | Signed-off-by: Alexander Bulekov <alxndr@bu.edu> | ||
12 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
13 | --- | 12 | --- |
14 | hw/net/cadence_gem.c | 4 ++-- | 13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ |
15 | 1 file changed, 2 insertions(+), 2 deletions(-) | 14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ |
15 | 2 files changed, 38 insertions(+) | ||
16 | 16 | ||
17 | diff --git a/hw/net/cadence_gem.c b/hw/net/cadence_gem.c | 17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
18 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
19 | --- a/hw/net/cadence_gem.c | 19 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
20 | +++ b/hw/net/cadence_gem.c | 20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
21 | @@ -XXX,XX +XXX,XX @@ static void gem_transmit(CadenceGEMState *s) | 21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) |
22 | /* Send the packet somewhere */ | 22 | } |
23 | if (s->phy_loop || (s->regs[GEM_NWCTRL] & | 23 | |
24 | GEM_NWCTRL_LOCALLOOP)) { | 24 | /** |
25 | - gem_receive(qemu_get_queue(s->nic), s->tx_packet, | 25 | + * Get the shadow vq vring address. |
26 | - total_bytes); | 26 | + * @svq: Shadow virtqueue |
27 | + qemu_receive_packet(qemu_get_queue(s->nic), s->tx_packet, | 27 | + * @addr: Destination to store address |
28 | + total_bytes); | 28 | + */ |
29 | } else { | 29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, |
30 | qemu_send_packet(qemu_get_queue(s->nic), s->tx_packet, | 30 | + struct vhost_vring_addr *addr) |
31 | total_bytes); | 31 | +{ |
32 | + addr->desc_user_addr = (uint64_t)svq->vring.desc; | ||
33 | + addr->avail_user_addr = (uint64_t)svq->vring.avail; | ||
34 | + addr->used_user_addr = (uint64_t)svq->vring.used; | ||
35 | +} | ||
36 | + | ||
37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) | ||
38 | +{ | ||
39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
40 | + size_t avail_size = offsetof(vring_avail_t, ring) + | ||
41 | + sizeof(uint16_t) * svq->vring.num; | ||
42 | + | ||
43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); | ||
44 | +} | ||
45 | + | ||
46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) | ||
47 | +{ | ||
48 | + size_t used_size = offsetof(vring_used_t, ring) + | ||
49 | + sizeof(vring_used_elem_t) * svq->vring.num; | ||
50 | + return ROUND_UP(used_size, qemu_real_host_page_size); | ||
51 | +} | ||
52 | + | ||
53 | +/** | ||
54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
55 | * | ||
56 | * @svq: The svq | ||
57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
58 | index XXXXXXX..XXXXXXX 100644 | ||
59 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
61 | @@ -XXX,XX +XXX,XX @@ | ||
62 | #define VHOST_SHADOW_VIRTQUEUE_H | ||
63 | |||
64 | #include "qemu/event_notifier.h" | ||
65 | +#include "hw/virtio/virtio.h" | ||
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
72 | + | ||
73 | /* Shadow kick notifier, sent to vhost */ | ||
74 | EventNotifier hdev_kick; | ||
75 | /* Shadow call notifier, sent to vhost */ | ||
76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
77 | |||
78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
81 | + struct vhost_vring_addr *addr); | ||
82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
84 | |||
85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
86 | |||
32 | -- | 87 | -- |
33 | 2.7.4 | 88 | 2.7.4 |
34 | 89 | ||
35 | 90 | diff view generated by jsdifflib |
1 | From: Alexey Kirillov <lekiravi@yandex-team.ru> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The info_str field of the NetClientState structure is static and has a size | 3 | First half of the buffers forwarding part, preparing vhost-vdpa |
4 | of 256 bytes. This amount is often unclaimed, and the field itself is used | 4 | callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so |
5 | exclusively for HMP "info network". | 5 | this is effectively dead code at the moment, but it helps to reduce |
6 | patch size. | ||
6 | 7 | ||
7 | The patch translates info_str to dynamic memory allocation. | 8 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
8 | 9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | |
9 | This action is also allows us to painlessly discard usage of this field | ||
10 | for backend devices. | ||
11 | |||
12 | Signed-off-by: Alexey Kirillov <lekiravi@yandex-team.ru> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 10 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 11 | --- |
15 | hw/net/xen_nic.c | 5 ++--- | 12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- |
16 | include/net/net.h | 2 +- | 13 | 1 file changed, 41 insertions(+), 7 deletions(-) |
17 | net/l2tpv3.c | 3 +-- | ||
18 | net/net.c | 14 ++++++++------ | ||
19 | net/slirp.c | 5 ++--- | ||
20 | net/socket.c | 43 ++++++++++++++++++++++++------------------- | ||
21 | net/tap-win32.c | 3 +-- | ||
22 | net/tap.c | 13 +++++-------- | ||
23 | net/vde.c | 3 +-- | ||
24 | net/vhost-user.c | 3 +-- | ||
25 | net/vhost-vdpa.c | 2 +- | ||
26 | 11 files changed, 47 insertions(+), 49 deletions(-) | ||
27 | 14 | ||
28 | diff --git a/hw/net/xen_nic.c b/hw/net/xen_nic.c | 15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
29 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
30 | --- a/hw/net/xen_nic.c | 17 | --- a/hw/virtio/vhost-vdpa.c |
31 | +++ b/hw/net/xen_nic.c | 18 | +++ b/hw/virtio/vhost-vdpa.c |
32 | @@ -XXX,XX +XXX,XX @@ static int net_init(struct XenLegacyDevice *xendev) | 19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
33 | netdev->nic = qemu_new_nic(&net_xen_info, &netdev->conf, | 20 | return ret; |
34 | "xen", NULL, netdev); | 21 | } |
35 | 22 | ||
36 | - snprintf(qemu_get_queue(netdev->nic)->info_str, | 23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, |
37 | - sizeof(qemu_get_queue(netdev->nic)->info_str), | 24 | + struct vhost_vring_state *ring) |
38 | - "nic: xenbus vif macaddr=%s", netdev->mac); | 25 | +{ |
39 | + qemu_get_queue(netdev->nic)->info_str = g_strdup_printf( | 26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); |
40 | + "nic: xenbus vif macaddr=%s", netdev->mac); | 27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); |
41 | 28 | +} | |
42 | /* fill info */ | 29 | + |
43 | xenstore_write_be_int(&netdev->xendev, "feature-rx-copy", 1); | 30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, |
44 | diff --git a/include/net/net.h b/include/net/net.h | 31 | struct vhost_vring_file *file) |
45 | index XXXXXXX..XXXXXXX 100644 | ||
46 | --- a/include/net/net.h | ||
47 | +++ b/include/net/net.h | ||
48 | @@ -XXX,XX +XXX,XX @@ struct NetClientState { | ||
49 | NetQueue *incoming_queue; | ||
50 | char *model; | ||
51 | char *name; | ||
52 | - char info_str[256]; | ||
53 | + char *info_str; | ||
54 | NetdevInfo *stored_config; | ||
55 | unsigned receive_disabled : 1; | ||
56 | NetClientDestructor *destructor; | ||
57 | diff --git a/net/l2tpv3.c b/net/l2tpv3.c | ||
58 | index XXXXXXX..XXXXXXX 100644 | ||
59 | --- a/net/l2tpv3.c | ||
60 | +++ b/net/l2tpv3.c | ||
61 | @@ -XXX,XX +XXX,XX @@ int net_init_l2tpv3(const Netdev *netdev, | ||
62 | QAPI_CLONE_MEMBERS(NetdevL2TPv3Options, | ||
63 | &nc->stored_config->u.l2tpv3, l2tpv3); | ||
64 | |||
65 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
66 | - "l2tpv3: connected"); | ||
67 | + s->nc.info_str = g_strdup_printf("l2tpv3: connected"); | ||
68 | return 0; | ||
69 | outerr: | ||
70 | qemu_del_net_client(nc); | ||
71 | diff --git a/net/net.c b/net/net.c | ||
72 | index XXXXXXX..XXXXXXX 100644 | ||
73 | --- a/net/net.c | ||
74 | +++ b/net/net.c | ||
75 | @@ -XXX,XX +XXX,XX @@ char *qemu_mac_strdup_printf(const uint8_t *macaddr) | ||
76 | |||
77 | void qemu_format_nic_info_str(NetClientState *nc, uint8_t macaddr[6]) | ||
78 | { | 32 | { |
79 | - snprintf(nc->info_str, sizeof(nc->info_str), | 33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, |
80 | - "model=%s,macaddr=%02x:%02x:%02x:%02x:%02x:%02x", | 34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); |
81 | - nc->model, | ||
82 | - macaddr[0], macaddr[1], macaddr[2], | ||
83 | - macaddr[3], macaddr[4], macaddr[5]); | ||
84 | + g_free(nc->info_str); | ||
85 | + nc->info_str = g_strdup_printf( | ||
86 | + "model=%s,macaddr=%02x:%02x:%02x:%02x:%02x:%02x", | ||
87 | + nc->model, | ||
88 | + macaddr[0], macaddr[1], macaddr[2], | ||
89 | + macaddr[3], macaddr[4], macaddr[5]); | ||
90 | } | 35 | } |
91 | 36 | ||
92 | static int mac_table[256] = {0}; | 37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, |
93 | @@ -XXX,XX +XXX,XX @@ static void qemu_free_net_client(NetClientState *nc) | 38 | + struct vhost_vring_addr *addr) |
94 | } | 39 | +{ |
95 | g_free(nc->name); | 40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
96 | g_free(nc->model); | 41 | + addr->desc_user_addr, addr->used_user_addr, |
97 | + g_free(nc->info_str); | 42 | + addr->avail_user_addr, |
98 | qapi_free_NetdevInfo(nc->stored_config); | 43 | + addr->log_guest_addr); |
99 | if (nc->destructor) { | 44 | + |
100 | nc->destructor(nc); | 45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
101 | @@ -XXX,XX +XXX,XX @@ void print_net_client(Monitor *mon, NetClientState *nc) | 46 | + |
102 | monitor_printf(mon, "%s: index=%d,type=%s,%s\n", nc->name, | 47 | +} |
103 | nc->queue_index, | 48 | + |
104 | NetClientDriver_str(nc->info->type), | 49 | /** |
105 | - nc->info_str); | 50 | * Set the shadow virtqueue descriptors to the device |
106 | + nc->info_str ? nc->info_str : ""); | 51 | * |
107 | if (!QTAILQ_EMPTY(&nc->filters)) { | 52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
108 | monitor_printf(mon, "filters:\n"); | 53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, |
109 | } | 54 | struct vhost_vring_addr *addr) |
110 | diff --git a/net/slirp.c b/net/slirp.c | 55 | { |
111 | index XXXXXXX..XXXXXXX 100644 | 56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
112 | --- a/net/slirp.c | 57 | - addr->desc_user_addr, addr->used_user_addr, |
113 | +++ b/net/slirp.c | 58 | - addr->avail_user_addr, |
114 | @@ -XXX,XX +XXX,XX @@ static int net_slirp_init(NetClientState *peer, const char *model, | 59 | - addr->log_guest_addr); |
115 | stored->tftp_server_name = g_strdup(tftp_server_name); | 60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
116 | } | 61 | + struct vhost_vdpa *v = dev->opaque; |
117 | 62 | + | |
118 | - snprintf(nc->info_str, sizeof(nc->info_str), | 63 | + if (v->shadow_vqs_enabled) { |
119 | - "net=%s,restrict=%s", inet_ntoa(net), | 64 | + /* |
120 | - restricted ? "on" : "off"); | 65 | + * Device vring addr was set at device start. SVQ base is handled by |
121 | + nc->info_str = g_strdup_printf("net=%s,restrict=%s", inet_ntoa(net), | 66 | + * VirtQueue code. |
122 | + restricted ? "on" : "off"); | 67 | + */ |
123 | 68 | + return 0; | |
124 | s = DO_UPCAST(SlirpState, nc, nc); | 69 | + } |
125 | 70 | + | |
126 | diff --git a/net/socket.c b/net/socket.c | 71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); |
127 | index XXXXXXX..XXXXXXX 100644 | ||
128 | --- a/net/socket.c | ||
129 | +++ b/net/socket.c | ||
130 | @@ -XXX,XX +XXX,XX @@ static void net_socket_send(void *opaque) | ||
131 | s->fd = -1; | ||
132 | net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); | ||
133 | s->nc.link_down = true; | ||
134 | - memset(s->nc.info_str, 0, sizeof(s->nc.info_str)); | ||
135 | + g_free(s->nc.info_str); | ||
136 | + s->nc.info_str = g_new0(char, 1); | ||
137 | |||
138 | return; | ||
139 | } | ||
140 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, | ||
141 | stored->mcast = g_strdup(mcast); | ||
142 | |||
143 | s->dgram_dst = saddr; | ||
144 | - snprintf(nc->info_str, sizeof(nc->info_str), | ||
145 | - "socket: fd=%d (cloned mcast=%s:%d)", | ||
146 | - fd, inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
147 | + nc->info_str = g_strdup_printf("socket: fd=%d (cloned mcast=%s:%d)", | ||
148 | + fd, inet_ntoa(saddr.sin_addr), | ||
149 | + ntohs(saddr.sin_port)); | ||
150 | } else { | ||
151 | if (sa_type == SOCKET_ADDRESS_TYPE_UNIX) { | ||
152 | s->dgram_dst.sin_family = AF_UNIX; | ||
153 | } | ||
154 | |||
155 | - snprintf(nc->info_str, sizeof(nc->info_str), | ||
156 | - "socket: fd=%d %s", fd, SocketAddressType_str(sa_type)); | ||
157 | + nc->info_str = g_strdup_printf("socket: fd=%d %s", | ||
158 | + fd, SocketAddressType_str(sa_type)); | ||
159 | } | ||
160 | |||
161 | return s; | ||
162 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, | ||
163 | |||
164 | nc = qemu_new_net_client(&net_socket_info, peer, model, name); | ||
165 | |||
166 | - snprintf(nc->info_str, sizeof(nc->info_str), "socket: fd=%d", fd); | ||
167 | + nc->info_str = g_strdup_printf("socket: fd=%d", fd); | ||
168 | |||
169 | s = DO_UPCAST(NetSocketState, nc, nc); | ||
170 | |||
171 | @@ -XXX,XX +XXX,XX @@ static void net_socket_accept(void *opaque) | ||
172 | stored->has_fd = true; | ||
173 | stored->fd = g_strdup_printf("%d", fd); | ||
174 | |||
175 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
176 | - "socket: connection from %s:%d", | ||
177 | - inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
178 | + g_free(s->nc.info_str); | ||
179 | + s->nc.info_str = g_strdup_printf("socket: connection from %s:%d", | ||
180 | + inet_ntoa(saddr.sin_addr), | ||
181 | + ntohs(saddr.sin_port)); | ||
182 | } | 72 | } |
183 | 73 | ||
184 | static int net_socket_listen_init(NetClientState *peer, | 74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
185 | @@ -XXX,XX +XXX,XX @@ static int net_socket_connect_init(NetClientState *peer, | 75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
186 | stored->has_connect = true; | 76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
187 | stored->connect = g_strdup(host_str); | 77 | struct vhost_vring_state *ring) |
188 | 78 | { | |
189 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | 79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); |
190 | - "socket: connect to %s:%d", | 80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); |
191 | - inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | 81 | + struct vhost_vdpa *v = dev->opaque; |
192 | + g_free(s->nc.info_str); | 82 | + |
193 | + s->nc.info_str = g_strdup_printf("socket: connect to %s:%d", | 83 | + if (v->shadow_vqs_enabled) { |
194 | + inet_ntoa(saddr.sin_addr), | 84 | + /* |
195 | + ntohs(saddr.sin_port)); | 85 | + * Device vring base was set at device start. SVQ base is handled by |
196 | return 0; | 86 | + * VirtQueue code. |
87 | + */ | ||
88 | + return 0; | ||
89 | + } | ||
90 | + | ||
91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); | ||
197 | } | 92 | } |
198 | 93 | ||
199 | @@ -XXX,XX +XXX,XX @@ static int net_socket_mcast_init(NetClientState *peer, | 94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
200 | stored->localaddr = g_strdup(localaddr_str); | ||
201 | } | ||
202 | |||
203 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
204 | - "socket: mcast=%s:%d", | ||
205 | - inet_ntoa(saddr.sin_addr), ntohs(saddr.sin_port)); | ||
206 | + g_free(s->nc.info_str); | ||
207 | + s->nc.info_str = g_strdup_printf("socket: mcast=%s:%d", | ||
208 | + inet_ntoa(saddr.sin_addr), | ||
209 | + ntohs(saddr.sin_port)); | ||
210 | return 0; | ||
211 | |||
212 | } | ||
213 | @@ -XXX,XX +XXX,XX @@ static int net_socket_udp_init(NetClientState *peer, | ||
214 | stored->has_udp = true; | ||
215 | stored->udp = g_strdup(rhost); | ||
216 | |||
217 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
218 | - "socket: udp=%s:%d", | ||
219 | - inet_ntoa(raddr.sin_addr), ntohs(raddr.sin_port)); | ||
220 | + g_free(s->nc.info_str); | ||
221 | + s->nc.info_str = g_strdup_printf("socket: udp=%s:%d", | ||
222 | + inet_ntoa(raddr.sin_addr), | ||
223 | + ntohs(raddr.sin_port)); | ||
224 | return 0; | ||
225 | } | ||
226 | |||
227 | diff --git a/net/tap-win32.c b/net/tap-win32.c | ||
228 | index XXXXXXX..XXXXXXX 100644 | ||
229 | --- a/net/tap-win32.c | ||
230 | +++ b/net/tap-win32.c | ||
231 | @@ -XXX,XX +XXX,XX @@ static int tap_win32_init(NetClientState *peer, const char *model, | ||
232 | stored->has_ifname = true; | ||
233 | stored->ifname = g_strdup(ifname); | ||
234 | |||
235 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
236 | - "tap: ifname=%s", ifname); | ||
237 | + s->nc.info_str = g_strdup_printf("tap: ifname=%s", ifname); | ||
238 | |||
239 | s->handle = handle; | ||
240 | |||
241 | diff --git a/net/tap.c b/net/tap.c | ||
242 | index XXXXXXX..XXXXXXX 100644 | ||
243 | --- a/net/tap.c | ||
244 | +++ b/net/tap.c | ||
245 | @@ -XXX,XX +XXX,XX @@ int net_init_bridge(const Netdev *netdev, const char *name, | ||
246 | stored->helper = g_strdup(helper); | ||
247 | } | ||
248 | |||
249 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s,br=%s", helper, | ||
250 | - br); | ||
251 | + s->nc.info_str = g_strdup_printf("helper=%s,br=%s", helper, br); | ||
252 | |||
253 | return 0; | ||
254 | } | ||
255 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
256 | g_free(tmp_s); | ||
257 | } | ||
258 | |||
259 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), "fd=%d", fd); | ||
260 | + s->nc.info_str = g_strdup_printf("fd=%d", fd); | ||
261 | } else if (tap->has_helper) { | ||
262 | if (!stored->has_helper) { | ||
263 | stored->has_helper = true; | ||
264 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
265 | g_strdup(DEFAULT_BRIDGE_INTERFACE); | ||
266 | } | ||
267 | |||
268 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), "helper=%s", | ||
269 | - tap->helper); | ||
270 | + s->nc.info_str = g_strdup_printf("helper=%s", tap->helper); | ||
271 | } else { | ||
272 | if (ifname && !stored->has_ifname) { | ||
273 | stored->has_ifname = true; | ||
274 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
275 | stored->downscript = g_strdup(downscript); | ||
276 | } | ||
277 | |||
278 | - snprintf(s->nc.info_str, sizeof(s->nc.info_str), | ||
279 | - "ifname=%s,script=%s,downscript=%s", ifname, script, | ||
280 | - downscript); | ||
281 | + s->nc.info_str = g_strdup_printf("ifname=%s,script=%s,downscript=%s", | ||
282 | + ifname, script, downscript); | ||
283 | |||
284 | if (strcmp(downscript, "no") != 0) { | ||
285 | snprintf(s->down_script, sizeof(s->down_script), "%s", downscript); | ||
286 | diff --git a/net/vde.c b/net/vde.c | ||
287 | index XXXXXXX..XXXXXXX 100644 | ||
288 | --- a/net/vde.c | ||
289 | +++ b/net/vde.c | ||
290 | @@ -XXX,XX +XXX,XX @@ static int net_vde_init(NetClientState *peer, const char *model, | ||
291 | |||
292 | nc = qemu_new_net_client(&net_vde_info, peer, model, name); | ||
293 | |||
294 | - snprintf(nc->info_str, sizeof(nc->info_str), "sock=%s,fd=%d", | ||
295 | - sock, vde_datafd(vde)); | ||
296 | + nc->info_str = g_strdup_printf("sock=%s,fd=%d", sock, vde_datafd(vde)); | ||
297 | |||
298 | s = DO_UPCAST(VDEState, nc, nc); | ||
299 | |||
300 | diff --git a/net/vhost-user.c b/net/vhost-user.c | ||
301 | index XXXXXXX..XXXXXXX 100644 | ||
302 | --- a/net/vhost-user.c | ||
303 | +++ b/net/vhost-user.c | ||
304 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_user_init(NetClientState *peer, const char *device, | ||
305 | user = g_new0(struct VhostUserState, 1); | ||
306 | for (i = 0; i < queues; i++) { | ||
307 | nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name); | ||
308 | - snprintf(nc->info_str, sizeof(nc->info_str), "vhost-user%d to %s", | ||
309 | - i, chr->label); | ||
310 | + nc->info_str = g_strdup_printf("vhost-user%d to %s", i, chr->label); | ||
311 | nc->queue_index = i; | ||
312 | if (!nc0) { | ||
313 | nc0 = nc; | ||
314 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | ||
315 | index XXXXXXX..XXXXXXX 100644 | ||
316 | --- a/net/vhost-vdpa.c | ||
317 | +++ b/net/vhost-vdpa.c | ||
318 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_vdpa_init(NetClientState *peer, const char *device, | ||
319 | stored->has_queues = true; | ||
320 | stored->queues = 1; /* TODO: change when support multiqueue */ | ||
321 | |||
322 | - snprintf(nc->info_str, sizeof(nc->info_str), TYPE_VHOST_VDPA); | ||
323 | + nc->info_str = g_strdup_printf(TYPE_VHOST_VDPA); | ||
324 | nc->queue_index = 0; | ||
325 | s = DO_UPCAST(VhostVDPAState, nc, nc); | ||
326 | vdpa_device_fd = qemu_open_old(vhostdev, O_RDWR); | ||
327 | -- | 95 | -- |
328 | 2.7.4 | 96 | 2.7.4 |
329 | 97 | ||
330 | 98 | diff view generated by jsdifflib |
1 | From: Cornelia Huck <cohuck@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The pvrdma code relies on the pvrdma_ring.h kernel header for some | 3 | Initial version of shadow virtqueue that actually forward buffers. There |
4 | basic ring buffer handling. The content of that header isn't very | 4 | is no iommu support at the moment, and that will be addressed in future |
5 | exciting, but contains some (q)atomic_*() invocations that (a) | 5 | patches of this series. Since all vhost-vdpa devices use forced IOMMU, |
6 | cause manual massaging when doing a headers update, and (b) are | 6 | this means that SVQ is not usable at this point of the series on any |
7 | an indication that we probably should not be importing that header | 7 | device. |
8 | at all. | 8 | |
9 | 9 | For simplicity it only supports modern devices, that expects vring | |
10 | Let's reimplement the ring buffer handling directly in the pvrdma | 10 | in little endian, with split ring and no event idx or indirect |
11 | code instead. This arguably also improves readability of the code. | 11 | descriptors. Support for them will not be added in this series. |
12 | 12 | ||
13 | Importing the header can now be dropped. | 13 | It reuses the VirtQueue code for the device part. The driver part is |
14 | 14 | based on Linux's virtio_ring driver, but with stripped functionality | |
15 | Signed-off-by: Cornelia Huck <cohuck@redhat.com> | 15 | and optimizations so it's easier to review. |
16 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | 16 | |
17 | Reviewed-by: Yuval Shaia <yuval.shaia.ml@gmail.com> | 17 | However, forwarding buffers have some particular pieces: One of the most |
18 | Tested-by: Yuval Shaia <yuval.shaia.ml@gmail.com> | 18 | unexpected ones is that a guest's buffer can expand through more than |
19 | one descriptor in SVQ. While this is handled gracefully by qemu's | ||
20 | emulated virtio devices, it may cause unexpected SVQ queue full. This | ||
21 | patch also solves it by checking for this condition at both guest's | ||
22 | kicks and device's calls. The code may be more elegant in the future if | ||
23 | SVQ code runs in its own iocontext. | ||
24 | |||
25 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
26 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
19 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 27 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
20 | --- | 28 | --- |
21 | hw/rdma/vmw/pvrdma.h | 5 +- | 29 | hw/virtio/vhost-shadow-virtqueue.c | 354 ++++++++++++++++++++++++++++++++++++- |
22 | hw/rdma/vmw/pvrdma_cmd.c | 6 +- | 30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ |
23 | hw/rdma/vmw/pvrdma_dev_ring.c | 41 ++++---- | 31 | hw/virtio/vhost-vdpa.c | 159 ++++++++++++++++- |
24 | hw/rdma/vmw/pvrdma_dev_ring.h | 9 +- | 32 | 3 files changed, 527 insertions(+), 12 deletions(-) |
25 | hw/rdma/vmw/pvrdma_main.c | 4 +- | 33 | |
26 | .../drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | 114 --------------------- | 34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
27 | scripts/update-linux-headers.sh | 3 +- | ||
28 | 7 files changed, 38 insertions(+), 144 deletions(-) | ||
29 | delete mode 100644 include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | ||
30 | |||
31 | diff --git a/hw/rdma/vmw/pvrdma.h b/hw/rdma/vmw/pvrdma.h | ||
32 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
33 | --- a/hw/rdma/vmw/pvrdma.h | 36 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
34 | +++ b/hw/rdma/vmw/pvrdma.h | 37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
35 | @@ -XXX,XX +XXX,XX @@ | 38 | @@ -XXX,XX +XXX,XX @@ |
36 | #include "../rdma_backend_defs.h" | 39 | #include "qemu/error-report.h" |
37 | #include "../rdma_rm_defs.h" | 40 | #include "qapi/error.h" |
38 | 41 | #include "qemu/main-loop.h" | |
39 | -#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h" | 42 | +#include "qemu/log.h" |
40 | #include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h" | 43 | +#include "qemu/memalign.h" |
41 | #include "pvrdma_dev_ring.h" | 44 | #include "linux-headers/linux/vhost.h" |
42 | #include "qom/object.h" | 45 | |
43 | @@ -XXX,XX +XXX,XX @@ typedef struct DSRInfo { | 46 | /** |
44 | union pvrdma_cmd_req *req; | 47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) |
45 | union pvrdma_cmd_resp *rsp; | 48 | } |
46 | 49 | ||
47 | - struct pvrdma_ring *async_ring_state; | 50 | /** |
48 | + PvrdmaRingState *async_ring_state; | 51 | - * Forward guest notifications. |
49 | PvrdmaRing async; | 52 | + * Number of descriptors that the SVQ can make available from the guest. |
50 | 53 | + * | |
51 | - struct pvrdma_ring *cq_ring_state; | 54 | + * @svq: The svq |
52 | + PvrdmaRingState *cq_ring_state; | 55 | + */ |
53 | PvrdmaRing cq; | 56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
54 | } DSRInfo; | 57 | +{ |
55 | 58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | |
56 | diff --git a/hw/rdma/vmw/pvrdma_cmd.c b/hw/rdma/vmw/pvrdma_cmd.c | 59 | +} |
60 | + | ||
61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
62 | + const struct iovec *iovec, | ||
63 | + size_t num, bool more_descs, bool write) | ||
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, | ||
92 | + unsigned *head) | ||
93 | +{ | ||
94 | + unsigned avail_idx; | ||
95 | + vring_avail_t *avail = svq->vring.avail; | ||
96 | + | ||
97 | + *head = svq->free_head; | ||
98 | + | ||
99 | + /* We need some descriptors here */ | ||
100 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
101 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
102 | + "Guest provided element with no descriptors"); | ||
103 | + return false; | ||
104 | + } | ||
105 | + | ||
106 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, | ||
107 | + elem->in_num > 0, false); | ||
108 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
109 | + | ||
110 | + /* | ||
111 | + * Put the entry in the available array (but don't update avail->idx until | ||
112 | + * they do sync). | ||
113 | + */ | ||
114 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
115 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
116 | + svq->shadow_avail_idx++; | ||
117 | + | ||
118 | + /* Update the avail index after write the descriptor */ | ||
119 | + smp_wmb(); | ||
120 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
121 | + | ||
122 | + return true; | ||
123 | +} | ||
124 | + | ||
125 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
126 | +{ | ||
127 | + unsigned qemu_head; | ||
128 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
129 | + if (unlikely(!ok)) { | ||
130 | + return false; | ||
131 | + } | ||
132 | + | ||
133 | + svq->ring_id_maps[qemu_head] = elem; | ||
134 | + return true; | ||
135 | +} | ||
136 | + | ||
137 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
138 | +{ | ||
139 | + /* | ||
140 | + * We need to expose the available array entries before checking the used | ||
141 | + * flags | ||
142 | + */ | ||
143 | + smp_mb(); | ||
144 | + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
145 | + return; | ||
146 | + } | ||
147 | + | ||
148 | + event_notifier_set(&svq->hdev_kick); | ||
149 | +} | ||
150 | + | ||
151 | +/** | ||
152 | + * Forward available buffers. | ||
153 | + * | ||
154 | + * @svq: Shadow VirtQueue | ||
155 | + * | ||
156 | + * Note that this function does not guarantee that all guest's available | ||
157 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
158 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
159 | + * qemu vaddr. | ||
160 | + * | ||
161 | + * If that happens, guest's kick notifications will be disabled until the | ||
162 | + * device uses some buffers. | ||
163 | + */ | ||
164 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
165 | +{ | ||
166 | + /* Clear event notifier */ | ||
167 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
168 | + | ||
169 | + /* Forward to the device as many available buffers as possible */ | ||
170 | + do { | ||
171 | + virtio_queue_set_notification(svq->vq, false); | ||
172 | + | ||
173 | + while (true) { | ||
174 | + VirtQueueElement *elem; | ||
175 | + bool ok; | ||
176 | + | ||
177 | + if (svq->next_guest_avail_elem) { | ||
178 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
179 | + } else { | ||
180 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
181 | + } | ||
182 | + | ||
183 | + if (!elem) { | ||
184 | + break; | ||
185 | + } | ||
186 | + | ||
187 | + if (elem->out_num + elem->in_num > | ||
188 | + vhost_svq_available_slots(svq)) { | ||
189 | + /* | ||
190 | + * This condition is possible since a contiguous buffer in GPA | ||
191 | + * does not imply a contiguous buffer in qemu's VA | ||
192 | + * scatter-gather segments. If that happens, the buffer exposed | ||
193 | + * to the device needs to be a chain of descriptors at this | ||
194 | + * moment. | ||
195 | + * | ||
196 | + * SVQ cannot hold more available buffers if we are here: | ||
197 | + * queue the current guest descriptor and ignore further kicks | ||
198 | + * until some elements are used. | ||
199 | + */ | ||
200 | + svq->next_guest_avail_elem = elem; | ||
201 | + return; | ||
202 | + } | ||
203 | + | ||
204 | + ok = vhost_svq_add(svq, elem); | ||
205 | + if (unlikely(!ok)) { | ||
206 | + /* VQ is broken, just return and ignore any other kicks */ | ||
207 | + return; | ||
208 | + } | ||
209 | + vhost_svq_kick(svq); | ||
210 | + } | ||
211 | + | ||
212 | + virtio_queue_set_notification(svq->vq, true); | ||
213 | + } while (!virtio_queue_empty(svq->vq)); | ||
214 | +} | ||
215 | + | ||
216 | +/** | ||
217 | + * Handle guest's kick. | ||
218 | * | ||
219 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
220 | */ | ||
221 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
222 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
223 | { | ||
224 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
225 | svq_kick); | ||
226 | event_notifier_test_and_clear(n); | ||
227 | - event_notifier_set(&svq->hdev_kick); | ||
228 | + vhost_handle_guest_kick(svq); | ||
229 | +} | ||
230 | + | ||
231 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | ||
232 | +{ | ||
233 | + if (svq->last_used_idx != svq->shadow_used_idx) { | ||
234 | + return true; | ||
235 | + } | ||
236 | + | ||
237 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
238 | + | ||
239 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
240 | } | ||
241 | |||
242 | /** | ||
243 | - * Forward vhost notifications | ||
244 | + * Enable vhost device calls after disable them. | ||
245 | + * | ||
246 | + * @svq: The svq | ||
247 | + * | ||
248 | + * It returns false if there are pending used buffers from the vhost device, | ||
249 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
250 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
251 | + */ | ||
252 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
253 | +{ | ||
254 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
255 | + /* Make sure the flag is written before the read of used_idx */ | ||
256 | + smp_mb(); | ||
257 | + return !vhost_svq_more_used(svq); | ||
258 | +} | ||
259 | + | ||
260 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
261 | +{ | ||
262 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
263 | +} | ||
264 | + | ||
265 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
266 | + uint32_t *len) | ||
267 | +{ | ||
268 | + vring_desc_t *descs = svq->vring.desc; | ||
269 | + const vring_used_t *used = svq->vring.used; | ||
270 | + vring_used_elem_t used_elem; | ||
271 | + uint16_t last_used; | ||
272 | + | ||
273 | + if (!vhost_svq_more_used(svq)) { | ||
274 | + return NULL; | ||
275 | + } | ||
276 | + | ||
277 | + /* Only get used array entries after they have been exposed by dev */ | ||
278 | + smp_rmb(); | ||
279 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
280 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
281 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
282 | + | ||
283 | + svq->last_used_idx++; | ||
284 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
285 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
286 | + svq->vdev->name, used_elem.id); | ||
287 | + return NULL; | ||
288 | + } | ||
289 | + | ||
290 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
291 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
292 | + "Device %s says index %u is used, but it was not available", | ||
293 | + svq->vdev->name, used_elem.id); | ||
294 | + return NULL; | ||
295 | + } | ||
296 | + | ||
297 | + descs[used_elem.id].next = svq->free_head; | ||
298 | + svq->free_head = used_elem.id; | ||
299 | + | ||
300 | + *len = used_elem.len; | ||
301 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
302 | +} | ||
303 | + | ||
304 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
305 | + bool check_for_avail_queue) | ||
306 | +{ | ||
307 | + VirtQueue *vq = svq->vq; | ||
308 | + | ||
309 | + /* Forward as many used buffers as possible. */ | ||
310 | + do { | ||
311 | + unsigned i = 0; | ||
312 | + | ||
313 | + vhost_svq_disable_notification(svq); | ||
314 | + while (true) { | ||
315 | + uint32_t len; | ||
316 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
317 | + if (!elem) { | ||
318 | + break; | ||
319 | + } | ||
320 | + | ||
321 | + if (unlikely(i >= svq->vring.num)) { | ||
322 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
323 | + "More than %u used buffers obtained in a %u size SVQ", | ||
324 | + i, svq->vring.num); | ||
325 | + virtqueue_fill(vq, elem, len, i); | ||
326 | + virtqueue_flush(vq, i); | ||
327 | + return; | ||
328 | + } | ||
329 | + virtqueue_fill(vq, elem, len, i++); | ||
330 | + } | ||
331 | + | ||
332 | + virtqueue_flush(vq, i); | ||
333 | + event_notifier_set(&svq->svq_call); | ||
334 | + | ||
335 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
336 | + /* | ||
337 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
338 | + * good moment to make more descriptors available if possible. | ||
339 | + */ | ||
340 | + vhost_handle_guest_kick(svq); | ||
341 | + } | ||
342 | + } while (!vhost_svq_enable_notification(svq)); | ||
343 | +} | ||
344 | + | ||
345 | +/** | ||
346 | + * Forward used buffers. | ||
347 | * | ||
348 | * @n: hdev call event notifier, the one that device set to notify svq. | ||
349 | + * | ||
350 | + * Note that we are not making any buffers available in the loop, there is no | ||
351 | + * way that it runs more than virtqueue size times. | ||
352 | */ | ||
353 | static void vhost_svq_handle_call(EventNotifier *n) | ||
354 | { | ||
355 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
356 | hdev_call); | ||
357 | event_notifier_test_and_clear(n); | ||
358 | - event_notifier_set(&svq->svq_call); | ||
359 | + vhost_svq_flush(svq, true); | ||
360 | } | ||
361 | |||
362 | /** | ||
363 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
364 | if (poll_start) { | ||
365 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
366 | event_notifier_set(svq_kick); | ||
367 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
368 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
369 | + } | ||
370 | +} | ||
371 | + | ||
372 | +/** | ||
373 | + * Start the shadow virtqueue operation. | ||
374 | + * | ||
375 | + * @svq: Shadow Virtqueue | ||
376 | + * @vdev: VirtIO device | ||
377 | + * @vq: Virtqueue to shadow | ||
378 | + */ | ||
379 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
380 | + VirtQueue *vq) | ||
381 | +{ | ||
382 | + size_t desc_size, driver_size, device_size; | ||
383 | + | ||
384 | + svq->next_guest_avail_elem = NULL; | ||
385 | + svq->shadow_avail_idx = 0; | ||
386 | + svq->shadow_used_idx = 0; | ||
387 | + svq->last_used_idx = 0; | ||
388 | + svq->vdev = vdev; | ||
389 | + svq->vq = vq; | ||
390 | + | ||
391 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
392 | + driver_size = vhost_svq_driver_area_size(svq); | ||
393 | + device_size = vhost_svq_device_area_size(svq); | ||
394 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
395 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
396 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
397 | + memset(svq->vring.desc, 0, driver_size); | ||
398 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
399 | + memset(svq->vring.used, 0, device_size); | ||
400 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
401 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
402 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
403 | } | ||
404 | } | ||
405 | |||
406 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
407 | void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
408 | { | ||
409 | event_notifier_set_handler(&svq->svq_kick, NULL); | ||
410 | + g_autofree VirtQueueElement *next_avail_elem = NULL; | ||
411 | + | ||
412 | + if (!svq->vq) { | ||
413 | + return; | ||
414 | + } | ||
415 | + | ||
416 | + /* Send all pending used descriptors to guest */ | ||
417 | + vhost_svq_flush(svq, false); | ||
418 | + | ||
419 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
420 | + g_autofree VirtQueueElement *elem = NULL; | ||
421 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
422 | + if (elem) { | ||
423 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
424 | + } | ||
425 | + } | ||
426 | + | ||
427 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
428 | + if (next_avail_elem) { | ||
429 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
430 | + } | ||
431 | + svq->vq = NULL; | ||
432 | + g_free(svq->ring_id_maps); | ||
433 | + qemu_vfree(svq->vring.desc); | ||
434 | + qemu_vfree(svq->vring.used); | ||
435 | } | ||
436 | |||
437 | /** | ||
438 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
57 | index XXXXXXX..XXXXXXX 100644 | 439 | index XXXXXXX..XXXXXXX 100644 |
58 | --- a/hw/rdma/vmw/pvrdma_cmd.c | 440 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
59 | +++ b/hw/rdma/vmw/pvrdma_cmd.c | 441 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
60 | @@ -XXX,XX +XXX,XX @@ static int create_cq_ring(PCIDevice *pci_dev , PvrdmaRing **ring, | 442 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { |
61 | r = g_malloc(sizeof(*r)); | 443 | |
62 | *ring = r; | 444 | /* Guest's call notifier, where the SVQ calls guest. */ |
63 | 445 | EventNotifier svq_call; | |
64 | - r->ring_state = (struct pvrdma_ring *) | 446 | + |
65 | + r->ring_state = (PvrdmaRingState *) | 447 | + /* Virtio queue shadowing */ |
66 | rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); | 448 | + VirtQueue *vq; |
67 | 449 | + | |
68 | if (!r->ring_state) { | 450 | + /* Virtio device */ |
69 | @@ -XXX,XX +XXX,XX @@ static int create_qp_rings(PCIDevice *pci_dev, uint64_t pdir_dma, | 451 | + VirtIODevice *vdev; |
70 | *rings = sr; | 452 | + |
71 | 453 | + /* Map for use the guest's descriptors */ | |
72 | /* Create send ring */ | 454 | + VirtQueueElement **ring_id_maps; |
73 | - sr->ring_state = (struct pvrdma_ring *) | 455 | + |
74 | + sr->ring_state = (PvrdmaRingState *) | 456 | + /* Next VirtQueue element that guest made available */ |
75 | rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); | 457 | + VirtQueueElement *next_guest_avail_elem; |
76 | if (!sr->ring_state) { | 458 | + |
77 | rdma_error_report("Failed to map to QP ring state"); | 459 | + /* Next head to expose to the device */ |
78 | @@ -XXX,XX +XXX,XX @@ static int create_srq_ring(PCIDevice *pci_dev, PvrdmaRing **ring, | 460 | + uint16_t shadow_avail_idx; |
79 | r = g_malloc(sizeof(*r)); | 461 | + |
80 | *ring = r; | 462 | + /* Next free descriptor */ |
81 | 463 | + uint16_t free_head; | |
82 | - r->ring_state = (struct pvrdma_ring *) | 464 | + |
83 | + r->ring_state = (PvrdmaRingState *) | 465 | + /* Last seen used idx */ |
84 | rdma_pci_dma_map(pci_dev, tbl[0], TARGET_PAGE_SIZE); | 466 | + uint16_t shadow_used_idx; |
85 | if (!r->ring_state) { | 467 | + |
86 | rdma_error_report("Failed to map tp SRQ ring state"); | 468 | + /* Next head to consume from the device */ |
87 | diff --git a/hw/rdma/vmw/pvrdma_dev_ring.c b/hw/rdma/vmw/pvrdma_dev_ring.c | 469 | + uint16_t last_used_idx; |
470 | } VhostShadowVirtqueue; | ||
471 | |||
472 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
473 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
474 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
475 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
476 | |||
477 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
478 | + VirtQueue *vq); | ||
479 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
480 | |||
481 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
482 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
88 | index XXXXXXX..XXXXXXX 100644 | 483 | index XXXXXXX..XXXXXXX 100644 |
89 | --- a/hw/rdma/vmw/pvrdma_dev_ring.c | 484 | --- a/hw/virtio/vhost-vdpa.c |
90 | +++ b/hw/rdma/vmw/pvrdma_dev_ring.c | 485 | +++ b/hw/virtio/vhost-vdpa.c |
91 | @@ -XXX,XX +XXX,XX @@ | 486 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, |
92 | #include "trace.h" | 487 | * Note that this function does not rewind kick file descriptor if cannot set |
93 | 488 | * call one. | |
94 | #include "../rdma_utils.h" | 489 | */ |
95 | -#include "standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h" | 490 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
96 | #include "pvrdma_dev_ring.h" | 491 | - VhostShadowVirtqueue *svq, |
97 | 492 | - unsigned idx, | |
98 | int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev, | 493 | - Error **errp) |
99 | - struct pvrdma_ring *ring_state, uint32_t max_elems, | 494 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, |
100 | + PvrdmaRingState *ring_state, uint32_t max_elems, | 495 | + VhostShadowVirtqueue *svq, |
101 | size_t elem_sz, dma_addr_t *tbl, uint32_t npages) | 496 | + unsigned idx, |
497 | + Error **errp) | ||
102 | { | 498 | { |
103 | int i; | 499 | struct vhost_vring_file file = { |
104 | @@ -XXX,XX +XXX,XX @@ out: | 500 | .index = dev->vq_index + idx, |
105 | 501 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | |
106 | void *pvrdma_ring_next_elem_read(PvrdmaRing *ring) | 502 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); |
107 | { | 503 | if (unlikely(r != 0)) { |
108 | - int e; | 504 | error_setg_errno(errp, -r, "Can't set device kick fd"); |
109 | - unsigned int idx = 0, offset; | 505 | - return false; |
110 | + unsigned int idx, offset; | 506 | + return r; |
111 | + const uint32_t tail = qatomic_read(&ring->ring_state->prod_tail); | ||
112 | + const uint32_t head = qatomic_read(&ring->ring_state->cons_head); | ||
113 | |||
114 | - e = pvrdma_idx_ring_has_data(ring->ring_state, ring->max_elems, &idx); | ||
115 | - if (e <= 0) { | ||
116 | + if (tail & ~((ring->max_elems << 1) - 1) || | ||
117 | + head & ~((ring->max_elems << 1) - 1) || | ||
118 | + tail == head) { | ||
119 | trace_pvrdma_ring_next_elem_read_no_data(ring->name); | ||
120 | return NULL; | ||
121 | } | 507 | } |
122 | 508 | ||
123 | + idx = head & (ring->max_elems - 1); | 509 | event_notifier = &svq->hdev_call; |
124 | offset = idx * ring->elem_sz; | 510 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
125 | return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE); | 511 | error_setg_errno(errp, -r, "Can't set device call fd"); |
512 | } | ||
513 | |||
514 | + return r; | ||
515 | +} | ||
516 | + | ||
517 | +/** | ||
518 | + * Unmap a SVQ area in the device | ||
519 | + */ | ||
520 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
521 | + hwaddr size) | ||
522 | +{ | ||
523 | + int r; | ||
524 | + | ||
525 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
526 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
527 | + return r == 0; | ||
528 | +} | ||
529 | + | ||
530 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
531 | + const VhostShadowVirtqueue *svq) | ||
532 | +{ | ||
533 | + struct vhost_vdpa *v = dev->opaque; | ||
534 | + struct vhost_vring_addr svq_addr; | ||
535 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
536 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
537 | + bool ok; | ||
538 | + | ||
539 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
540 | + | ||
541 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
542 | + if (unlikely(!ok)) { | ||
543 | + return false; | ||
544 | + } | ||
545 | + | ||
546 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
547 | +} | ||
548 | + | ||
549 | +/** | ||
550 | + * Map the shadow virtqueue rings in the device | ||
551 | + * | ||
552 | + * @dev: The vhost device | ||
553 | + * @svq: The shadow virtqueue | ||
554 | + * @addr: Assigned IOVA addresses | ||
555 | + * @errp: Error pointer | ||
556 | + */ | ||
557 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
558 | + const VhostShadowVirtqueue *svq, | ||
559 | + struct vhost_vring_addr *addr, | ||
560 | + Error **errp) | ||
561 | +{ | ||
562 | + struct vhost_vdpa *v = dev->opaque; | ||
563 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
564 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
565 | + int r; | ||
566 | + | ||
567 | + ERRP_GUARD(); | ||
568 | + vhost_svq_get_vring_addr(svq, addr); | ||
569 | + | ||
570 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
571 | + (void *)addr->desc_user_addr, true); | ||
572 | + if (unlikely(r != 0)) { | ||
573 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
574 | + return false; | ||
575 | + } | ||
576 | + | ||
577 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
578 | + (void *)addr->used_user_addr, false); | ||
579 | + if (unlikely(r != 0)) { | ||
580 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
581 | + } | ||
582 | + | ||
583 | + return r == 0; | ||
584 | +} | ||
585 | + | ||
586 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
587 | + VhostShadowVirtqueue *svq, | ||
588 | + unsigned idx, | ||
589 | + Error **errp) | ||
590 | +{ | ||
591 | + uint16_t vq_index = dev->vq_index + idx; | ||
592 | + struct vhost_vring_state s = { | ||
593 | + .index = vq_index, | ||
594 | + }; | ||
595 | + int r; | ||
596 | + | ||
597 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
598 | + if (unlikely(r)) { | ||
599 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
600 | + return false; | ||
601 | + } | ||
602 | + | ||
603 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
604 | return r == 0; | ||
126 | } | 605 | } |
127 | 606 | ||
128 | void pvrdma_ring_read_inc(PvrdmaRing *ring) | 607 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) |
129 | { | ||
130 | - pvrdma_idx_ring_inc(&ring->ring_state->cons_head, ring->max_elems); | ||
131 | + uint32_t idx = qatomic_read(&ring->ring_state->cons_head); | ||
132 | + | ||
133 | + idx = (idx + 1) & ((ring->max_elems << 1) - 1); | ||
134 | + qatomic_set(&ring->ring_state->cons_head, idx); | ||
135 | } | ||
136 | |||
137 | void *pvrdma_ring_next_elem_write(PvrdmaRing *ring) | ||
138 | { | ||
139 | - int idx; | ||
140 | - unsigned int offset, tail; | ||
141 | + unsigned int idx, offset; | ||
142 | + const uint32_t tail = qatomic_read(&ring->ring_state->prod_tail); | ||
143 | + const uint32_t head = qatomic_read(&ring->ring_state->cons_head); | ||
144 | |||
145 | - idx = pvrdma_idx_ring_has_space(ring->ring_state, ring->max_elems, &tail); | ||
146 | - if (idx <= 0) { | ||
147 | + if (tail & ~((ring->max_elems << 1) - 1) || | ||
148 | + head & ~((ring->max_elems << 1) - 1) || | ||
149 | + tail == (head ^ ring->max_elems)) { | ||
150 | rdma_error_report("CQ is full"); | ||
151 | return NULL; | ||
152 | } | 608 | } |
153 | 609 | ||
154 | - idx = pvrdma_idx(&ring->ring_state->prod_tail, ring->max_elems); | 610 | for (i = 0; i < v->shadow_vqs->len; ++i) { |
155 | - if (idx < 0 || tail != idx) { | 611 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); |
156 | - rdma_error_report("Invalid idx %d", idx); | 612 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
157 | - return NULL; | 613 | + struct vhost_vring_addr addr = { |
158 | - } | 614 | + .index = i, |
159 | - | 615 | + }; |
160 | + idx = tail & (ring->max_elems - 1); | 616 | + int r; |
161 | offset = idx * ring->elem_sz; | 617 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); |
162 | return ring->pages[offset / TARGET_PAGE_SIZE] + (offset % TARGET_PAGE_SIZE); | 618 | if (unlikely(!ok)) { |
163 | } | 619 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
164 | 620 | + goto err; | |
165 | void pvrdma_ring_write_inc(PvrdmaRing *ring) | 621 | + } |
166 | { | 622 | + |
167 | - pvrdma_idx_ring_inc(&ring->ring_state->prod_tail, ring->max_elems); | 623 | + vhost_svq_start(svq, dev->vdev, vq); |
168 | + uint32_t idx = qatomic_read(&ring->ring_state->prod_tail); | 624 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); |
169 | + | 625 | + if (unlikely(!ok)) { |
170 | + idx = (idx + 1) & ((ring->max_elems << 1) - 1); | 626 | + goto err_map; |
171 | + qatomic_set(&ring->ring_state->prod_tail, idx); | 627 | + } |
172 | } | 628 | + |
173 | 629 | + /* Override vring GPA set by vhost subsystem */ | |
174 | void pvrdma_ring_free(PvrdmaRing *ring) | 630 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); |
175 | diff --git a/hw/rdma/vmw/pvrdma_dev_ring.h b/hw/rdma/vmw/pvrdma_dev_ring.h | 631 | + if (unlikely(r != 0)) { |
176 | index XXXXXXX..XXXXXXX 100644 | 632 | + error_setg_errno(&err, -r, "Cannot set device address"); |
177 | --- a/hw/rdma/vmw/pvrdma_dev_ring.h | 633 | + goto err_set_addr; |
178 | +++ b/hw/rdma/vmw/pvrdma_dev_ring.h | 634 | + } |
179 | @@ -XXX,XX +XXX,XX @@ | 635 | + } |
180 | 636 | + | |
181 | #define MAX_RING_NAME_SZ 32 | 637 | + return true; |
182 | 638 | + | |
183 | +typedef struct PvrdmaRingState { | 639 | +err_set_addr: |
184 | + int prod_tail; /* producer tail */ | 640 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); |
185 | + int cons_head; /* consumer head */ | 641 | + |
186 | +} PvrdmaRingState; | 642 | +err_map: |
187 | + | 643 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); |
188 | typedef struct PvrdmaRing { | 644 | + |
189 | char name[MAX_RING_NAME_SZ]; | 645 | +err: |
190 | PCIDevice *dev; | 646 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
191 | uint32_t max_elems; | 647 | + for (unsigned j = 0; j < i; ++j) { |
192 | size_t elem_sz; | 648 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); |
193 | - struct pvrdma_ring *ring_state; /* used only for unmap */ | 649 | + vhost_vdpa_svq_unmap_rings(dev, svq); |
194 | + PvrdmaRingState *ring_state; /* used only for unmap */ | 650 | + vhost_svq_stop(svq); |
195 | int npages; | 651 | + } |
196 | void **pages; | 652 | + |
197 | } PvrdmaRing; | 653 | + return false; |
198 | 654 | +} | |
199 | int pvrdma_ring_init(PvrdmaRing *ring, const char *name, PCIDevice *dev, | 655 | + |
200 | - struct pvrdma_ring *ring_state, uint32_t max_elems, | 656 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) |
201 | + PvrdmaRingState *ring_state, uint32_t max_elems, | 657 | +{ |
202 | size_t elem_sz, dma_addr_t *tbl, uint32_t npages); | 658 | + struct vhost_vdpa *v = dev->opaque; |
203 | void *pvrdma_ring_next_elem_read(PvrdmaRing *ring); | 659 | + |
204 | void pvrdma_ring_read_inc(PvrdmaRing *ring); | 660 | + if (!v->shadow_vqs) { |
205 | diff --git a/hw/rdma/vmw/pvrdma_main.c b/hw/rdma/vmw/pvrdma_main.c | 661 | + return true; |
206 | index XXXXXXX..XXXXXXX 100644 | 662 | + } |
207 | --- a/hw/rdma/vmw/pvrdma_main.c | 663 | + |
208 | +++ b/hw/rdma/vmw/pvrdma_main.c | 664 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { |
209 | @@ -XXX,XX +XXX,XX @@ static void free_dev_ring(PCIDevice *pci_dev, PvrdmaRing *ring, | 665 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, |
210 | rdma_pci_dma_unmap(pci_dev, ring_state, TARGET_PAGE_SIZE); | 666 | + i); |
211 | } | 667 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); |
212 | 668 | + if (unlikely(!ok)) { | |
213 | -static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state, | 669 | return false; |
214 | +static int init_dev_ring(PvrdmaRing *ring, PvrdmaRingState **ring_state, | 670 | } |
215 | const char *name, PCIDevice *pci_dev, | 671 | } |
216 | dma_addr_t dir_addr, uint32_t num_pages) | 672 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
217 | { | 673 | } |
218 | @@ -XXX,XX +XXX,XX @@ static int init_dev_ring(PvrdmaRing *ring, struct pvrdma_ring **ring_state, | 674 | vhost_vdpa_set_vring_ready(dev); |
219 | /* RX ring is the second */ | 675 | } else { |
220 | (*ring_state)++; | 676 | + ok = vhost_vdpa_svqs_stop(dev); |
221 | rc = pvrdma_ring_init(ring, name, pci_dev, | 677 | + if (unlikely(!ok)) { |
222 | - (struct pvrdma_ring *)*ring_state, | 678 | + return -1; |
223 | + (PvrdmaRingState *)*ring_state, | 679 | + } |
224 | (num_pages - 1) * TARGET_PAGE_SIZE / | 680 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); |
225 | sizeof(struct pvrdma_cqne), | 681 | } |
226 | sizeof(struct pvrdma_cqne), | 682 | |
227 | diff --git a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h b/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | ||
228 | deleted file mode 100644 | ||
229 | index XXXXXXX..XXXXXXX | ||
230 | --- a/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h | ||
231 | +++ /dev/null | ||
232 | @@ -XXX,XX +XXX,XX @@ | ||
233 | -/* | ||
234 | - * Copyright (c) 2012-2016 VMware, Inc. All rights reserved. | ||
235 | - * | ||
236 | - * This program is free software; you can redistribute it and/or | ||
237 | - * modify it under the terms of EITHER the GNU General Public License | ||
238 | - * version 2 as published by the Free Software Foundation or the BSD | ||
239 | - * 2-Clause License. This program is distributed in the hope that it | ||
240 | - * will be useful, but WITHOUT ANY WARRANTY; WITHOUT EVEN THE IMPLIED | ||
241 | - * WARRANTY OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. | ||
242 | - * See the GNU General Public License version 2 for more details at | ||
243 | - * http://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html. | ||
244 | - * | ||
245 | - * You should have received a copy of the GNU General Public License | ||
246 | - * along with this program available in the file COPYING in the main | ||
247 | - * directory of this source tree. | ||
248 | - * | ||
249 | - * The BSD 2-Clause License | ||
250 | - * | ||
251 | - * Redistribution and use in source and binary forms, with or | ||
252 | - * without modification, are permitted provided that the following | ||
253 | - * conditions are met: | ||
254 | - * | ||
255 | - * - Redistributions of source code must retain the above | ||
256 | - * copyright notice, this list of conditions and the following | ||
257 | - * disclaimer. | ||
258 | - * | ||
259 | - * - Redistributions in binary form must reproduce the above | ||
260 | - * copyright notice, this list of conditions and the following | ||
261 | - * disclaimer in the documentation and/or other materials | ||
262 | - * provided with the distribution. | ||
263 | - * | ||
264 | - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | ||
265 | - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | ||
266 | - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS | ||
267 | - * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE | ||
268 | - * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, | ||
269 | - * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | ||
270 | - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | ||
271 | - * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | ||
272 | - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, | ||
273 | - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | ||
274 | - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED | ||
275 | - * OF THE POSSIBILITY OF SUCH DAMAGE. | ||
276 | - */ | ||
277 | - | ||
278 | -#ifndef __PVRDMA_RING_H__ | ||
279 | -#define __PVRDMA_RING_H__ | ||
280 | - | ||
281 | -#include "standard-headers/linux/types.h" | ||
282 | - | ||
283 | -#define PVRDMA_INVALID_IDX -1 /* Invalid index. */ | ||
284 | - | ||
285 | -struct pvrdma_ring { | ||
286 | - int prod_tail; /* Producer tail. */ | ||
287 | - int cons_head; /* Consumer head. */ | ||
288 | -}; | ||
289 | - | ||
290 | -struct pvrdma_ring_state { | ||
291 | - struct pvrdma_ring tx; /* Tx ring. */ | ||
292 | - struct pvrdma_ring rx; /* Rx ring. */ | ||
293 | -}; | ||
294 | - | ||
295 | -static inline int pvrdma_idx_valid(uint32_t idx, uint32_t max_elems) | ||
296 | -{ | ||
297 | - /* Generates fewer instructions than a less-than. */ | ||
298 | - return (idx & ~((max_elems << 1) - 1)) == 0; | ||
299 | -} | ||
300 | - | ||
301 | -static inline int32_t pvrdma_idx(int *var, uint32_t max_elems) | ||
302 | -{ | ||
303 | - const unsigned int idx = qatomic_read(var); | ||
304 | - | ||
305 | - if (pvrdma_idx_valid(idx, max_elems)) | ||
306 | - return idx & (max_elems - 1); | ||
307 | - return PVRDMA_INVALID_IDX; | ||
308 | -} | ||
309 | - | ||
310 | -static inline void pvrdma_idx_ring_inc(int *var, uint32_t max_elems) | ||
311 | -{ | ||
312 | - uint32_t idx = qatomic_read(var) + 1; /* Increment. */ | ||
313 | - | ||
314 | - idx &= (max_elems << 1) - 1; /* Modulo size, flip gen. */ | ||
315 | - qatomic_set(var, idx); | ||
316 | -} | ||
317 | - | ||
318 | -static inline int32_t pvrdma_idx_ring_has_space(const struct pvrdma_ring *r, | ||
319 | - uint32_t max_elems, uint32_t *out_tail) | ||
320 | -{ | ||
321 | - const uint32_t tail = qatomic_read(&r->prod_tail); | ||
322 | - const uint32_t head = qatomic_read(&r->cons_head); | ||
323 | - | ||
324 | - if (pvrdma_idx_valid(tail, max_elems) && | ||
325 | - pvrdma_idx_valid(head, max_elems)) { | ||
326 | - *out_tail = tail & (max_elems - 1); | ||
327 | - return tail != (head ^ max_elems); | ||
328 | - } | ||
329 | - return PVRDMA_INVALID_IDX; | ||
330 | -} | ||
331 | - | ||
332 | -static inline int32_t pvrdma_idx_ring_has_data(const struct pvrdma_ring *r, | ||
333 | - uint32_t max_elems, uint32_t *out_head) | ||
334 | -{ | ||
335 | - const uint32_t tail = qatomic_read(&r->prod_tail); | ||
336 | - const uint32_t head = qatomic_read(&r->cons_head); | ||
337 | - | ||
338 | - if (pvrdma_idx_valid(tail, max_elems) && | ||
339 | - pvrdma_idx_valid(head, max_elems)) { | ||
340 | - *out_head = head & (max_elems - 1); | ||
341 | - return tail != head; | ||
342 | - } | ||
343 | - return PVRDMA_INVALID_IDX; | ||
344 | -} | ||
345 | - | ||
346 | -#endif /* __PVRDMA_RING_H__ */ | ||
347 | diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh | ||
348 | index XXXXXXX..XXXXXXX 100755 | ||
349 | --- a/scripts/update-linux-headers.sh | ||
350 | +++ b/scripts/update-linux-headers.sh | ||
351 | @@ -XXX,XX +XXX,XX @@ sed -e '1h;2,$H;$!d;g' -e 's/[^};]*pvrdma[^(| ]*([^)]*);//g' \ | ||
352 | "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h" > \ | ||
353 | "$tmp_pvrdma_verbs"; | ||
354 | |||
355 | -for i in "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_ring.h" \ | ||
356 | - "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h" \ | ||
357 | +for i in "$linux/drivers/infiniband/hw/vmw_pvrdma/pvrdma_dev_api.h" \ | ||
358 | "$tmp_pvrdma_verbs"; do \ | ||
359 | cp_portable "$i" \ | ||
360 | "$output/include/standard-headers/drivers/infiniband/hw/vmw_pvrdma/" | ||
361 | -- | 683 | -- |
362 | 2.7.4 | 684 | 2.7.4 |
363 | 685 | ||
364 | 686 | diff view generated by jsdifflib |
1 | From: Alexander Bulekov <alxndr@bu.edu> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This patch switches to use qemu_receive_packet() which can detect | 3 | This iova tree function allows it to look for a hole in allocated |
4 | reentrancy and return early. | 4 | regions and return a totally new translation for a given translated |
5 | 5 | address. | |
6 | This is intended to address CVE-2021-3416. | 6 | |
7 | 7 | It's usage is mainly to allow devices to access qemu address space, | |
8 | Cc: Prasad J Pandit <ppandit@redhat.com> | 8 | remapping guest's one into a new iova space where qemu can add chunks of |
9 | Cc: qemu-stable@nongnu.org | 9 | addresses. |
10 | Buglink: https://bugs.launchpad.net/qemu/+bug/1917085 | 10 | |
11 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com | 11 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
12 | Signed-off-by: Alexander Bulekov <alxndr@bu.edu> | 12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
13 | Reviewed-by: Peter Xu <peterx@redhat.com> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 15 | --- |
15 | hw/net/pcnet.c | 2 +- | 16 | include/qemu/iova-tree.h | 18 +++++++ |
16 | 1 file changed, 1 insertion(+), 1 deletion(-) | 17 | util/iova-tree.c | 135 +++++++++++++++++++++++++++++++++++++++++++++++ |
17 | 18 | 2 files changed, 153 insertions(+) | |
18 | diff --git a/hw/net/pcnet.c b/hw/net/pcnet.c | 19 | |
20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h | ||
19 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/hw/net/pcnet.c | 22 | --- a/include/qemu/iova-tree.h |
21 | +++ b/hw/net/pcnet.c | 23 | +++ b/include/qemu/iova-tree.h |
22 | @@ -XXX,XX +XXX,XX @@ txagain: | 24 | @@ -XXX,XX +XXX,XX @@ |
23 | if (BCR_SWSTYLE(s) == 1) | 25 | #define IOVA_OK (0) |
24 | add_crc = !GET_FIELD(tmd.status, TMDS, NOFCS); | 26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ |
25 | s->looptest = add_crc ? PCNET_LOOPTEST_CRC : PCNET_LOOPTEST_NOCRC; | 27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ |
26 | - pcnet_receive(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); | 28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ |
27 | + qemu_receive_packet(qemu_get_queue(s->nic), s->buffer, s->xmit_pos); | 29 | |
28 | s->looptest = 0; | 30 | typedef struct IOVATree IOVATree; |
29 | } else { | 31 | typedef struct DMAMap { |
30 | if (s->nic) { | 32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); |
33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); | ||
34 | |||
35 | /** | ||
36 | + * iova_tree_alloc_map: | ||
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum addressable direction of the allocation | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_min and iova_max. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, but cannot overlap and can return error if | ||
46 | + * iova tree is out of free contiguous range. The caller gets the assigned iova | ||
47 | + * in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
62 | }; | ||
63 | |||
64 | +/* Args to pass to iova_tree_alloc foreach function. */ | ||
65 | +struct IOVATreeAllocArgs { | ||
66 | + /* Size of the desired allocation */ | ||
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether have we found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) { | ||
93 | + args->prev = args->this; | ||
94 | + args->this = next; | ||
95 | +} | ||
96 | + | ||
97 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
98 | { | ||
99 | const DMAMap *m1 = a, *m2 = b; | ||
100 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) | ||
101 | return IOVA_OK; | ||
102 | } | ||
103 | |||
104 | +/** | ||
105 | + * Try to find an unallocated IOVA range between prev and this elements. | ||
106 | + * | ||
107 | + * @args: Arguments to allocation | ||
108 | + * | ||
109 | + * Cases: | ||
110 | + * | ||
111 | + * (1) !prev, !this: No entries allocated, always succeed | ||
112 | + * | ||
113 | + * (2) !prev, this: We're iterating at the 1st element. | ||
114 | + * | ||
115 | + * (3) prev, !this: We're iterating at the last element. | ||
116 | + * | ||
117 | + * (4) prev, this: this is the most common case, we'll try to find a hole | ||
118 | + * between "prev" and "this" mapping. | ||
119 | + * | ||
120 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it | ||
121 | + * searches linearly so it's easy to discard the result if it's not the case. | ||
122 | + */ | ||
123 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) | ||
124 | +{ | ||
125 | + const DMAMap *prev = args->prev, *this = args->this; | ||
126 | + uint64_t hole_start, hole_last; | ||
127 | + | ||
128 | + if (this && this->iova + this->size < args->iova_begin) { | ||
129 | + return; | ||
130 | + } | ||
131 | + | ||
132 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); | ||
133 | + hole_last = this ? this->iova : HWADDR_MAX; | ||
134 | + | ||
135 | + if (hole_last - hole_start > args->new_size) { | ||
136 | + args->iova_result = hole_start; | ||
137 | + args->iova_found = true; | ||
138 | + } | ||
139 | +} | ||
140 | + | ||
141 | +/** | ||
142 | + * Foreach dma node in the tree, compare if there is a hole with its previous | ||
143 | + * node (or minimum iova address allowed) and the node. | ||
144 | + * | ||
145 | + * @key: Node iterating | ||
146 | + * @value: Node iterating | ||
147 | + * @pargs: Struct to communicate with the outside world | ||
148 | + * | ||
149 | + * Return: false to keep iterating, true if needs break. | ||
150 | + */ | ||
151 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, | ||
152 | + gpointer pargs) | ||
153 | +{ | ||
154 | + struct IOVATreeAllocArgs *args = pargs; | ||
155 | + DMAMap *node = value; | ||
156 | + | ||
157 | + assert(key == value); | ||
158 | + | ||
159 | + iova_tree_alloc_args_iterate(args, node); | ||
160 | + iova_tree_alloc_map_in_hole(args); | ||
161 | + return args->iova_found; | ||
162 | +} | ||
163 | + | ||
164 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
165 | + hwaddr iova_last) | ||
166 | +{ | ||
167 | + struct IOVATreeAllocArgs args = { | ||
168 | + .new_size = map->size, | ||
169 | + .iova_begin = iova_begin, | ||
170 | + }; | ||
171 | + | ||
172 | + if (unlikely(iova_last < iova_begin)) { | ||
173 | + return IOVA_ERR_INVALID; | ||
174 | + } | ||
175 | + | ||
176 | + /* | ||
177 | + * Find a valid hole for the mapping | ||
178 | + * | ||
179 | + * Assuming low iova_begin, so no need to do a binary search to | ||
180 | + * locate the first node. | ||
181 | + * | ||
182 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
183 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
184 | + * code a lot. | ||
185 | + * | ||
186 | + */ | ||
187 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
188 | + if (!args.iova_found) { | ||
189 | + /* | ||
190 | + * Either tree is empty or the last hole is still not checked. | ||
191 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
192 | + * it here. | ||
193 | + */ | ||
194 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
195 | + iova_tree_alloc_map_in_hole(&args); | ||
196 | + } | ||
197 | + | ||
198 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
199 | + return IOVA_ERR_NOMEM; | ||
200 | + } | ||
201 | + | ||
202 | + map->iova = args.iova_result; | ||
203 | + return iova_tree_insert(tree, map); | ||
204 | +} | ||
205 | + | ||
206 | void iova_tree_destroy(IOVATree *tree) | ||
207 | { | ||
208 | g_tree_destroy(tree->tree); | ||
31 | -- | 209 | -- |
32 | 2.7.4 | 210 | 2.7.4 |
33 | 211 | ||
34 | 212 | diff view generated by jsdifflib |
1 | From: Alexey Kirillov <lekiravi@yandex-team.ru> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | As we use QAPI NetClientState->stored_config to store and get information | 3 | This function does the reverse operation of iova_tree_find: To look for |
4 | about backend network devices, we can drop fill of legacy field info_str | 4 | a mapping that match a translated address so we can do the reverse. |
5 | for them. | ||
6 | 5 | ||
7 | We still use info_str field for NIC and hubports, so we can not completely | 6 | This have linear complexity instead of logarithmic, but it supports |
8 | remove it. | 7 | overlapping HVA. Future developments could reduce it. |
9 | 8 | ||
10 | Signed-off-by: Alexey Kirillov <lekiravi@yandex-team.ru> | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
12 | --- | 12 | --- |
13 | net/l2tpv3.c | 2 -- | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
14 | net/slirp.c | 3 --- | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
15 | net/socket.c | 28 ---------------------------- | 15 | 2 files changed, 53 insertions(+), 1 deletion(-) |
16 | net/tap-win32.c | 2 -- | ||
17 | net/tap.c | 9 --------- | ||
18 | net/vde.c | 2 -- | ||
19 | net/vhost-user.c | 1 - | ||
20 | net/vhost-vdpa.c | 1 - | ||
21 | 8 files changed, 48 deletions(-) | ||
22 | 16 | ||
23 | diff --git a/net/l2tpv3.c b/net/l2tpv3.c | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
24 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
25 | --- a/net/l2tpv3.c | 19 | --- a/include/qemu/iova-tree.h |
26 | +++ b/net/l2tpv3.c | 20 | +++ b/include/qemu/iova-tree.h |
27 | @@ -XXX,XX +XXX,XX @@ int net_init_l2tpv3(const Netdev *netdev, | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
28 | 22 | * @tree: the iova tree to search from | |
29 | QAPI_CLONE_MEMBERS(NetdevL2TPv3Options, | 23 | * @map: the mapping to search |
30 | &nc->stored_config->u.l2tpv3, l2tpv3); | 24 | * |
31 | - | 25 | - * Search for a mapping in the iova tree that overlaps with the |
32 | - s->nc.info_str = g_strdup_printf("l2tpv3: connected"); | 26 | + * Search for a mapping in the iova tree that iova overlaps with the |
33 | return 0; | 27 | * mapping range specified. Only the first found mapping will be |
34 | outerr: | 28 | * returned. |
35 | qemu_del_net_client(nc); | 29 | * |
36 | diff --git a/net/slirp.c b/net/slirp.c | 30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); | ||
32 | |||
33 | /** | ||
34 | + * iova_tree_find_iova: | ||
35 | + * | ||
36 | + * @tree: the iova tree to search from | ||
37 | + * @map: the mapping to search | ||
38 | + * | ||
39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the | ||
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
50 | + | ||
51 | +/** | ||
52 | * iova_tree_find_address: | ||
53 | * | ||
54 | * @tree: the iova tree to search from | ||
55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
37 | index XXXXXXX..XXXXXXX 100644 | 56 | index XXXXXXX..XXXXXXX 100644 |
38 | --- a/net/slirp.c | 57 | --- a/util/iova-tree.c |
39 | +++ b/net/slirp.c | 58 | +++ b/util/iova-tree.c |
40 | @@ -XXX,XX +XXX,XX @@ static int net_slirp_init(NetClientState *peer, const char *model, | 59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { |
41 | stored->tftp_server_name = g_strdup(tftp_server_name); | 60 | bool iova_found; |
42 | } | 61 | }; |
43 | 62 | ||
44 | - nc->info_str = g_strdup_printf("net=%s,restrict=%s", inet_ntoa(net), | 63 | +typedef struct IOVATreeFindIOVAArgs { |
45 | - restricted ? "on" : "off"); | 64 | + const DMAMap *needle; |
46 | - | 65 | + const DMAMap *result; |
47 | s = DO_UPCAST(SlirpState, nc, nc); | 66 | +} IOVATreeFindIOVAArgs; |
48 | 67 | + | |
49 | s->slirp = slirp_init(restricted, ipv4, net, mask, host, | 68 | /** |
50 | diff --git a/net/socket.c b/net/socket.c | 69 | * Iterate args to the next hole |
51 | index XXXXXXX..XXXXXXX 100644 | 70 | * |
52 | --- a/net/socket.c | 71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) |
53 | +++ b/net/socket.c | 72 | return g_tree_lookup(tree->tree, map); |
54 | @@ -XXX,XX +XXX,XX @@ static void net_socket_send(void *opaque) | ||
55 | s->fd = -1; | ||
56 | net_socket_rs_init(&s->rs, net_socket_rs_finalize, false); | ||
57 | s->nc.link_down = true; | ||
58 | - g_free(s->nc.info_str); | ||
59 | - s->nc.info_str = g_new0(char, 1); | ||
60 | |||
61 | return; | ||
62 | } | ||
63 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_dgram(NetClientState *peer, | ||
64 | stored->mcast = g_strdup(mcast); | ||
65 | |||
66 | s->dgram_dst = saddr; | ||
67 | - nc->info_str = g_strdup_printf("socket: fd=%d (cloned mcast=%s:%d)", | ||
68 | - fd, inet_ntoa(saddr.sin_addr), | ||
69 | - ntohs(saddr.sin_port)); | ||
70 | } else { | ||
71 | if (sa_type == SOCKET_ADDRESS_TYPE_UNIX) { | ||
72 | s->dgram_dst.sin_family = AF_UNIX; | ||
73 | } | ||
74 | - | ||
75 | - nc->info_str = g_strdup_printf("socket: fd=%d %s", | ||
76 | - fd, SocketAddressType_str(sa_type)); | ||
77 | } | ||
78 | |||
79 | return s; | ||
80 | @@ -XXX,XX +XXX,XX @@ static NetSocketState *net_socket_fd_init_stream(NetClientState *peer, | ||
81 | |||
82 | nc = qemu_new_net_client(&net_socket_info, peer, model, name); | ||
83 | |||
84 | - nc->info_str = g_strdup_printf("socket: fd=%d", fd); | ||
85 | - | ||
86 | s = DO_UPCAST(NetSocketState, nc, nc); | ||
87 | |||
88 | s->fd = fd; | ||
89 | @@ -XXX,XX +XXX,XX @@ static void net_socket_accept(void *opaque) | ||
90 | |||
91 | stored->has_fd = true; | ||
92 | stored->fd = g_strdup_printf("%d", fd); | ||
93 | - | ||
94 | - g_free(s->nc.info_str); | ||
95 | - s->nc.info_str = g_strdup_printf("socket: connection from %s:%d", | ||
96 | - inet_ntoa(saddr.sin_addr), | ||
97 | - ntohs(saddr.sin_port)); | ||
98 | } | 73 | } |
99 | 74 | ||
100 | static int net_socket_listen_init(NetClientState *peer, | 75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, |
101 | @@ -XXX,XX +XXX,XX @@ static int net_socket_connect_init(NetClientState *peer, | 76 | + gpointer data) |
102 | stored->has_connect = true; | 77 | +{ |
103 | stored->connect = g_strdup(host_str); | 78 | + const DMAMap *map = key; |
104 | 79 | + IOVATreeFindIOVAArgs *args = data; | |
105 | - g_free(s->nc.info_str); | 80 | + const DMAMap *needle; |
106 | - s->nc.info_str = g_strdup_printf("socket: connect to %s:%d", | 81 | + |
107 | - inet_ntoa(saddr.sin_addr), | 82 | + g_assert(key == value); |
108 | - ntohs(saddr.sin_port)); | 83 | + |
109 | return 0; | 84 | + needle = args->needle; |
110 | } | 85 | + if (map->translated_addr + map->size < needle->translated_addr || |
111 | 86 | + needle->translated_addr + needle->size < map->translated_addr) { | |
112 | @@ -XXX,XX +XXX,XX @@ static int net_socket_mcast_init(NetClientState *peer, | 87 | + return false; |
113 | stored->localaddr = g_strdup(localaddr_str); | 88 | + } |
114 | } | 89 | + |
115 | 90 | + args->result = map; | |
116 | - g_free(s->nc.info_str); | 91 | + return true; |
117 | - s->nc.info_str = g_strdup_printf("socket: mcast=%s:%d", | 92 | +} |
118 | - inet_ntoa(saddr.sin_addr), | 93 | + |
119 | - ntohs(saddr.sin_port)); | 94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) |
120 | return 0; | 95 | +{ |
121 | - | 96 | + IOVATreeFindIOVAArgs args = { |
122 | } | 97 | + .needle = map, |
123 | 98 | + }; | |
124 | static int net_socket_udp_init(NetClientState *peer, | 99 | + |
125 | @@ -XXX,XX +XXX,XX @@ static int net_socket_udp_init(NetClientState *peer, | 100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); |
126 | stored->has_udp = true; | 101 | + return args.result; |
127 | stored->udp = g_strdup(rhost); | 102 | +} |
128 | 103 | + | |
129 | - g_free(s->nc.info_str); | 104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) |
130 | - s->nc.info_str = g_strdup_printf("socket: udp=%s:%d", | 105 | { |
131 | - inet_ntoa(raddr.sin_addr), | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
132 | - ntohs(raddr.sin_port)); | ||
133 | return 0; | ||
134 | } | ||
135 | |||
136 | diff --git a/net/tap-win32.c b/net/tap-win32.c | ||
137 | index XXXXXXX..XXXXXXX 100644 | ||
138 | --- a/net/tap-win32.c | ||
139 | +++ b/net/tap-win32.c | ||
140 | @@ -XXX,XX +XXX,XX @@ static int tap_win32_init(NetClientState *peer, const char *model, | ||
141 | stored->has_ifname = true; | ||
142 | stored->ifname = g_strdup(ifname); | ||
143 | |||
144 | - s->nc.info_str = g_strdup_printf("tap: ifname=%s", ifname); | ||
145 | - | ||
146 | s->handle = handle; | ||
147 | |||
148 | qemu_add_wait_object(s->handle->tap_semaphore, tap_win32_send, s); | ||
149 | diff --git a/net/tap.c b/net/tap.c | ||
150 | index XXXXXXX..XXXXXXX 100644 | ||
151 | --- a/net/tap.c | ||
152 | +++ b/net/tap.c | ||
153 | @@ -XXX,XX +XXX,XX @@ int net_init_bridge(const Netdev *netdev, const char *name, | ||
154 | stored->helper = g_strdup(helper); | ||
155 | } | ||
156 | |||
157 | - s->nc.info_str = g_strdup_printf("helper=%s,br=%s", helper, br); | ||
158 | - | ||
159 | return 0; | ||
160 | } | ||
161 | |||
162 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
163 | stored->fds = g_strdup_printf("%s:%d", stored->fds, fd); | ||
164 | g_free(tmp_s); | ||
165 | } | ||
166 | - | ||
167 | - s->nc.info_str = g_strdup_printf("fd=%d", fd); | ||
168 | } else if (tap->has_helper) { | ||
169 | if (!stored->has_helper) { | ||
170 | stored->has_helper = true; | ||
171 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
172 | stored->br = tap->has_br ? g_strdup(tap->br) : | ||
173 | g_strdup(DEFAULT_BRIDGE_INTERFACE); | ||
174 | } | ||
175 | - | ||
176 | - s->nc.info_str = g_strdup_printf("helper=%s", tap->helper); | ||
177 | } else { | ||
178 | if (ifname && !stored->has_ifname) { | ||
179 | stored->has_ifname = true; | ||
180 | @@ -XXX,XX +XXX,XX @@ static void net_init_tap_one(const NetdevTapOptions *tap, NetClientState *peer, | ||
181 | stored->downscript = g_strdup(downscript); | ||
182 | } | ||
183 | |||
184 | - s->nc.info_str = g_strdup_printf("ifname=%s,script=%s,downscript=%s", | ||
185 | - ifname, script, downscript); | ||
186 | - | ||
187 | if (strcmp(downscript, "no") != 0) { | ||
188 | snprintf(s->down_script, sizeof(s->down_script), "%s", downscript); | ||
189 | snprintf(s->down_script_arg, sizeof(s->down_script_arg), | ||
190 | diff --git a/net/vde.c b/net/vde.c | ||
191 | index XXXXXXX..XXXXXXX 100644 | ||
192 | --- a/net/vde.c | ||
193 | +++ b/net/vde.c | ||
194 | @@ -XXX,XX +XXX,XX @@ static int net_vde_init(NetClientState *peer, const char *model, | ||
195 | |||
196 | nc = qemu_new_net_client(&net_vde_info, peer, model, name); | ||
197 | |||
198 | - nc->info_str = g_strdup_printf("sock=%s,fd=%d", sock, vde_datafd(vde)); | ||
199 | - | ||
200 | s = DO_UPCAST(VDEState, nc, nc); | ||
201 | |||
202 | s->vde = vde; | ||
203 | diff --git a/net/vhost-user.c b/net/vhost-user.c | ||
204 | index XXXXXXX..XXXXXXX 100644 | ||
205 | --- a/net/vhost-user.c | ||
206 | +++ b/net/vhost-user.c | ||
207 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_user_init(NetClientState *peer, const char *device, | ||
208 | user = g_new0(struct VhostUserState, 1); | ||
209 | for (i = 0; i < queues; i++) { | ||
210 | nc = qemu_new_net_client(&net_vhost_user_info, peer, device, name); | ||
211 | - nc->info_str = g_strdup_printf("vhost-user%d to %s", i, chr->label); | ||
212 | nc->queue_index = i; | ||
213 | if (!nc0) { | ||
214 | nc0 = nc; | ||
215 | diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c | ||
216 | index XXXXXXX..XXXXXXX 100644 | ||
217 | --- a/net/vhost-vdpa.c | ||
218 | +++ b/net/vhost-vdpa.c | ||
219 | @@ -XXX,XX +XXX,XX @@ static int net_vhost_vdpa_init(NetClientState *peer, const char *device, | ||
220 | stored->has_queues = true; | ||
221 | stored->queues = 1; /* TODO: change when support multiqueue */ | ||
222 | |||
223 | - nc->info_str = g_strdup_printf(TYPE_VHOST_VDPA); | ||
224 | nc->queue_index = 0; | ||
225 | s = DO_UPCAST(VhostVDPAState, nc, nc); | ||
226 | vdpa_device_fd = qemu_open_old(vhostdev, O_RDWR); | ||
227 | -- | 107 | -- |
228 | 2.7.4 | 108 | 2.7.4 |
229 | 109 | ||
230 | 110 | diff view generated by jsdifflib |
1 | From: Alexey Kirillov <lekiravi@yandex-team.ru> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | A simply qtest that checks for correct number of netdevs in the response | 3 | This tree is able to look for a translated address from an IOVA address. |
4 | of the query-netdev. | ||
5 | 4 | ||
6 | Signed-off-by: Alexey Kirillov <lekiravi@yandex-team.ru> | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on |
7 | Acked-by: Thomas Huth <thuth@redhat.com> | 6 | devices with limited IOVA space need more capabilities, like allocating |
7 | IOVA chunks or performing reverse translations (qemu addresses to iova). | ||
8 | |||
9 | The allocation capability, as "assign a free IOVA address to this chunk | ||
10 | of memory in qemu's address space" allows shadow virtqueue to create a | ||
11 | new address space that is not restricted by guest's addressable one, so | ||
12 | we can allocate shadow vqs vrings outside of it. | ||
13 | |||
14 | It duplicates the tree so it can search efficiently in both directions, | ||
15 | and it will signal overlap if iova or the translated address is present | ||
16 | in any tree. | ||
17 | |||
18 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
19 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 21 | --- |
10 | tests/qtest/meson.build | 3 + | 22 | hw/virtio/meson.build | 2 +- |
11 | tests/qtest/test-query-netdev.c | 120 ++++++++++++++++++++++++++++++++++++++++ | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
12 | 2 files changed, 123 insertions(+) | 24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ |
13 | create mode 100644 tests/qtest/test-query-netdev.c | 25 | 3 files changed, 138 insertions(+), 1 deletion(-) |
26 | create mode 100644 hw/virtio/vhost-iova-tree.c | ||
27 | create mode 100644 hw/virtio/vhost-iova-tree.h | ||
14 | 28 | ||
15 | diff --git a/tests/qtest/meson.build b/tests/qtest/meson.build | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
16 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
17 | --- a/tests/qtest/meson.build | 31 | --- a/hw/virtio/meson.build |
18 | +++ b/tests/qtest/meson.build | 32 | +++ b/hw/virtio/meson.build |
19 | @@ -XXX,XX +XXX,XX @@ qtests_generic = [ | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
20 | if config_host.has_key('CONFIG_MODULES') | 34 | |
21 | qtests_generic += [ 'modules-test' ] | 35 | virtio_ss = ss.source_set() |
22 | endif | 36 | virtio_ss.add(files('virtio.c')) |
23 | +if slirp.found() | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
24 | + qtests_generic += [ 'test-query-netdev' ] | 38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) |
25 | +endif | 39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
26 | 40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) | |
27 | qtests_pci = \ | 41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
28 | (config_all_devices.has_key('CONFIG_VGA') ? ['display-vga-test'] : []) + \ | 42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c |
29 | diff --git a/tests/qtest/test-query-netdev.c b/tests/qtest/test-query-netdev.c | ||
30 | new file mode 100644 | 43 | new file mode 100644 |
31 | index XXXXXXX..XXXXXXX | 44 | index XXXXXXX..XXXXXXX |
32 | --- /dev/null | 45 | --- /dev/null |
33 | +++ b/tests/qtest/test-query-netdev.c | 46 | +++ b/hw/virtio/vhost-iova-tree.c |
34 | @@ -XXX,XX +XXX,XX @@ | 47 | @@ -XXX,XX +XXX,XX @@ |
35 | +/* | 48 | +/* |
36 | + * QTest testcase for the query-netdev | 49 | + * vhost software live migration iova tree |
37 | + * | 50 | + * |
38 | + * Copyright Yandex N.V., 2019 | 51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
39 | + * | 53 | + * |
40 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | 54 | + * SPDX-License-Identifier: GPL-2.0-or-later |
41 | + * See the COPYING file in the top-level directory. | ||
42 | + * | ||
43 | + */ | 55 | + */ |
44 | + | 56 | + |
45 | +#include "qemu/osdep.h" | 57 | +#include "qemu/osdep.h" |
58 | +#include "qemu/iova-tree.h" | ||
59 | +#include "vhost-iova-tree.h" | ||
46 | + | 60 | + |
47 | +#include "libqos/libqtest.h" | 61 | +#define iova_min_addr qemu_real_host_page_size |
48 | +#include "qapi/qmp/qdict.h" | ||
49 | +#include "qapi/qmp/qlist.h" | ||
50 | + | 62 | + |
51 | +/* | 63 | +/** |
52 | + * Events can get in the way of responses we are actually waiting for. | 64 | + * VhostIOVATree, able to: |
65 | + * - Translate iova address | ||
66 | + * - Reverse translate iova address (from translated to iova) | ||
67 | + * - Allocate IOVA regions for translated range (linear operation) | ||
53 | + */ | 68 | + */ |
54 | +GCC_FMT_ATTR(2, 3) | 69 | +struct VhostIOVATree { |
55 | +static QObject *wait_command(QTestState *who, const char *command, ...) | 70 | + /* First addressable iova address in the device */ |
71 | + uint64_t iova_first; | ||
72 | + | ||
73 | + /* Last addressable iova address in the device */ | ||
74 | + uint64_t iova_last; | ||
75 | + | ||
76 | + /* IOVA address to qemu memory maps. */ | ||
77 | + IOVATree *iova_taddr_map; | ||
78 | +}; | ||
79 | + | ||
80 | +/** | ||
81 | + * Create a new IOVA tree | ||
82 | + * | ||
83 | + * Returns the new IOVA tree | ||
84 | + */ | ||
85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) | ||
56 | +{ | 86 | +{ |
57 | + va_list ap; | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
58 | + QDict *response; | ||
59 | + QObject *result; | ||
60 | + | 88 | + |
61 | + va_start(ap, command); | 89 | + /* Some devices do not like 0 addresses */ |
62 | + qtest_qmp_vsend(who, command, ap); | 90 | + tree->iova_first = MAX(iova_first, iova_min_addr); |
63 | + va_end(ap); | 91 | + tree->iova_last = iova_last; |
64 | + | 92 | + |
65 | + response = qtest_qmp_receive(who); | 93 | + tree->iova_taddr_map = iova_tree_new(); |
66 | + | 94 | + return tree; |
67 | + result = qdict_get(response, "return"); | ||
68 | + g_assert(result); | ||
69 | + qobject_ref(result); | ||
70 | + qobject_unref(response); | ||
71 | + | ||
72 | + return result; | ||
73 | +} | 95 | +} |
74 | + | 96 | + |
75 | +static void qmp_query_netdev_no_error(QTestState *qts, size_t netdevs_count) | 97 | +/** |
98 | + * Delete an iova tree | ||
99 | + */ | ||
100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) | ||
76 | +{ | 101 | +{ |
77 | + QObject *resp; | 102 | + iova_tree_destroy(iova_tree->iova_taddr_map); |
78 | + QList *netdevs; | 103 | + g_free(iova_tree); |
79 | + | ||
80 | + resp = wait_command(qts, "{'execute': 'query-netdev'}"); | ||
81 | + | ||
82 | + netdevs = qobject_to(QList, resp); | ||
83 | + g_assert(netdevs); | ||
84 | + g_assert(qlist_size(netdevs) == netdevs_count); | ||
85 | + | ||
86 | + qobject_unref(resp); | ||
87 | +} | 104 | +} |
88 | + | 105 | + |
89 | +static void test_query_netdev(void) | 106 | +/** |
107 | + * Find the IOVA address stored from a memory address | ||
108 | + * | ||
109 | + * @tree: The iova tree | ||
110 | + * @map: The map with the memory address | ||
111 | + * | ||
112 | + * Return the stored mapping, or NULL if not found. | ||
113 | + */ | ||
114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, | ||
115 | + const DMAMap *map) | ||
90 | +{ | 116 | +{ |
91 | + const char *arch = qtest_get_arch(); | 117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); |
92 | + QObject *resp; | ||
93 | + QTestState *state; | ||
94 | + | ||
95 | + /* Choosing machine for platforms without default one */ | ||
96 | + if (g_str_equal(arch, "arm") || | ||
97 | + g_str_equal(arch, "aarch64")) { | ||
98 | + state = qtest_init( | ||
99 | + "-nodefaults " | ||
100 | + "-M virt " | ||
101 | + "-netdev user,id=slirp0"); | ||
102 | + } else if (g_str_equal(arch, "tricore")) { | ||
103 | + state = qtest_init( | ||
104 | + "-nodefaults " | ||
105 | + "-M tricore_testboard " | ||
106 | + "-netdev user,id=slirp0"); | ||
107 | + } else if (g_str_equal(arch, "avr")) { | ||
108 | + state = qtest_init( | ||
109 | + "-nodefaults " | ||
110 | + "-M mega2560 " | ||
111 | + "-netdev user,id=slirp0"); | ||
112 | + } else if (g_str_equal(arch, "rx")) { | ||
113 | + state = qtest_init( | ||
114 | + "-nodefaults " | ||
115 | + "-M gdbsim-r5f562n8 " | ||
116 | + "-netdev user,id=slirp0"); | ||
117 | + } else { | ||
118 | + state = qtest_init( | ||
119 | + "-nodefaults " | ||
120 | + "-netdev user,id=slirp0"); | ||
121 | + } | ||
122 | + g_assert(state); | ||
123 | + | ||
124 | + qmp_query_netdev_no_error(state, 1); | ||
125 | + | ||
126 | + resp = wait_command(state, | ||
127 | + "{'execute': 'netdev_add', 'arguments': {" | ||
128 | + " 'id': 'slirp1'," | ||
129 | + " 'type': 'user'}}"); | ||
130 | + qobject_unref(resp); | ||
131 | + | ||
132 | + qmp_query_netdev_no_error(state, 2); | ||
133 | + | ||
134 | + resp = wait_command(state, | ||
135 | + "{'execute': 'netdev_del', 'arguments': {" | ||
136 | + " 'id': 'slirp1'}}"); | ||
137 | + qobject_unref(resp); | ||
138 | + | ||
139 | + qmp_query_netdev_no_error(state, 1); | ||
140 | + | ||
141 | + qtest_quit(state); | ||
142 | +} | 118 | +} |
143 | + | 119 | + |
144 | +int main(int argc, char **argv) | 120 | +/** |
121 | + * Allocate a new mapping | ||
122 | + * | ||
123 | + * @tree: The iova tree | ||
124 | + * @map: The iova map | ||
125 | + * | ||
126 | + * Returns: | ||
127 | + * - IOVA_OK if the map fits in the container | ||
128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) | ||
129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. | ||
130 | + * | ||
131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. | ||
132 | + */ | ||
133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) | ||
145 | +{ | 134 | +{ |
146 | + int ret = 0; | 135 | + /* Some vhost devices do not like addr 0. Skip first page */ |
147 | + g_test_init(&argc, &argv, NULL); | 136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; |
148 | + | 137 | + |
149 | + qtest_add_func("/net/qapi/query_netdev", test_query_netdev); | 138 | + if (map->translated_addr + map->size < map->translated_addr || |
139 | + map->perm == IOMMU_NONE) { | ||
140 | + return IOVA_ERR_INVALID; | ||
141 | + } | ||
150 | + | 142 | + |
151 | + ret = g_test_run(); | 143 | + /* Allocate a node in IOVA address */ |
144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, | ||
145 | + tree->iova_last); | ||
146 | +} | ||
152 | + | 147 | + |
153 | + return ret; | 148 | +/** |
149 | + * Remove existing mappings from iova tree | ||
150 | + * | ||
151 | + * @iova_tree: The vhost iova tree | ||
152 | + * @map: The map to remove | ||
153 | + */ | ||
154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) | ||
155 | +{ | ||
156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); | ||
154 | +} | 157 | +} |
158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h | ||
159 | new file mode 100644 | ||
160 | index XXXXXXX..XXXXXXX | ||
161 | --- /dev/null | ||
162 | +++ b/hw/virtio/vhost-iova-tree.h | ||
163 | @@ -XXX,XX +XXX,XX @@ | ||
164 | +/* | ||
165 | + * vhost software live migration iova tree | ||
166 | + * | ||
167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
169 | + * | ||
170 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
171 | + */ | ||
172 | + | ||
173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H | ||
174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H | ||
175 | + | ||
176 | +#include "qemu/iova-tree.h" | ||
177 | +#include "exec/memory.h" | ||
178 | + | ||
179 | +typedef struct VhostIOVATree VhostIOVATree; | ||
180 | + | ||
181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); | ||
182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); | ||
183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); | ||
184 | + | ||
185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, | ||
186 | + const DMAMap *map); | ||
187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); | ||
188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); | ||
189 | + | ||
190 | +#endif | ||
155 | -- | 191 | -- |
156 | 2.7.4 | 192 | 2.7.4 |
157 | 193 | ||
158 | 194 | diff view generated by jsdifflib |
1 | From: Alexander Bulekov <alxndr@bu.edu> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | This patch switches to use qemu_receive_packet() which can detect | 3 | Use translations added in VhostIOVATree in SVQ. |
4 | reentrancy and return early. | 4 | |
5 | 5 | Only introduce usage here, not allocation and deallocation. As with | |
6 | This is intended to address CVE-2021-3416. | 6 | previous patches, we use the dead code paths of shadow_vqs_enabled to |
7 | 7 | avoid committing too many changes at once. These are impossible to take | |
8 | Cc: Prasad J Pandit <ppandit@redhat.com> | 8 | at the moment. |
9 | Cc: qemu-stable@nongnu.org | 9 | |
10 | Buglink: https://bugs.launchpad.net/qemu/+bug/1910826 | 10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
11 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | Signed-off-by: Alexander Bulekov <alxndr@bu.edu> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 12 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 13 | --- |
15 | hw/net/rtl8139.c | 2 +- | 14 | hw/virtio/vhost-shadow-virtqueue.c | 75 +++++++++++++++++++++-- |
16 | 1 file changed, 1 insertion(+), 1 deletion(-) | 15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- |
17 | 16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ | |
18 | diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c | 17 | include/hw/virtio/vhost-vdpa.h | 3 + |
18 | 4 files changed, 181 insertions(+), 25 deletions(-) | ||
19 | |||
20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
19 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/hw/net/rtl8139.c | 22 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
21 | +++ b/hw/net/rtl8139.c | 23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
22 | @@ -XXX,XX +XXX,XX @@ static void rtl8139_transfer_frame(RTL8139State *s, uint8_t *buf, int size, | 24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); | ||
26 | } | ||
27 | |||
28 | +/** | ||
29 | + * Translate addresses between the qemu's virtual address and the SVQ IOVA | ||
30 | + * | ||
31 | + * @svq: Shadow VirtQueue | ||
32 | + * @vaddr: Translated IOVA addresses | ||
33 | + * @iovec: Source qemu's VA addresses | ||
34 | + * @num: Length of iovec and minimum length of vaddr | ||
35 | + */ | ||
36 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, | ||
37 | + void **addrs, const struct iovec *iovec, | ||
38 | + size_t num) | ||
39 | +{ | ||
40 | + if (num == 0) { | ||
41 | + return true; | ||
42 | + } | ||
43 | + | ||
44 | + for (size_t i = 0; i < num; ++i) { | ||
45 | + DMAMap needle = { | ||
46 | + .translated_addr = (hwaddr)iovec[i].iov_base, | ||
47 | + .size = iovec[i].iov_len, | ||
48 | + }; | ||
49 | + size_t off; | ||
50 | + | ||
51 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); | ||
52 | + /* | ||
53 | + * Map cannot be NULL since iova map contains all guest space and | ||
54 | + * qemu already has a physical address mapped | ||
55 | + */ | ||
56 | + if (unlikely(!map)) { | ||
57 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
58 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", | ||
59 | + needle.translated_addr); | ||
60 | + return false; | ||
61 | + } | ||
62 | + | ||
63 | + off = needle.translated_addr - map->translated_addr; | ||
64 | + addrs[i] = (void *)(map->iova + off); | ||
65 | + | ||
66 | + if (unlikely(int128_gt(int128_add(needle.translated_addr, | ||
67 | + iovec[i].iov_len), | ||
68 | + map->translated_addr + map->size))) { | ||
69 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
70 | + "Guest buffer expands over iova range"); | ||
71 | + return false; | ||
72 | + } | ||
73 | + } | ||
74 | + | ||
75 | + return true; | ||
76 | +} | ||
77 | + | ||
78 | static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
79 | + void * const *sg, | ||
80 | const struct iovec *iovec, | ||
81 | size_t num, bool more_descs, bool write) | ||
82 | { | ||
83 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
84 | } else { | ||
85 | descs[i].flags = flags; | ||
23 | } | 86 | } |
24 | 87 | - descs[i].addr = cpu_to_le64((hwaddr)iovec[n].iov_base); | |
25 | DPRINTF("+++ transmit loopback mode\n"); | 88 | + descs[i].addr = cpu_to_le64((hwaddr)sg[n]); |
26 | - rtl8139_do_receive(qemu_get_queue(s->nic), buf, size, do_interrupt); | 89 | descs[i].len = cpu_to_le32(iovec[n].iov_len); |
27 | + qemu_receive_packet(qemu_get_queue(s->nic), buf, size); | 90 | |
28 | 91 | last = i; | |
29 | if (iov) { | 92 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, |
30 | g_free(buf2); | 93 | { |
94 | unsigned avail_idx; | ||
95 | vring_avail_t *avail = svq->vring.avail; | ||
96 | + bool ok; | ||
97 | + g_autofree void **sgs = g_new(void *, MAX(elem->out_num, elem->in_num)); | ||
98 | |||
99 | *head = svq->free_head; | ||
100 | |||
101 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
102 | return false; | ||
103 | } | ||
104 | |||
105 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, | ||
106 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
107 | + if (unlikely(!ok)) { | ||
108 | + return false; | ||
109 | + } | ||
110 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
111 | elem->in_num > 0, false); | ||
112 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
113 | + | ||
114 | + | ||
115 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
116 | + if (unlikely(!ok)) { | ||
117 | + return false; | ||
118 | + } | ||
119 | + | ||
120 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
121 | |||
122 | /* | ||
123 | * Put the entry in the available array (but don't update avail->idx until | ||
124 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
125 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
126 | * shadow methods and file descriptors. | ||
127 | * | ||
128 | + * @iova_tree: Tree to perform descriptors translations | ||
129 | + * | ||
130 | * Returns the new virtqueue or NULL. | ||
131 | * | ||
132 | * In case of error, reason is reported through error_report. | ||
133 | */ | ||
134 | -VhostShadowVirtqueue *vhost_svq_new(void) | ||
135 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | ||
136 | { | ||
137 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
138 | int r; | ||
139 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
140 | |||
141 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
142 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
143 | + svq->iova_tree = iova_tree; | ||
144 | return g_steal_pointer(&svq); | ||
145 | |||
146 | err_init_hdev_call: | ||
147 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
148 | index XXXXXXX..XXXXXXX 100644 | ||
149 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
150 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
151 | @@ -XXX,XX +XXX,XX @@ | ||
152 | #include "qemu/event_notifier.h" | ||
153 | #include "hw/virtio/virtio.h" | ||
154 | #include "standard-headers/linux/vhost_types.h" | ||
155 | +#include "hw/virtio/vhost-iova-tree.h" | ||
156 | |||
157 | /* Shadow virtqueue to relay notifications */ | ||
158 | typedef struct VhostShadowVirtqueue { | ||
159 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
160 | /* Virtio device */ | ||
161 | VirtIODevice *vdev; | ||
162 | |||
163 | + /* IOVA mapping */ | ||
164 | + VhostIOVATree *iova_tree; | ||
165 | + | ||
166 | /* Map for use the guest's descriptors */ | ||
167 | VirtQueueElement **ring_id_maps; | ||
168 | |||
169 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
170 | VirtQueue *vq); | ||
171 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
172 | |||
173 | -VhostShadowVirtqueue *vhost_svq_new(void); | ||
174 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | ||
175 | |||
176 | void vhost_svq_free(gpointer vq); | ||
177 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
178 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
179 | index XXXXXXX..XXXXXXX 100644 | ||
180 | --- a/hw/virtio/vhost-vdpa.c | ||
181 | +++ b/hw/virtio/vhost-vdpa.c | ||
182 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
183 | vaddr, section->readonly); | ||
184 | |||
185 | llsize = int128_sub(llend, int128_make64(iova)); | ||
186 | + if (v->shadow_vqs_enabled) { | ||
187 | + DMAMap mem_region = { | ||
188 | + .translated_addr = (hwaddr)vaddr, | ||
189 | + .size = int128_get64(llsize) - 1, | ||
190 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
191 | + }; | ||
192 | + | ||
193 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
194 | + if (unlikely(r != IOVA_OK)) { | ||
195 | + error_report("Can't allocate a mapping (%d)", r); | ||
196 | + goto fail; | ||
197 | + } | ||
198 | + | ||
199 | + iova = mem_region.iova; | ||
200 | + } | ||
201 | |||
202 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
203 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
204 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
205 | |||
206 | llsize = int128_sub(llend, int128_make64(iova)); | ||
207 | |||
208 | + if (v->shadow_vqs_enabled) { | ||
209 | + const DMAMap *result; | ||
210 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
211 | + section->offset_within_region + | ||
212 | + (iova - section->offset_within_address_space); | ||
213 | + DMAMap mem_region = { | ||
214 | + .translated_addr = (hwaddr)vaddr, | ||
215 | + .size = int128_get64(llsize) - 1, | ||
216 | + }; | ||
217 | + | ||
218 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
219 | + iova = result->iova; | ||
220 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
221 | + } | ||
222 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
223 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
224 | if (ret) { | ||
225 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
226 | |||
227 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
228 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
229 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
230 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | ||
231 | |||
232 | if (unlikely(!svq)) { | ||
233 | error_setg(errp, "Cannot create svq %u", n); | ||
234 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
235 | /** | ||
236 | * Unmap a SVQ area in the device | ||
237 | */ | ||
238 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
239 | - hwaddr size) | ||
240 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
241 | + const DMAMap *needle) | ||
242 | { | ||
243 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
244 | + hwaddr size; | ||
245 | int r; | ||
246 | |||
247 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
248 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
249 | + if (unlikely(!result)) { | ||
250 | + error_report("Unable to find SVQ address to unmap"); | ||
251 | + return false; | ||
252 | + } | ||
253 | + | ||
254 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
255 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
256 | return r == 0; | ||
257 | } | ||
258 | |||
259 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
260 | const VhostShadowVirtqueue *svq) | ||
261 | { | ||
262 | + DMAMap needle = {}; | ||
263 | struct vhost_vdpa *v = dev->opaque; | ||
264 | struct vhost_vring_addr svq_addr; | ||
265 | - size_t device_size = vhost_svq_device_area_size(svq); | ||
266 | - size_t driver_size = vhost_svq_driver_area_size(svq); | ||
267 | bool ok; | ||
268 | |||
269 | vhost_svq_get_vring_addr(svq, &svq_addr); | ||
270 | |||
271 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
272 | + needle.translated_addr = svq_addr.desc_user_addr; | ||
273 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); | ||
274 | if (unlikely(!ok)) { | ||
275 | return false; | ||
276 | } | ||
277 | |||
278 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
279 | + needle.translated_addr = svq_addr.used_user_addr; | ||
280 | + return vhost_vdpa_svq_unmap_ring(v, &needle); | ||
281 | +} | ||
282 | + | ||
283 | +/** | ||
284 | + * Map the SVQ area in the device | ||
285 | + * | ||
286 | + * @v: Vhost-vdpa device | ||
287 | + * @needle: The area to search iova | ||
288 | + * @errorp: Error pointer | ||
289 | + */ | ||
290 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | ||
291 | + Error **errp) | ||
292 | +{ | ||
293 | + int r; | ||
294 | + | ||
295 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); | ||
296 | + if (unlikely(r != IOVA_OK)) { | ||
297 | + error_setg(errp, "Cannot allocate iova (%d)", r); | ||
298 | + return false; | ||
299 | + } | ||
300 | + | ||
301 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
302 | + (void *)needle->translated_addr, | ||
303 | + needle->perm == IOMMU_RO); | ||
304 | + if (unlikely(r != 0)) { | ||
305 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
306 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
307 | + } | ||
308 | + | ||
309 | + return r == 0; | ||
310 | } | ||
311 | |||
312 | /** | ||
313 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
314 | struct vhost_vring_addr *addr, | ||
315 | Error **errp) | ||
316 | { | ||
317 | + DMAMap device_region, driver_region; | ||
318 | + struct vhost_vring_addr svq_addr; | ||
319 | struct vhost_vdpa *v = dev->opaque; | ||
320 | size_t device_size = vhost_svq_device_area_size(svq); | ||
321 | size_t driver_size = vhost_svq_driver_area_size(svq); | ||
322 | - int r; | ||
323 | + size_t avail_offset; | ||
324 | + bool ok; | ||
325 | |||
326 | ERRP_GUARD(); | ||
327 | - vhost_svq_get_vring_addr(svq, addr); | ||
328 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
329 | |||
330 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
331 | - (void *)addr->desc_user_addr, true); | ||
332 | - if (unlikely(r != 0)) { | ||
333 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
334 | + driver_region = (DMAMap) { | ||
335 | + .translated_addr = svq_addr.desc_user_addr, | ||
336 | + .size = driver_size - 1, | ||
337 | + .perm = IOMMU_RO, | ||
338 | + }; | ||
339 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
340 | + if (unlikely(!ok)) { | ||
341 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
342 | return false; | ||
343 | } | ||
344 | + addr->desc_user_addr = driver_region.iova; | ||
345 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | ||
346 | + addr->avail_user_addr = driver_region.iova + avail_offset; | ||
347 | |||
348 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
349 | - (void *)addr->used_user_addr, false); | ||
350 | - if (unlikely(r != 0)) { | ||
351 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
352 | + device_region = (DMAMap) { | ||
353 | + .translated_addr = svq_addr.used_user_addr, | ||
354 | + .size = device_size - 1, | ||
355 | + .perm = IOMMU_RW, | ||
356 | + }; | ||
357 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | ||
358 | + if (unlikely(!ok)) { | ||
359 | + error_prepend(errp, "Cannot create vq device region: "); | ||
360 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); | ||
361 | } | ||
362 | + addr->used_user_addr = device_region.iova; | ||
363 | |||
364 | - return r == 0; | ||
365 | + return ok; | ||
366 | } | ||
367 | |||
368 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
369 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
370 | index XXXXXXX..XXXXXXX 100644 | ||
371 | --- a/include/hw/virtio/vhost-vdpa.h | ||
372 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
373 | @@ -XXX,XX +XXX,XX @@ | ||
374 | |||
375 | #include <gmodule.h> | ||
376 | |||
377 | +#include "hw/virtio/vhost-iova-tree.h" | ||
378 | #include "hw/virtio/virtio.h" | ||
379 | #include "standard-headers/linux/vhost_types.h" | ||
380 | |||
381 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
382 | MemoryListener listener; | ||
383 | struct vhost_vdpa_iova_range iova_range; | ||
384 | bool shadow_vqs_enabled; | ||
385 | + /* IOVA mapping used by the Shadow Virtqueue */ | ||
386 | + VhostIOVATree *iova_tree; | ||
387 | GPtrArray *shadow_vqs; | ||
388 | struct vhost_dev *dev; | ||
389 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
31 | -- | 390 | -- |
32 | 2.7.4 | 391 | 2.7.4 |
33 | 392 | ||
34 | 393 | diff view generated by jsdifflib |
1 | This patch switches to use qemu_receive_packet_iov() which can detect | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | reentrancy and return early. | ||
3 | 2 | ||
4 | This is intended to address CVE-2021-3416. | 3 | This is needed to achieve migration, so the destination can restore its |
4 | index. | ||
5 | 5 | ||
6 | Cc: Prasad J Pandit <ppandit@redhat.com> | 6 | Setting base as last used idx, so destination will see as available all |
7 | Cc: qemu-stable@nongnu.org | 7 | the entries that the device did not use, including the in-flight |
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 8 | processing ones. |
9 | |||
10 | This is ok for networking, but other kinds of devices might have | ||
11 | problems with these retransmissions. | ||
12 | |||
13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 16 | --- |
11 | hw/net/net_tx_pkt.c | 2 +- | 17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ |
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | 18 | 1 file changed, 17 insertions(+) |
13 | 19 | ||
14 | diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c | 20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
15 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/net/net_tx_pkt.c | 22 | --- a/hw/virtio/vhost-vdpa.c |
17 | +++ b/hw/net/net_tx_pkt.c | 23 | +++ b/hw/virtio/vhost-vdpa.c |
18 | @@ -XXX,XX +XXX,XX @@ static inline void net_tx_pkt_sendv(struct NetTxPkt *pkt, | 24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
19 | NetClientState *nc, const struct iovec *iov, int iov_cnt) | 25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
26 | struct vhost_vring_state *ring) | ||
20 | { | 27 | { |
21 | if (pkt->is_loopback) { | 28 | + struct vhost_vdpa *v = dev->opaque; |
22 | - nc->info->receive_iov(nc, iov, iov_cnt); | 29 | int ret; |
23 | + qemu_receive_packet_iov(nc, iov, iov_cnt); | 30 | |
24 | } else { | 31 | + if (v->shadow_vqs_enabled) { |
25 | qemu_sendv_packet(nc, iov, iov_cnt); | 32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, |
26 | } | 33 | + ring->index); |
34 | + | ||
35 | + /* | ||
36 | + * Setting base as last used idx, so destination will see as available | ||
37 | + * all the entries that the device did not use, including the in-flight | ||
38 | + * processing ones. | ||
39 | + * | ||
40 | + * TODO: This is ok for networking, but other kinds of devices might | ||
41 | + * have problems with these retransmissions. | ||
42 | + */ | ||
43 | + ring->num = svq->last_used_idx; | ||
44 | + return 0; | ||
45 | + } | ||
46 | + | ||
47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); | ||
48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); | ||
49 | return ret; | ||
27 | -- | 50 | -- |
28 | 2.7.4 | 51 | 2.7.4 |
29 | 52 | ||
30 | 53 | diff view generated by jsdifflib |
1 | During process_tx_desc(), driver can try to chain data descriptor with | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | legacy descriptor, when will lead underflow for the following | ||
3 | calculation in process_tx_desc() for bytes: | ||
4 | 2 | ||
5 | if (tp->size + bytes > msh) | 3 | Setting the log address would make the device start reporting invalid |
6 | bytes = msh - tp->size; | 4 | dirty memory because the SVQ vrings are located in qemu's memory. |
7 | 5 | ||
8 | This will lead a infinite loop. So check and fail early if tp->size if | 6 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | greater or equal to msh. | 7 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
10 | |||
11 | Reported-by: Alexander Bulekov <alxndr@bu.edu> | ||
12 | Reported-by: Cheolwoo Myung <cwmyung@snu.ac.kr> | ||
13 | Reported-by: Ruhr-University Bochum <bugs-syssec@rub.de> | ||
14 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
15 | Cc: qemu-stable@nongnu.org | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
17 | --- | 9 | --- |
18 | hw/net/e1000.c | 4 ++++ | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
19 | 1 file changed, 4 insertions(+) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
20 | 12 | ||
21 | diff --git a/hw/net/e1000.c b/hw/net/e1000.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
22 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/hw/net/e1000.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
24 | +++ b/hw/net/e1000.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
25 | @@ -XXX,XX +XXX,XX @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
26 | msh = tp->tso_props.hdr_len + tp->tso_props.mss; | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
27 | do { | 19 | struct vhost_log *log) |
28 | bytes = split_size; | 20 | { |
29 | + if (tp->size >= msh) { | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
30 | + goto eop; | 22 | + struct vhost_vdpa *v = dev->opaque; |
31 | + } | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
32 | if (tp->size + bytes > msh) | 24 | return 0; |
33 | bytes = msh - tp->size; | ||
34 | |||
35 | @@ -XXX,XX +XXX,XX @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp) | ||
36 | tp->size += split_size; | ||
37 | } | 25 | } |
38 | 26 | ||
39 | +eop: | ||
40 | if (!(txd_lower & E1000_TXD_CMD_EOP)) | ||
41 | return; | ||
42 | if (!(tp->cptse && tp->size < tp->tso_props.hdr_len)) { | ||
43 | -- | 27 | -- |
44 | 2.7.4 | 28 | 2.7.4 |
45 | 29 | ||
46 | 30 | diff view generated by jsdifflib |
1 | From: Paolo Bonzini <pbonzini@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | When a network or network device is created from the command line or HMP, | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | QemuOpts ensures that the id passes the id_wellformed check. However, | 4 | block migration. |
5 | QMP skips this: | ||
6 | 5 | ||
7 | $ qemu-system-x86_64 -qmp stdio -S -nic user,id=123/456 | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
8 | qemu-system-x86_64: -nic user,id=123/456: Parameter id expects an identifier | 7 | enabled. Even if the device supports it, the reports would be nonsense |
9 | Identifiers consist of letters, digits, -, ., _, starting with a letter. | 8 | because SVQ memory is in the qemu region. |
10 | 9 | ||
11 | $ qemu-system-x86_64 -qmp stdio -S | 10 | The log region is still allocated. Future changes might skip that, but |
12 | {"execute":"qmp_capabilities"} | 11 | this series is already long enough. |
13 | {"return": {}} | ||
14 | {"execute":"netdev_add", "arguments": {"type": "user", "id": "123/456"}} | ||
15 | {"return": {}} | ||
16 | 12 | ||
17 | After: | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
18 | 14 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | |
19 | $ qemu-system-x86_64 -qmp stdio -S | ||
20 | {"execute":"qmp_capabilities"} | ||
21 | {"return": {}} | ||
22 | {"execute":"netdev_add", "arguments": {"type": "user", "id": "123/456"}} | ||
23 | {"error": {"class": "GenericError", "desc": "Parameter "id" expects an identifier"}} | ||
24 | |||
25 | Validity checks should be performed always at the bottom of the call chain, | ||
26 | because QMP skips all the steps above. At the same time we know that every | ||
27 | call chain should go through either QMP or (for legacy) through QemuOpts. | ||
28 | Because the id for -net and -nic is automatically generated and not | ||
29 | well-formed by design, just add the check to QMP. | ||
30 | |||
31 | Cc: Jason Wang <jasowang@redhat.com> | ||
32 | Signed-off-by: Paolo Bonzini <pbonzini@redhat.com> | ||
33 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
34 | --- | 16 | --- |
35 | net/net.c | 5 +++++ | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
36 | 1 file changed, 5 insertions(+) | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
19 | 2 files changed, 36 insertions(+), 4 deletions(-) | ||
37 | 20 | ||
38 | diff --git a/net/net.c b/net/net.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
39 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
40 | --- a/net/net.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
41 | +++ b/net/net.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
42 | @@ -XXX,XX +XXX,XX @@ void netdev_add(QemuOpts *opts, Error **errp) | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
43 | 26 | return v->index != 0; | |
44 | void qmp_netdev_add(Netdev *netdev, Error **errp) | 27 | } |
28 | |||
29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, | ||
30 | + uint64_t *features) | ||
31 | +{ | ||
32 | + int ret; | ||
33 | + | ||
34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
35 | + trace_vhost_vdpa_get_features(dev, *features); | ||
36 | + return ret; | ||
37 | +} | ||
38 | + | ||
39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
40 | Error **errp) | ||
45 | { | 41 | { |
46 | + if (!id_wellformed(netdev->id)) { | 42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, |
47 | + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "id", "an identifier"); | 43 | return 0; |
48 | + return; | 44 | } |
45 | |||
46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); | ||
48 | if (r != 0) { | ||
49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
50 | return r; | ||
51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | ||
52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | ||
53 | uint64_t features) | ||
54 | { | ||
55 | + struct vhost_vdpa *v = dev->opaque; | ||
56 | int ret; | ||
57 | |||
58 | if (vhost_vdpa_one_time_request(dev)) { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | + if (v->shadow_vqs_enabled) { | ||
63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | ||
64 | + /* | ||
65 | + * QEMU is just trying to enable or disable logging. SVQ handles | ||
66 | + * this sepparately, so no need to forward this. | ||
67 | + */ | ||
68 | + v->acked_features = features; | ||
69 | + return 0; | ||
70 | + } | ||
71 | + | ||
72 | + v->acked_features = features; | ||
73 | + | ||
74 | + /* We must not ack _F_LOG if SVQ is enabled */ | ||
75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); | ||
49 | + } | 76 | + } |
50 | + | 77 | + |
51 | net_client_init1(netdev, true, errp); | 78 | trace_vhost_vdpa_set_features(dev, features); |
79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); | ||
80 | if (ret) { | ||
81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
83 | uint64_t *features) | ||
84 | { | ||
85 | - int ret; | ||
86 | + struct vhost_vdpa *v = dev->opaque; | ||
87 | + int ret = vhost_vdpa_get_dev_features(dev, features); | ||
88 | + | ||
89 | + if (ret == 0 && v->shadow_vqs_enabled) { | ||
90 | + /* Add SVQ logging capabilities */ | ||
91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); | ||
92 | + } | ||
93 | |||
94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); | ||
95 | - trace_vhost_vdpa_get_features(dev, *features); | ||
96 | return ret; | ||
52 | } | 97 | } |
53 | 98 | ||
99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/include/hw/virtio/vhost-vdpa.h | ||
102 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
104 | bool iotlb_batch_begin_sent; | ||
105 | MemoryListener listener; | ||
106 | struct vhost_vdpa_iova_range iova_range; | ||
107 | + uint64_t acked_features; | ||
108 | bool shadow_vqs_enabled; | ||
109 | /* IOVA mapping used by the Shadow Virtqueue */ | ||
110 | VhostIOVATree *iova_tree; | ||
54 | -- | 111 | -- |
55 | 2.7.4 | 112 | 2.7.4 |
56 | 113 | ||
57 | 114 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | This patch switches to use qemu_receive_packet() which can detect | ||
2 | reentrancy and return early. | ||
3 | 1 | ||
4 | This is intended to address CVE-2021-3416. | ||
5 | |||
6 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
7 | Cc: qemu-stable@nongnu.org | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | hw/net/e1000.c | 2 +- | ||
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
13 | |||
14 | diff --git a/hw/net/e1000.c b/hw/net/e1000.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/net/e1000.c | ||
17 | +++ b/hw/net/e1000.c | ||
18 | @@ -XXX,XX +XXX,XX @@ e1000_send_packet(E1000State *s, const uint8_t *buf, int size) | ||
19 | |||
20 | NetClientState *nc = qemu_get_queue(s->nic); | ||
21 | if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) { | ||
22 | - nc->info->receive(nc, buf, size); | ||
23 | + qemu_receive_packet(nc, buf, size); | ||
24 | } else { | ||
25 | qemu_send_packet(nc, buf, size); | ||
26 | } | ||
27 | -- | ||
28 | 2.7.4 | ||
29 | |||
30 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | This patch switches to use qemu_receive_packet() which can detect | ||
2 | reentrancy and return early. | ||
3 | 1 | ||
4 | This is intended to address CVE-2021-3416. | ||
5 | |||
6 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
7 | Cc: qemu-stable@nongnu.org | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | hw/net/dp8393x.c | 2 +- | ||
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
13 | |||
14 | diff --git a/hw/net/dp8393x.c b/hw/net/dp8393x.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/net/dp8393x.c | ||
17 | +++ b/hw/net/dp8393x.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void dp8393x_do_transmit_packets(dp8393xState *s) | ||
19 | s->regs[SONIC_TCR] |= SONIC_TCR_CRSL; | ||
20 | if (nc->info->can_receive(nc)) { | ||
21 | s->loopback_packet = 1; | ||
22 | - nc->info->receive(nc, s->tx_buffer, tx_len); | ||
23 | + qemu_receive_packet(nc, s->tx_buffer, tx_len); | ||
24 | } | ||
25 | } else { | ||
26 | /* Transmit packet */ | ||
27 | -- | ||
28 | 2.7.4 | ||
29 | |||
30 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | This patch switches to use qemu_receive_packet() which can detect | ||
2 | reentrancy and return early. | ||
3 | 1 | ||
4 | This is intended to address CVE-2021-3416. | ||
5 | |||
6 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
7 | Cc: qemu-stable@nongnu.org | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
10 | --- | ||
11 | hw/net/msf2-emac.c | 2 +- | ||
12 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
13 | |||
14 | diff --git a/hw/net/msf2-emac.c b/hw/net/msf2-emac.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/hw/net/msf2-emac.c | ||
17 | +++ b/hw/net/msf2-emac.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void msf2_dma_tx(MSF2EmacState *s) | ||
19 | * R_CFG1 bit 0 is set. | ||
20 | */ | ||
21 | if (s->regs[R_CFG1] & R_CFG1_LB_EN_MASK) { | ||
22 | - nc->info->receive(nc, buf, size); | ||
23 | + qemu_receive_packet(nc, buf, size); | ||
24 | } else { | ||
25 | qemu_send_packet(nc, buf, size); | ||
26 | } | ||
27 | -- | ||
28 | 2.7.4 | ||
29 | |||
30 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | This patch switches to use qemu_receive_packet() which can detect | ||
2 | reentrancy and return early. | ||
3 | 1 | ||
4 | This is intended to address CVE-2021-3416. | ||
5 | |||
6 | Cc: Prasad J Pandit <ppandit@redhat.com> | ||
7 | Cc: qemu-stable@nongnu.org | ||
8 | Reviewed-by: Mark Cave-Ayland <mark.cave-ayland@ilande.co.uk> | ||
9 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
10 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | hw/net/sungem.c | 2 +- | ||
14 | 1 file changed, 1 insertion(+), 1 deletion(-) | ||
15 | |||
16 | diff --git a/hw/net/sungem.c b/hw/net/sungem.c | ||
17 | index XXXXXXX..XXXXXXX 100644 | ||
18 | --- a/hw/net/sungem.c | ||
19 | +++ b/hw/net/sungem.c | ||
20 | @@ -XXX,XX +XXX,XX @@ static void sungem_send_packet(SunGEMState *s, const uint8_t *buf, | ||
21 | NetClientState *nc = qemu_get_queue(s->nic); | ||
22 | |||
23 | if (s->macregs[MAC_XIFCFG >> 2] & MAC_XIFCFG_LBCK) { | ||
24 | - nc->info->receive(nc, buf, size); | ||
25 | + qemu_receive_packet(nc, buf, size); | ||
26 | } else { | ||
27 | qemu_send_packet(nc, buf, size); | ||
28 | } | ||
29 | -- | ||
30 | 2.7.4 | ||
31 | |||
32 | diff view generated by jsdifflib |