The following changes since commit 352998df1c53b366413690d95b35f76d0721ebed:

  Merge tag 'i2c-20220314' of https://github.com/philmd/qemu into staging (2022-03-14 14:39:33 +0000)

are available in the git repository at:

  https://github.com/jasowang/qemu.git tags/net-pull-request

for you to fetch changes up to 12a195fa343aae2ead1301ce04727bd0ae25eb15:

  vdpa: Expose VHOST_F_LOG_ALL on SVQ (2022-03-15 13:57:44 +0800)

----------------------------------------------------------------

Changes since V2:
- fix 32bit build errors

----------------------------------------------------------------
Eugenio Pérez (14):
      vhost: Add VhostShadowVirtqueue
      vhost: Add Shadow VirtQueue kick forwarding capabilities
      vhost: Add Shadow VirtQueue call forwarding capabilities
      vhost: Add vhost_svq_valid_features to shadow vq
      virtio: Add vhost_svq_get_vring_addr
      vdpa: adapt vhost_ops callbacks to svq
      vhost: Shadow virtqueue buffers forwarding
      util: Add iova_tree_alloc_map
      util: add iova_tree_find_iova
      vhost: Add VhostIOVATree
      vdpa: Add custom IOTLB translations to SVQ
      vdpa: Adapt vhost_vdpa_get_vring_base to SVQ
      vdpa: Never set log_base addr if SVQ is enabled
      vdpa: Expose VHOST_F_LOG_ALL on SVQ

Jason Wang (1):
      virtio-net: fix map leaking on error during receive

 hw/net/virtio-net.c                |   1 +
 hw/virtio/meson.build              |   2 +-
 hw/virtio/vhost-iova-tree.c        | 110 +++++++
 hw/virtio/vhost-iova-tree.h        |  27 ++
 hw/virtio/vhost-shadow-virtqueue.c | 636 +++++++++++++++++++++++++++++++++++++
 hw/virtio/vhost-shadow-virtqueue.h |  87 +++++
 hw/virtio/vhost-vdpa.c             | 522 +++++++++++++++++++++++++++++-
 include/hw/virtio/vhost-vdpa.h     |   8 +
 include/qemu/iova-tree.h           |  38 ++-
 util/iova-tree.c                   | 170 ++++++++++
 10 files changed, 1584 insertions(+), 17 deletions(-)
 create mode 100644 hw/virtio/vhost-iova-tree.c
 create mode 100644 hw/virtio/vhost-iova-tree.h
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.c
 create mode 100644 hw/virtio/vhost-shadow-virtqueue.h

Commit bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
tries to fix the use after free of the sg by caching the virtqueue
elements in an array and unmapping them at once after receiving the
packets, but it forgot to unmap the cached elements on error, which
leads to a leak of the mappings and other unexpected results.

Fix this by detaching the cached elements on error. This addresses
CVE-2022-26353.

Reported-by: Victor Tom <vv474172261@gmail.com>
Cc: qemu-stable@nongnu.org
Fixes: CVE-2022-26353
Fixes: bedd7e93d0196 ("virtio-net: fix use after unmap/free for sg")
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index XXXXXXX..XXXXXXX 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -XXX,XX +XXX,XX @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
 
 err:
     for (j = 0; j < i; j++) {
+        virtqueue_detach_element(q->rx_vq, elems[j], lens[j]);
         g_free(elems[j]);
     }
 
--
2.7.4

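To make the one-line fix above easier to follow, here is an illustrative sketch of the
receive-batching pattern it repairs. It is not code from hw/net/virtio-net.c -- BATCH and
copy_packet_data() are made-up placeholders -- but it shows why every element popped before
a failure must be detached again on the error path:

/*
 * Illustrative sketch only -- not the actual virtio-net receive path.
 * BATCH and copy_packet_data() are made-up placeholders.  The point is
 * the error path: every element popped before the failure still holds
 * guest memory mappings, so it must be detached before being freed.
 */
#include "qemu/osdep.h"
#include "hw/virtio/virtio.h"

#define BATCH 32

static ssize_t receive_batch_sketch(VirtQueue *vq, size_t need)
{
    VirtQueueElement *elems[BATCH];
    unsigned int lens[BATCH];
    unsigned int i, j;

    for (i = 0; i < BATCH && need > 0; i++) {
        elems[i] = virtqueue_pop(vq, sizeof(VirtQueueElement));
        if (!elems[i]) {
            goto err;                              /* out of buffers mid-batch */
        }
        lens[i] = copy_packet_data(elems[i]);      /* hypothetical helper */
        need -= MIN(need, lens[i]);
    }

    for (j = 0; j < i; j++) {
        virtqueue_fill(vq, elems[j], lens[j], j);  /* publish to the used ring */
        g_free(elems[j]);
    }
    virtqueue_flush(vq, i);
    return 0;

err:
    for (j = 0; j < i; j++) {
        /* unmap the guest buffers this element had mapped, then drop it */
        virtqueue_detach_element(vq, elems[j], lens[j]);
        g_free(elems[j]);
    }
    return -1;
}

virtqueue_detach_element() undoes the mapping side of virtqueue_pop(), which is exactly the
step the cached-elements error path was missing before this fix.
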
1 | From: Peter Maydell <peter.maydell@linaro.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Coverity points out (CID 1421926) that the read code for | 3 | Vhost shadow virtqueue (SVQ) is an intermediate jump for virtqueue |
4 | REG_ADDR_HIGH reads off the end of the buffer, because it does a | 4 | notifications and buffers, allowing qemu to track them. While qemu is |
5 | 32-bit read from byte 4 of a 6-byte buffer. | 5 | forwarding the buffers and virtqueue changes, it is able to commit the |
6 | memory that is being dirtied, the same way regular qemu VirtIO devices |
7 | do. | ||
6 | 8 | ||
7 | The code also has an endianness issue for both REG_ADDR_HIGH and | 9 | This commit only exposes basic SVQ allocation and free. Next patches of |
8 | REG_ADDR_LOW, because it will do the wrong thing on a big-endian | 10 | the series add functionality like notifications and buffers forwarding. |
9 | host. | ||
10 | 11 | ||
11 | Rewrite the read code to use ldl_le_p() and lduw_le_p() to fix this; | 12 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | the write code is not incorrect, but for consistency we make it use | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
13 | stl_le_p() and stw_le_p(). | ||
14 | |||
15 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
16 | Tested-by: Niek Linnenbank <nieklinnenbank@gmail.com> | ||
17 | Reviewed-by: Niek Linnenbank <nieklinnenbank@gmail.com> | ||
18 | Signed-off-by: Peter Maydell <peter.maydell@linaro.org> | ||
19 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
20 | --- | 15 | --- |
21 | hw/net/allwinner-sun8i-emac.c | 12 ++++-------- | 16 | hw/virtio/meson.build | 2 +- |
22 | 1 file changed, 4 insertions(+), 8 deletions(-) | 17 | hw/virtio/vhost-shadow-virtqueue.c | 62 ++++++++++++++++++++++++++++++++++++++ |
18 | hw/virtio/vhost-shadow-virtqueue.h | 28 +++++++++++++++++ | ||
19 | 3 files changed, 91 insertions(+), 1 deletion(-) | ||
20 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.c | ||
21 | create mode 100644 hw/virtio/vhost-shadow-virtqueue.h | ||
23 | 22 | ||
24 | diff --git a/hw/net/allwinner-sun8i-emac.c b/hw/net/allwinner-sun8i-emac.c | 23 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
25 | index XXXXXXX..XXXXXXX 100644 | 24 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/hw/net/allwinner-sun8i-emac.c | 25 | --- a/hw/virtio/meson.build |
27 | +++ b/hw/net/allwinner-sun8i-emac.c | 26 | +++ b/hw/virtio/meson.build |
28 | @@ -XXX,XX +XXX,XX @@ static uint64_t allwinner_sun8i_emac_read(void *opaque, hwaddr offset, | 27 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
29 | value = s->mii_data; | 28 | |
30 | break; | 29 | virtio_ss = ss.source_set() |
31 | case REG_ADDR_HIGH: /* MAC Address High */ | 30 | virtio_ss.add(files('virtio.c')) |
32 | - value = *(((uint32_t *) (s->conf.macaddr.a)) + 1); | 31 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c')) |
33 | + value = lduw_le_p(s->conf.macaddr.a + 4); | 32 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
34 | break; | 33 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
35 | case REG_ADDR_LOW: /* MAC Address Low */ | 34 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
36 | - value = *(uint32_t *) (s->conf.macaddr.a); | 35 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
37 | + value = ldl_le_p(s->conf.macaddr.a); | 36 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
38 | break; | 37 | new file mode 100644 |
39 | case REG_TX_DMA_STA: /* Transmit DMA Status */ | 38 | index XXXXXXX..XXXXXXX |
40 | break; | 39 | --- /dev/null |
41 | @@ -XXX,XX +XXX,XX @@ static void allwinner_sun8i_emac_write(void *opaque, hwaddr offset, | 40 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
42 | s->mii_data = value; | 41 | @@ -XXX,XX +XXX,XX @@ |
43 | break; | 42 | +/* |
44 | case REG_ADDR_HIGH: /* MAC Address High */ | 43 | + * vhost shadow virtqueue |
45 | - s->conf.macaddr.a[4] = (value & 0xff); | 44 | + * |
46 | - s->conf.macaddr.a[5] = (value & 0xff00) >> 8; | 45 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
47 | + stw_le_p(s->conf.macaddr.a + 4, value); | 46 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> |
48 | break; | 47 | + * |
49 | case REG_ADDR_LOW: /* MAC Address Low */ | 48 | + * SPDX-License-Identifier: GPL-2.0-or-later |
50 | - s->conf.macaddr.a[0] = (value & 0xff); | 49 | + */ |
51 | - s->conf.macaddr.a[1] = (value & 0xff00) >> 8; | 50 | + |
52 | - s->conf.macaddr.a[2] = (value & 0xff0000) >> 16; | 51 | +#include "qemu/osdep.h" |
53 | - s->conf.macaddr.a[3] = (value & 0xff000000) >> 24; | 52 | +#include "hw/virtio/vhost-shadow-virtqueue.h" |
54 | + stl_le_p(s->conf.macaddr.a, value); | 53 | + |
55 | break; | 54 | +#include "qemu/error-report.h" |
56 | case REG_TX_DMA_STA: /* Transmit DMA Status */ | 55 | + |
57 | case REG_TX_CUR_DESC: /* Transmit Current Descriptor */ | 56 | +/** |
57 | + * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
58 | + * shadow methods and file descriptors. | ||
59 | + * | ||
60 | + * Returns the new virtqueue or NULL. | ||
61 | + * | ||
62 | + * In case of error, reason is reported through error_report. | ||
63 | + */ | ||
64 | +VhostShadowVirtqueue *vhost_svq_new(void) | ||
65 | +{ | ||
66 | + g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
67 | + int r; | ||
68 | + | ||
69 | + r = event_notifier_init(&svq->hdev_kick, 0); | ||
70 | + if (r != 0) { | ||
71 | + error_report("Couldn't create kick event notifier: %s (%d)", | ||
72 | + g_strerror(errno), errno); | ||
73 | + goto err_init_hdev_kick; | ||
74 | + } | ||
75 | + | ||
76 | + r = event_notifier_init(&svq->hdev_call, 0); | ||
77 | + if (r != 0) { | ||
78 | + error_report("Couldn't create call event notifier: %s (%d)", | ||
79 | + g_strerror(errno), errno); | ||
80 | + goto err_init_hdev_call; | ||
81 | + } | ||
82 | + | ||
83 | + return g_steal_pointer(&svq); | ||
84 | + | ||
85 | +err_init_hdev_call: | ||
86 | + event_notifier_cleanup(&svq->hdev_kick); | ||
87 | + | ||
88 | +err_init_hdev_kick: | ||
89 | + return NULL; | ||
90 | +} | ||
91 | + | ||
92 | +/** | ||
93 | + * Free the resources of the shadow virtqueue. | ||
94 | + * | ||
95 | + * @pvq: gpointer to SVQ so it can be used by autofree functions. | ||
96 | + */ | ||
97 | +void vhost_svq_free(gpointer pvq) | ||
98 | +{ | ||
99 | + VhostShadowVirtqueue *vq = pvq; | ||
100 | + event_notifier_cleanup(&vq->hdev_kick); | ||
101 | + event_notifier_cleanup(&vq->hdev_call); | ||
102 | + g_free(vq); | ||
103 | +} | ||
104 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
105 | new file mode 100644 | ||
106 | index XXXXXXX..XXXXXXX | ||
107 | --- /dev/null | ||
108 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
109 | @@ -XXX,XX +XXX,XX @@ | ||
110 | +/* | ||
111 | + * vhost shadow virtqueue | ||
112 | + * | ||
113 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
114 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
115 | + * | ||
116 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
117 | + */ | ||
118 | + | ||
119 | +#ifndef VHOST_SHADOW_VIRTQUEUE_H | ||
120 | +#define VHOST_SHADOW_VIRTQUEUE_H | ||
121 | + | ||
122 | +#include "qemu/event_notifier.h" | ||
123 | + | ||
124 | +/* Shadow virtqueue to relay notifications */ | ||
125 | +typedef struct VhostShadowVirtqueue { | ||
126 | + /* Shadow kick notifier, sent to vhost */ | ||
127 | + EventNotifier hdev_kick; | ||
128 | + /* Shadow call notifier, sent to vhost */ | ||
129 | + EventNotifier hdev_call; | ||
130 | +} VhostShadowVirtqueue; | ||
131 | + | ||
132 | +VhostShadowVirtqueue *vhost_svq_new(void); | ||
133 | + | ||
134 | +void vhost_svq_free(gpointer vq); | ||
135 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
136 | + | ||
137 | +#endif | ||
58 | -- | 138 | -- |
59 | 2.5.0 | 139 | 2.7.4 |
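A usage sketch (not part of the series) of the lifecycle API introduced above: because
vhost_svq_free() takes a gpointer and the header defines a g_autoptr() cleanup for
VhostShadowVirtqueue, a caller can keep the queues in a GPtrArray and get error unwinding
for free. The real caller is hw/virtio/vhost-vdpa.c later in this series;
alloc_shadow_vqs_sketch() is a made-up name:

/*
 * Editor's sketch, not part of the series: holding a set of shadow
 * virtqueues in a GPtrArray and unwinding cleanly on failure.
 */
#include "qemu/osdep.h"
#include "hw/virtio/vhost-shadow-virtqueue.h"

static GPtrArray *alloc_shadow_vqs_sketch(unsigned int nvqs)
{
    g_autoptr(GPtrArray) svqs = g_ptr_array_new_full(nvqs, vhost_svq_free);

    for (unsigned int n = 0; n < nvqs; n++) {
        g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new();

        if (!svq) {
            /* queues added so far are freed by the array's free function */
            return NULL;
        }
        g_ptr_array_add(svqs, g_steal_pointer(&svq));
    }

    return g_steal_pointer(&svqs);
}

Keeping vhost_svq_free() with a gpointer signature is what lets the same routine serve both
as the GPtrArray element destructor and as the g_autoptr() cleanup function.
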
60 | 140 | ||
1 | 1 | From: Eugenio Pérez <eperezma@redhat.com> | |
2 | |||
3 | In this mode, no buffer forwarding will be performed by the SVQ: Qemu |
4 | will just forward the guest's kicks to the device. | ||
5 | |||
6 | Host memory notifier regions are left out for simplicity, and they will |
7 | not be addressed in this series. | ||
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
11 | Signed-off-by: Jason Wang <jasowang@redhat.com> | ||
12 | --- | ||
13 | hw/virtio/vhost-shadow-virtqueue.c | 55 ++++++++++++++ | ||
14 | hw/virtio/vhost-shadow-virtqueue.h | 14 ++++ | ||
15 | hw/virtio/vhost-vdpa.c | 144 ++++++++++++++++++++++++++++++++++++- | ||
16 | include/hw/virtio/vhost-vdpa.h | 4 ++ | ||
17 | 4 files changed, 215 insertions(+), 2 deletions(-) | ||
18 | |||
19 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/hw/virtio/vhost-shadow-virtqueue.c | ||
22 | +++ b/hw/virtio/vhost-shadow-virtqueue.c | ||
23 | @@ -XXX,XX +XXX,XX @@ | ||
24 | #include "hw/virtio/vhost-shadow-virtqueue.h" | ||
25 | |||
26 | #include "qemu/error-report.h" | ||
27 | +#include "qemu/main-loop.h" | ||
28 | +#include "linux-headers/linux/vhost.h" | ||
29 | + | ||
30 | +/** | ||
31 | + * Forward guest notifications. | ||
32 | + * | ||
33 | + * @n: guest kick event notifier, the one that guest set to notify svq. | ||
34 | + */ | ||
35 | +static void vhost_handle_guest_kick(EventNotifier *n) | ||
36 | +{ | ||
37 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); | ||
38 | + event_notifier_test_and_clear(n); | ||
39 | + event_notifier_set(&svq->hdev_kick); | ||
40 | +} | ||
41 | + | ||
42 | +/** | ||
43 | + * Set a new file descriptor for the guest to kick the SVQ and notify for avail | ||
44 | + * | ||
45 | + * @svq: The svq | ||
46 | + * @svq_kick_fd: The svq kick fd | ||
47 | + * | ||
48 | + * Note that the SVQ will never close the old file descriptor. | ||
49 | + */ | ||
50 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
51 | +{ | ||
52 | + EventNotifier *svq_kick = &svq->svq_kick; | ||
53 | + bool poll_stop = VHOST_FILE_UNBIND != event_notifier_get_fd(svq_kick); | ||
54 | + bool poll_start = svq_kick_fd != VHOST_FILE_UNBIND; | ||
55 | + | ||
56 | + if (poll_stop) { | ||
57 | + event_notifier_set_handler(svq_kick, NULL); | ||
58 | + } | ||
59 | + | ||
60 | + /* | ||
61 | + * event_notifier_set_handler already checks for guest's notifications if | ||
62 | + * they arrive at the new file descriptor in the switch, so there is no | ||
63 | + * need to explicitly check for them. | ||
64 | + */ | ||
65 | + if (poll_start) { | ||
66 | + event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
67 | + event_notifier_set(svq_kick); | ||
68 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
69 | + } | ||
70 | +} | ||
71 | + | ||
72 | +/** | ||
73 | + * Stop the shadow virtqueue operation. | ||
74 | + * @svq: Shadow Virtqueue | ||
75 | + */ | ||
76 | +void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
77 | +{ | ||
78 | + event_notifier_set_handler(&svq->svq_kick, NULL); | ||
79 | +} | ||
80 | |||
81 | /** | ||
82 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
83 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
84 | goto err_init_hdev_call; | ||
85 | } | ||
86 | |||
87 | + event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
88 | return g_steal_pointer(&svq); | ||
89 | |||
90 | err_init_hdev_call: | ||
91 | @@ -XXX,XX +XXX,XX @@ err_init_hdev_kick: | ||
92 | void vhost_svq_free(gpointer pvq) | ||
93 | { | ||
94 | VhostShadowVirtqueue *vq = pvq; | ||
95 | + vhost_svq_stop(vq); | ||
96 | event_notifier_cleanup(&vq->hdev_kick); | ||
97 | event_notifier_cleanup(&vq->hdev_call); | ||
98 | g_free(vq); | ||
99 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
102 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
104 | EventNotifier hdev_kick; | ||
105 | /* Shadow call notifier, sent to vhost */ | ||
106 | EventNotifier hdev_call; | ||
107 | + | ||
108 | + /* | ||
109 | + * Borrowed virtqueue's guest to host notifier. To borrow it in this event | ||
110 | + * notifier allows to recover the VhostShadowVirtqueue from the event loop | ||
111 | + * easily. If we use the VirtQueue's one, we don't have an easy way to | ||
112 | + * retrieve VhostShadowVirtqueue. | ||
113 | + * | ||
114 | + * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
115 | + */ | ||
116 | + EventNotifier svq_kick; | ||
117 | } VhostShadowVirtqueue; | ||
118 | |||
119 | +void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
120 | + | ||
121 | +void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
122 | + | ||
123 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
124 | |||
125 | void vhost_svq_free(gpointer vq); | ||
126 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
127 | index XXXXXXX..XXXXXXX 100644 | ||
128 | --- a/hw/virtio/vhost-vdpa.c | ||
129 | +++ b/hw/virtio/vhost-vdpa.c | ||
130 | @@ -XXX,XX +XXX,XX @@ | ||
131 | #include "hw/virtio/vhost.h" | ||
132 | #include "hw/virtio/vhost-backend.h" | ||
133 | #include "hw/virtio/virtio-net.h" | ||
134 | +#include "hw/virtio/vhost-shadow-virtqueue.h" | ||
135 | #include "hw/virtio/vhost-vdpa.h" | ||
136 | #include "exec/address-spaces.h" | ||
137 | #include "qemu/main-loop.h" | ||
138 | #include "cpu.h" | ||
139 | #include "trace.h" | ||
140 | #include "qemu-common.h" | ||
141 | +#include "qapi/error.h" | ||
142 | |||
143 | /* | ||
144 | * Return one past the end of the end of section. Be careful with uint64_t | ||
145 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) | ||
146 | return v->index != 0; | ||
147 | } | ||
148 | |||
149 | +static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
150 | + Error **errp) | ||
151 | +{ | ||
152 | + g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
153 | + | ||
154 | + if (!v->shadow_vqs_enabled) { | ||
155 | + return 0; | ||
156 | + } | ||
157 | + | ||
158 | + shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
159 | + for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
160 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
161 | + | ||
162 | + if (unlikely(!svq)) { | ||
163 | + error_setg(errp, "Cannot create svq %u", n); | ||
164 | + return -1; | ||
165 | + } | ||
166 | + g_ptr_array_add(shadow_vqs, g_steal_pointer(&svq)); | ||
167 | + } | ||
168 | + | ||
169 | + v->shadow_vqs = g_steal_pointer(&shadow_vqs); | ||
170 | + return 0; | ||
171 | +} | ||
172 | + | ||
173 | static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
174 | { | ||
175 | struct vhost_vdpa *v; | ||
176 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
177 | dev->opaque = opaque ; | ||
178 | v->listener = vhost_vdpa_memory_listener; | ||
179 | v->msg_type = VHOST_IOTLB_MSG_V2; | ||
180 | + ret = vhost_vdpa_init_svq(dev, v, errp); | ||
181 | + if (ret) { | ||
182 | + goto err; | ||
183 | + } | ||
184 | |||
185 | vhost_vdpa_get_iova_range(v); | ||
186 | |||
187 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init(struct vhost_dev *dev, void *opaque, Error **errp) | ||
188 | VIRTIO_CONFIG_S_DRIVER); | ||
189 | |||
190 | return 0; | ||
191 | + | ||
192 | +err: | ||
193 | + ram_block_discard_disable(false); | ||
194 | + return ret; | ||
195 | } | ||
196 | |||
197 | static void vhost_vdpa_host_notifier_uninit(struct vhost_dev *dev, | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_host_notifiers_uninit(struct vhost_dev *dev, int n) | ||
199 | |||
200 | static void vhost_vdpa_host_notifiers_init(struct vhost_dev *dev) | ||
201 | { | ||
202 | + struct vhost_vdpa *v = dev->opaque; | ||
203 | int i; | ||
204 | |||
205 | + if (v->shadow_vqs_enabled) { | ||
206 | + /* FIXME SVQ is not compatible with host notifiers mr */ | ||
207 | + return; | ||
208 | + } | ||
209 | + | ||
210 | for (i = dev->vq_index; i < dev->vq_index + dev->nvqs; i++) { | ||
211 | if (vhost_vdpa_host_notifier_init(dev, i)) { | ||
212 | goto err; | ||
213 | @@ -XXX,XX +XXX,XX @@ err: | ||
214 | return; | ||
215 | } | ||
216 | |||
217 | +static void vhost_vdpa_svq_cleanup(struct vhost_dev *dev) | ||
218 | +{ | ||
219 | + struct vhost_vdpa *v = dev->opaque; | ||
220 | + size_t idx; | ||
221 | + | ||
222 | + if (!v->shadow_vqs) { | ||
223 | + return; | ||
224 | + } | ||
225 | + | ||
226 | + for (idx = 0; idx < v->shadow_vqs->len; ++idx) { | ||
227 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, idx)); | ||
228 | + } | ||
229 | + g_ptr_array_free(v->shadow_vqs, true); | ||
230 | +} | ||
231 | + | ||
232 | static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
233 | { | ||
234 | struct vhost_vdpa *v; | ||
235 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_cleanup(struct vhost_dev *dev) | ||
236 | trace_vhost_vdpa_cleanup(dev, v); | ||
237 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
238 | memory_listener_unregister(&v->listener); | ||
239 | + vhost_vdpa_svq_cleanup(dev); | ||
240 | |||
241 | dev->opaque = NULL; | ||
242 | ram_block_discard_disable(false); | ||
243 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_device_id(struct vhost_dev *dev, | ||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | +static void vhost_vdpa_reset_svq(struct vhost_vdpa *v) | ||
248 | +{ | ||
249 | + if (!v->shadow_vqs_enabled) { | ||
250 | + return; | ||
251 | + } | ||
252 | + | ||
253 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
254 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
255 | + vhost_svq_stop(svq); | ||
256 | + } | ||
257 | +} | ||
258 | + | ||
259 | static int vhost_vdpa_reset_device(struct vhost_dev *dev) | ||
260 | { | ||
261 | + struct vhost_vdpa *v = dev->opaque; | ||
262 | int ret; | ||
263 | uint8_t status = 0; | ||
264 | |||
265 | + vhost_vdpa_reset_svq(v); | ||
266 | + | ||
267 | ret = vhost_vdpa_call(dev, VHOST_VDPA_SET_STATUS, &status); | ||
268 | trace_vhost_vdpa_reset_device(dev, status); | ||
269 | return ret; | ||
270 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, | ||
271 | return ret; | ||
272 | } | ||
273 | |||
274 | +static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
275 | + struct vhost_vring_file *file) | ||
276 | +{ | ||
277 | + trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
278 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
279 | +} | ||
280 | + | ||
281 | +/** | ||
282 | + * Set the shadow virtqueue descriptors to the device | ||
283 | + * | ||
284 | + * @dev: The vhost device model | ||
285 | + * @svq: The shadow virtqueue | ||
286 | + * @idx: The index of the virtqueue in the vhost device | ||
287 | + * @errp: Error | ||
288 | + */ | ||
289 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
290 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
291 | + Error **errp) | ||
292 | +{ | ||
293 | + struct vhost_vring_file file = { | ||
294 | + .index = dev->vq_index + idx, | ||
295 | + }; | ||
296 | + const EventNotifier *event_notifier = &svq->hdev_kick; | ||
297 | + int r; | ||
298 | + | ||
299 | + file.fd = event_notifier_get_fd(event_notifier); | ||
300 | + r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
301 | + if (unlikely(r != 0)) { | ||
302 | + error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
303 | + } | ||
304 | + | ||
305 | + return r == 0; | ||
306 | +} | ||
307 | + | ||
308 | +static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) | ||
309 | +{ | ||
310 | + struct vhost_vdpa *v = dev->opaque; | ||
311 | + Error *err = NULL; | ||
312 | + unsigned i; | ||
313 | + | ||
314 | + if (!v->shadow_vqs) { | ||
315 | + return true; | ||
316 | + } | ||
317 | + | ||
318 | + for (i = 0; i < v->shadow_vqs->len; ++i) { | ||
319 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
320 | + bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); | ||
321 | + if (unlikely(!ok)) { | ||
322 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
323 | + return false; | ||
324 | + } | ||
325 | + } | ||
326 | + | ||
327 | + return true; | ||
328 | +} | ||
329 | + | ||
330 | static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
331 | { | ||
332 | struct vhost_vdpa *v = dev->opaque; | ||
333 | + bool ok; | ||
334 | trace_vhost_vdpa_dev_start(dev, started); | ||
335 | |||
336 | if (started) { | ||
337 | vhost_vdpa_host_notifiers_init(dev); | ||
338 | + ok = vhost_vdpa_svqs_start(dev); | ||
339 | + if (unlikely(!ok)) { | ||
340 | + return -1; | ||
341 | + } | ||
342 | vhost_vdpa_set_vring_ready(dev); | ||
343 | } else { | ||
344 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
345 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
346 | static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, | ||
347 | struct vhost_vring_file *file) | ||
348 | { | ||
349 | - trace_vhost_vdpa_set_vring_kick(dev, file->index, file->fd); | ||
350 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
351 | + struct vhost_vdpa *v = dev->opaque; | ||
352 | + int vdpa_idx = file->index - dev->vq_index; | ||
353 | + | ||
354 | + if (v->shadow_vqs_enabled) { | ||
355 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); | ||
356 | + vhost_svq_set_svq_kick_fd(svq, file->fd); | ||
357 | + return 0; | ||
358 | + } else { | ||
359 | + return vhost_vdpa_set_vring_dev_kick(dev, file); | ||
360 | + } | ||
361 | } | ||
362 | |||
363 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
364 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
365 | index XXXXXXX..XXXXXXX 100644 | ||
366 | --- a/include/hw/virtio/vhost-vdpa.h | ||
367 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
368 | @@ -XXX,XX +XXX,XX @@ | ||
369 | #ifndef HW_VIRTIO_VHOST_VDPA_H | ||
370 | #define HW_VIRTIO_VHOST_VDPA_H | ||
371 | |||
372 | +#include <gmodule.h> | ||
373 | + | ||
374 | #include "hw/virtio/virtio.h" | ||
375 | #include "standard-headers/linux/vhost_types.h" | ||
376 | |||
377 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
378 | bool iotlb_batch_begin_sent; | ||
379 | MemoryListener listener; | ||
380 | struct vhost_vdpa_iova_range iova_range; | ||
381 | + bool shadow_vqs_enabled; | ||
382 | + GPtrArray *shadow_vqs; | ||
383 | struct vhost_dev *dev; | ||
384 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; | ||
385 | } VhostVDPA; | ||
386 | -- | ||
387 | 2.7.4 | ||
388 | |||
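The kick forwarding added above boils down to relaying one eventfd into another from the
main loop. The following stripped-down sketch shows that relay with the same event_notifier
API the patch uses; KickRelay, relay_guest_kick() and kick_relay_start() are illustrative
names, not code from the series:

/*
 * Stripped-down sketch of the relay pattern used above; device_kick is
 * assumed to have been set up elsewhere with event_notifier_init() and
 * handed to the vhost device as its kick file descriptor.
 */
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#include "qemu/event_notifier.h"

typedef struct KickRelay {
    EventNotifier guest_kick;   /* guest -> qemu (borrowed fd)     */
    EventNotifier device_kick;  /* qemu -> vhost device (owned fd) */
} KickRelay;

static void relay_guest_kick(EventNotifier *n)
{
    KickRelay *r = container_of(n, KickRelay, guest_kick);

    event_notifier_test_and_clear(n);     /* consume the guest's kick */
    event_notifier_set(&r->device_kick);  /* and re-signal the device */
}

static void kick_relay_start(KickRelay *r, int guest_kick_fd)
{
    event_notifier_init_fd(&r->guest_kick, guest_kick_fd);
    /* do not lose a kick that raced with the fd switch */
    event_notifier_set(&r->guest_kick);
    event_notifier_set_handler(&r->guest_kick, relay_guest_kick);
}

The event_notifier_set() before installing the handler mirrors vhost_svq_set_svq_kick_fd()
above: a kick that arrived while the descriptor was being switched is replayed rather than
lost.
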
1 | From: Zhang Chen <chen.zhang@intel.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The "compare_timeout" determines the maximum time to hold the primary net packet. | 3 | This will make qemu aware of the device used buffers, allowing it to |
4 | This patch expose the "compare_timeout", make user have ability to | 4 | write the guest memory with its contents if needed. |
5 | adjest the value according to application scenarios. | ||
6 | 5 | ||
7 | QMP command demo: | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
8 | { "execute": "qom-get", | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
9 | "arguments": { "path": "/objects/comp0", | ||
10 | "property": "compare_timeout" } } | ||
11 | |||
12 | { "execute": "qom-set", | ||
13 | "arguments": { "path": "/objects/comp0", | ||
14 | "property": "compare_timeout", | ||
15 | "value": 5000} } | ||
16 | |||
17 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | ||
18 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
19 | --- | 9 | --- |
20 | net/colo-compare.c | 47 +++++++++++++++++++++++++++++++++++++++++++++-- | 10 | hw/virtio/vhost-shadow-virtqueue.c | 38 ++++++++++++++++++++++++++++++++++++++ |
21 | qemu-options.hx | 8 +++++--- | 11 | hw/virtio/vhost-shadow-virtqueue.h | 4 ++++ |
22 | 2 files changed, 50 insertions(+), 5 deletions(-) | 12 | hw/virtio/vhost-vdpa.c | 31 +++++++++++++++++++++++++++++-- |
13 | 3 files changed, 71 insertions(+), 2 deletions(-) | ||
23 | 14 | ||
24 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 15 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
25 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
26 | --- a/net/colo-compare.c | 17 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
27 | +++ b/net/colo-compare.c | 18 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
28 | @@ -XXX,XX +XXX,XX @@ static NotifierList colo_compare_notifiers = | 19 | @@ -XXX,XX +XXX,XX @@ static void vhost_handle_guest_kick(EventNotifier *n) |
29 | |||
30 | /* TODO: Should be configurable */ | ||
31 | #define REGULAR_PACKET_CHECK_MS 3000 | ||
32 | +#define DEFAULT_TIME_OUT_MS 3000 | ||
33 | |||
34 | static QemuMutex event_mtx; | ||
35 | static QemuCond event_complete_cond; | ||
36 | @@ -XXX,XX +XXX,XX @@ typedef struct CompareState { | ||
37 | SocketReadState sec_rs; | ||
38 | SocketReadState notify_rs; | ||
39 | bool vnet_hdr; | ||
40 | + uint32_t compare_timeout; | ||
41 | |||
42 | /* | ||
43 | * Record the connection that through the NIC | ||
44 | @@ -XXX,XX +XXX,XX @@ static int colo_old_packet_check_one_conn(Connection *conn, | ||
45 | CompareState *s) | ||
46 | { | ||
47 | GList *result = NULL; | ||
48 | - int64_t check_time = REGULAR_PACKET_CHECK_MS; | ||
49 | |||
50 | result = g_queue_find_custom(&conn->primary_list, | ||
51 | - &check_time, | ||
52 | + &s->compare_timeout, | ||
53 | (GCompareFunc)colo_old_packet_check_one); | ||
54 | |||
55 | if (result) { | ||
56 | @@ -XXX,XX +XXX,XX @@ static void compare_set_notify_dev(Object *obj, const char *value, Error **errp) | ||
57 | s->notify_dev = g_strdup(value); | ||
58 | } | 20 | } |
59 | 21 | ||
60 | +static void compare_get_timeout(Object *obj, Visitor *v, | 22 | /** |
61 | + const char *name, void *opaque, | 23 | + * Forward vhost notifications |
62 | + Error **errp) | 24 | + * |
25 | + * @n: hdev call event notifier, the one that device set to notify svq. | ||
26 | + */ | ||
27 | +static void vhost_svq_handle_call(EventNotifier *n) | ||
63 | +{ | 28 | +{ |
64 | + CompareState *s = COLO_COMPARE(obj); | 29 | + VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, |
65 | + uint32_t value = s->compare_timeout; | 30 | + hdev_call); |
66 | + | 31 | + event_notifier_test_and_clear(n); |
67 | + visit_type_uint32(v, name, &value, errp); | 32 | + event_notifier_set(&svq->svq_call); |
68 | +} | 33 | +} |
69 | + | 34 | + |
70 | +static void compare_set_timeout(Object *obj, Visitor *v, | 35 | +/** |
71 | + const char *name, void *opaque, | 36 | + * Set the call notifier for the SVQ to call the guest |
72 | + Error **errp) | 37 | + * |
38 | + * @svq: Shadow virtqueue | ||
39 | + * @call_fd: call notifier | ||
40 | + * | ||
41 | + * Called on BQL context. | ||
42 | + */ | ||
43 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
73 | +{ | 44 | +{ |
74 | + CompareState *s = COLO_COMPARE(obj); | 45 | + if (call_fd == VHOST_FILE_UNBIND) { |
75 | + Error *local_err = NULL; | 46 | + /* |
76 | + uint32_t value; | 47 | + * Fail event_notifier_set if called handling device call. |
77 | + | 48 | + * |
78 | + visit_type_uint32(v, name, &value, &local_err); | 49 | + * SVQ still needs device notifications, since it needs to keep |
79 | + if (local_err) { | 50 | + * forwarding used buffers even with the unbind. |
80 | + goto out; | 51 | + */ |
52 | + memset(&svq->svq_call, 0, sizeof(svq->svq_call)); | ||
53 | + } else { | ||
54 | + event_notifier_init_fd(&svq->svq_call, call_fd); | ||
81 | + } | 55 | + } |
82 | + if (!value) { | ||
83 | + error_setg(&local_err, "Property '%s.%s' requires a positive value", | ||
84 | + object_get_typename(obj), name); | ||
85 | + goto out; | ||
86 | + } | ||
87 | + s->compare_timeout = value; | ||
88 | + | ||
89 | +out: | ||
90 | + error_propagate(errp, local_err); | ||
91 | +} | 56 | +} |
92 | + | 57 | + |
93 | static void compare_pri_rs_finalize(SocketReadState *pri_rs) | 58 | +/** |
94 | { | 59 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail |
95 | CompareState *s = container_of(pri_rs, CompareState, pri_rs); | 60 | * |
96 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | 61 | * @svq: The svq |
97 | return; | 62 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) |
98 | } | 63 | } |
99 | 64 | ||
100 | + if (!s->compare_timeout) { | 65 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); |
101 | + /* Set default value to 3000 MS */ | 66 | + event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); |
102 | + s->compare_timeout = DEFAULT_TIME_OUT_MS; | 67 | return g_steal_pointer(&svq); |
68 | |||
69 | err_init_hdev_call: | ||
70 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_free(gpointer pvq) | ||
71 | VhostShadowVirtqueue *vq = pvq; | ||
72 | vhost_svq_stop(vq); | ||
73 | event_notifier_cleanup(&vq->hdev_kick); | ||
74 | + event_notifier_set_handler(&vq->hdev_call, NULL); | ||
75 | event_notifier_cleanup(&vq->hdev_call); | ||
76 | g_free(vq); | ||
77 | } | ||
78 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
79 | index XXXXXXX..XXXXXXX 100644 | ||
80 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
81 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
83 | * So shadow virtqueue must not clean it, or we would lose VirtQueue one. | ||
84 | */ | ||
85 | EventNotifier svq_kick; | ||
86 | + | ||
87 | + /* Guest's call notifier, where the SVQ calls guest. */ | ||
88 | + EventNotifier svq_call; | ||
89 | } VhostShadowVirtqueue; | ||
90 | |||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | +void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
95 | |||
96 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
97 | index XXXXXXX..XXXXXXX 100644 | ||
98 | --- a/hw/virtio/vhost-vdpa.c | ||
99 | +++ b/hw/virtio/vhost-vdpa.c | ||
100 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
101 | return vhost_vdpa_call(dev, VHOST_SET_VRING_KICK, file); | ||
102 | } | ||
103 | |||
104 | +static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
105 | + struct vhost_vring_file *file) | ||
106 | +{ | ||
107 | + trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
108 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
109 | +} | ||
110 | + | ||
111 | /** | ||
112 | * Set the shadow virtqueue descriptors to the device | ||
113 | * | ||
114 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
115 | * @svq: The shadow virtqueue | ||
116 | * @idx: The index of the virtqueue in the vhost device | ||
117 | * @errp: Error | ||
118 | + * | ||
119 | + * Note that this function does not rewind kick file descriptor if cannot set | ||
120 | + * call one. | ||
121 | */ | ||
122 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
123 | VhostShadowVirtqueue *svq, unsigned idx, | ||
124 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
125 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
126 | if (unlikely(r != 0)) { | ||
127 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
128 | + return false; | ||
103 | + } | 129 | + } |
104 | + | 130 | + |
105 | if (find_and_check_chardev(&chr, s->pri_indev, errp) || | 131 | + event_notifier = &svq->hdev_call; |
106 | !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) { | 132 | + file.fd = event_notifier_get_fd(event_notifier); |
107 | return; | 133 | + r = vhost_vdpa_set_vring_dev_call(dev, &file); |
108 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_init(Object *obj) | 134 | + if (unlikely(r != 0)) { |
109 | compare_get_notify_dev, compare_set_notify_dev, | 135 | + error_setg_errno(errp, -r, "Can't set device call fd"); |
110 | NULL); | 136 | } |
111 | 137 | ||
112 | + object_property_add(obj, "compare_timeout", "uint32", | 138 | return r == 0; |
113 | + compare_get_timeout, | 139 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_kick(struct vhost_dev *dev, |
114 | + compare_set_timeout, NULL, NULL, NULL); | 140 | static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, |
141 | struct vhost_vring_file *file) | ||
142 | { | ||
143 | - trace_vhost_vdpa_set_vring_call(dev, file->index, file->fd); | ||
144 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
145 | + struct vhost_vdpa *v = dev->opaque; | ||
115 | + | 146 | + |
116 | s->vnet_hdr = false; | 147 | + if (v->shadow_vqs_enabled) { |
117 | object_property_add_bool(obj, "vnet_hdr_support", compare_get_vnet_hdr, | 148 | + int vdpa_idx = file->index - dev->vq_index; |
118 | compare_set_vnet_hdr, NULL); | 149 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, vdpa_idx); |
119 | diff --git a/qemu-options.hx b/qemu-options.hx | 150 | + |
120 | index XXXXXXX..XXXXXXX 100644 | 151 | + vhost_svq_set_svq_call_fd(svq, file->fd); |
121 | --- a/qemu-options.hx | 152 | + return 0; |
122 | +++ b/qemu-options.hx | 153 | + } else { |
123 | @@ -XXX,XX +XXX,XX @@ SRST | 154 | + return vhost_vdpa_set_vring_dev_call(dev, file); |
124 | stored. The file format is libpcap, so it can be analyzed with | 155 | + } |
125 | tools such as tcpdump or Wireshark. | 156 | } |
126 | 157 | ||
127 | - ``-object colo-compare,id=id,primary_in=chardevid,secondary_in=chardevid,outdev=chardevid,iothread=id[,vnet_hdr_support][,notify_dev=id]`` | 158 | static int vhost_vdpa_get_features(struct vhost_dev *dev, |
128 | + ``-object colo-compare,id=id,primary_in=chardevid,secondary_in=chardevid,outdev=chardevid,iothread=id[,vnet_hdr_support][,notify_dev=id][,compare_timeout=@var{ms}]`` | ||
129 | Colo-compare gets packet from primary\_inchardevid and | ||
130 | secondary\_inchardevid, than compare primary packet with | ||
131 | secondary packet. If the packets are same, we will output | ||
132 | @@ -XXX,XX +XXX,XX @@ SRST | ||
133 | outdevchardevid. In order to improve efficiency, we need to put | ||
134 | the task of comparison in another thread. If it has the | ||
135 | vnet\_hdr\_support flag, colo compare will send/recv packet with | ||
136 | - vnet\_hdr\_len. If you want to use Xen COLO, will need the | ||
137 | - notify\_dev to notify Xen colo-frame to do checkpoint. | ||
138 | + vnet\_hdr\_len. Then compare\_timeout=@var{ms} determines the | ||
139 | + maximum delay colo-compare wait for the packet. | ||
140 | + If you want to use Xen COLO, will need the notify\_dev to | ||
141 | + notify Xen colo-frame to do checkpoint. | ||
142 | |||
143 | we must use it with the help of filter-mirror and | ||
144 | filter-redirector. | ||
145 | -- | 159 | -- |
146 | 2.5.0 | 160 | 2.7.4 |
147 | 161 | ||
148 | 162 | diff view generated by jsdifflib |
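
A side note on the kick/call forwarding wired up above: the shadow virtqueue sits between the guest's notification file descriptors and the device's, and at this stage it simply re-signals one side when the other fires. The following stand-alone sketch (not QEMU code; it only assumes Linux eventfd, and the variable names are invented) illustrates that proxy pattern in isolation.

#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
    uint64_t one = 1, v;
    int guest_kick = eventfd(0, 0);   /* written by the "guest" */
    int device_kick = eventfd(0, 0);  /* read by the "device" */

    write(guest_kick, &one, sizeof(one));    /* guest kicks */

    read(guest_kick, &v, sizeof(v));         /* proxy wakes up... */
    write(device_kick, &one, sizeof(one));   /* ...and re-kicks the device */

    read(device_kick, &v, sizeof(v));
    printf("device saw %llu kick(s)\n", (unsigned long long)v);

    close(guest_kick);
    close(device_kick);
    return 0;
}

vhost_vdpa_set_vring_call() above does the equivalent rewiring for the other direction: with shadow_vqs_enabled the guest's call fd is handed to the SVQ rather than to the device, so the device keeps signalling SVQ's own hdev_call notifier.
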
1 | From: Andrew Melnychenko <andrew@daynix.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Buglink: https://bugzilla.redhat.com/show_bug.cgi?id=1737400 | 3 | This allows SVQ to negotiate features with the guest and the device. For |
4 | Fixed setting max_queue_num when there are no peers in | 4 | the device, SVQ is a driver. While this function bypasses all |
5 | NICConf: qemu_new_nic() creates a NICState with one NetClientState (index | 5 | non-transport features, it needs to disable the features that SVQ does |
6 | 0) and no peers, so set max_queue_num to 0. This prevents undefined | 6 | not support when forwarding buffers. This includes packed vq layout, |
7 | behavior and possible crashes, especially during PCIe hotplug. | 7 | indirect descriptors or event idx. |
8 | 8 | ||
9 | Fixes: 6f3fbe4ed06 ("net: Introduce e1000e device emulation") | 9 | Future changes can add support to offer more features to the guest, |
10 | Signed-off-by: Andrew Melnychenko <andrew@daynix.com> | 10 | since the use of VirtQueue gives this for free. This is left out at the |
11 | Reviewed-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 11 | moment for simplicity. |
12 | Reviewed-by: Dmitry Fleytman <dmitry.fleytman@gmail.com> | 12 | |
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
13 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
14 | --- | 16 | --- |
15 | hw/net/e1000e.c | 2 +- | 17 | hw/virtio/vhost-shadow-virtqueue.c | 44 ++++++++++++++++++++++++++++++++++++++ |
16 | 1 file changed, 1 insertion(+), 1 deletion(-) | 18 | hw/virtio/vhost-shadow-virtqueue.h | 2 ++ |
19 | hw/virtio/vhost-vdpa.c | 15 +++++++++++++ | ||
20 | 3 files changed, 61 insertions(+) | ||
17 | 21 | ||
18 | diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c | 22 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
19 | index XXXXXXX..XXXXXXX 100644 | 23 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/hw/net/e1000e.c | 24 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
21 | +++ b/hw/net/e1000e.c | 25 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
22 | @@ -XXX,XX +XXX,XX @@ e1000e_init_net_peer(E1000EState *s, PCIDevice *pci_dev, uint8_t *macaddr) | 26 | @@ -XXX,XX +XXX,XX @@ |
23 | s->nic = qemu_new_nic(&net_e1000e_info, &s->conf, | 27 | #include "hw/virtio/vhost-shadow-virtqueue.h" |
24 | object_get_typename(OBJECT(s)), dev->id, s); | 28 | |
25 | 29 | #include "qemu/error-report.h" | |
26 | - s->core.max_queue_num = s->conf.peers.queues - 1; | 30 | +#include "qapi/error.h" |
27 | + s->core.max_queue_num = s->conf.peers.queues ? s->conf.peers.queues - 1 : 0; | 31 | #include "qemu/main-loop.h" |
28 | 32 | #include "linux-headers/linux/vhost.h" | |
29 | trace_e1000e_mac_set_permanent(MAC_ARG(macaddr)); | 33 | |
30 | memcpy(s->core.permanent_mac, macaddr, sizeof(s->core.permanent_mac)); | 34 | /** |
35 | + * Validate the transport device features that both guests can use with the SVQ | ||
36 | + * and SVQs can use with the device. | ||
37 | + * | ||
38 | + * @dev_features: The features | ||
39 | + * @errp: Error pointer | ||
40 | + */ | ||
41 | +bool vhost_svq_valid_features(uint64_t features, Error **errp) | ||
42 | +{ | ||
43 | + bool ok = true; | ||
44 | + uint64_t svq_features = features; | ||
45 | + | ||
46 | + for (uint64_t b = VIRTIO_TRANSPORT_F_START; b <= VIRTIO_TRANSPORT_F_END; | ||
47 | + ++b) { | ||
48 | + switch (b) { | ||
49 | + case VIRTIO_F_ANY_LAYOUT: | ||
50 | + continue; | ||
51 | + | ||
52 | + case VIRTIO_F_ACCESS_PLATFORM: | ||
53 | + /* SVQ trust in the host's IOMMU to translate addresses */ | ||
54 | + case VIRTIO_F_VERSION_1: | ||
55 | + /* SVQ trust that the guest vring is little endian */ | ||
56 | + if (!(svq_features & BIT_ULL(b))) { | ||
57 | + svq_features |= BIT_ULL(b); | ||
58 | + ok = false; | ||
59 | + } | ||
60 | + continue; | ||
61 | + | ||
62 | + default: | ||
63 | + if (svq_features & BIT_ULL(b)) { | ||
64 | + svq_features &= ~BIT_ULL(b); | ||
65 | + ok = false; | ||
66 | + } | ||
67 | + } | ||
68 | + } | ||
69 | + | ||
70 | + if (!ok) { | ||
71 | + error_setg(errp, "SVQ Invalid device feature flags, offer: 0x%"PRIx64 | ||
72 | + ", ok: 0x%"PRIx64, features, svq_features); | ||
73 | + } | ||
74 | + return ok; | ||
75 | +} | ||
76 | + | ||
77 | +/** | ||
78 | * Forward guest notifications. | ||
79 | * | ||
80 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
81 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
82 | index XXXXXXX..XXXXXXX 100644 | ||
83 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
84 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
85 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
86 | EventNotifier svq_call; | ||
87 | } VhostShadowVirtqueue; | ||
88 | |||
89 | +bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
90 | + | ||
91 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); | ||
92 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); | ||
93 | |||
94 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/hw/virtio/vhost-vdpa.c | ||
97 | +++ b/hw/virtio/vhost-vdpa.c | ||
98 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
99 | Error **errp) | ||
100 | { | ||
101 | g_autoptr(GPtrArray) shadow_vqs = NULL; | ||
102 | + uint64_t dev_features, svq_features; | ||
103 | + int r; | ||
104 | + bool ok; | ||
105 | |||
106 | if (!v->shadow_vqs_enabled) { | ||
107 | return 0; | ||
108 | } | ||
109 | |||
110 | + r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
111 | + if (r != 0) { | ||
112 | + error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
113 | + return r; | ||
114 | + } | ||
115 | + | ||
116 | + svq_features = dev_features; | ||
117 | + ok = vhost_svq_valid_features(svq_features, errp); | ||
118 | + if (unlikely(!ok)) { | ||
119 | + return -1; | ||
120 | + } | ||
121 | + | ||
122 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
123 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
124 | g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
31 | -- | 125 | -- |
32 | 2.5.0 | 126 | 2.7.4 |
33 | 127 | ||
34 | 128 | diff view generated by jsdifflib |
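
To make the effect of vhost_svq_valid_features() above concrete, here is a stand-alone sketch, not taken from the patch, of the same kind of transport-feature screening. The EX_* names and the main() scenario are invented for the example; the feature bit numbers are the standard virtio ones.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define BIT64(n) (1ULL << (n))

/* Standard virtio feature bit numbers. */
enum {
    EX_VIRTIO_RING_F_INDIRECT_DESC = 28,
    EX_VIRTIO_RING_F_EVENT_IDX     = 29,
    EX_VIRTIO_F_VERSION_1          = 32,
    EX_VIRTIO_F_ACCESS_PLATFORM    = 33,
    EX_VIRTIO_F_RING_PACKED        = 34,
};

/* True if a shadow vq could forward buffers with exactly these features. */
static bool ex_svq_features_ok(uint64_t features)
{
    /* SVQ relies on a little-endian split ring behind the host IOMMU... */
    if (!(features & BIT64(EX_VIRTIO_F_VERSION_1)) ||
        !(features & BIT64(EX_VIRTIO_F_ACCESS_PLATFORM))) {
        return false;
    }
    /* ...and does not emulate packed rings, event idx or indirect descs. */
    if (features & (BIT64(EX_VIRTIO_F_RING_PACKED) |
                    BIT64(EX_VIRTIO_RING_F_EVENT_IDX) |
                    BIT64(EX_VIRTIO_RING_F_INDIRECT_DESC))) {
        return false;
    }
    return true;
}

int main(void)
{
    uint64_t offered = BIT64(EX_VIRTIO_F_VERSION_1) |
                       BIT64(EX_VIRTIO_F_ACCESS_PLATFORM) |
                       BIT64(EX_VIRTIO_RING_F_EVENT_IDX);

    printf("offered 0x%llx -> %s\n", (unsigned long long)offered,
           ex_svq_features_ok(offered) ? "usable" : "rejected (event idx)");
    return 0;
}
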
1 | From: Zhang Chen <chen.zhang@intel.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The "expired_scan_cycle" determines period of scanning expired | 3 | It reports the shadow virtqueue address from qemu virtual address space. |
4 | primary node net packets. | ||
5 | 4 | ||
6 | Signed-off-by: Zhang Chen <chen.zhang@intel.com> | 5 | Since this will be different from the guest's vaddr, but the device can |
6 | access it, SVQ takes special care about its alignment & lack of garbage | ||
7 | data. It assumes that IOMMU will work in host_page_size ranges for that. | ||
8 | |||
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
7 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
8 | --- | 12 | --- |
9 | net/colo-compare.c | 48 +++++++++++++++++++++++++++++++++++++++++++++--- | 13 | hw/virtio/vhost-shadow-virtqueue.c | 29 +++++++++++++++++++++++++++++ |
10 | qemu-options.hx | 4 +++- | 14 | hw/virtio/vhost-shadow-virtqueue.h | 9 +++++++++ |
11 | 2 files changed, 48 insertions(+), 4 deletions(-) | 15 | 2 files changed, 38 insertions(+) |
12 | 16 | ||
13 | diff --git a/net/colo-compare.c b/net/colo-compare.c | 17 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c |
14 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/net/colo-compare.c | 19 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
16 | +++ b/net/colo-compare.c | 20 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
17 | @@ -XXX,XX +XXX,XX @@ static NotifierList colo_compare_notifiers = | 21 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) |
18 | #define COLO_COMPARE_FREE_PRIMARY 0x01 | ||
19 | #define COLO_COMPARE_FREE_SECONDARY 0x02 | ||
20 | |||
21 | -/* TODO: Should be configurable */ | ||
22 | #define REGULAR_PACKET_CHECK_MS 3000 | ||
23 | #define DEFAULT_TIME_OUT_MS 3000 | ||
24 | |||
25 | @@ -XXX,XX +XXX,XX @@ typedef struct CompareState { | ||
26 | SocketReadState notify_rs; | ||
27 | bool vnet_hdr; | ||
28 | uint32_t compare_timeout; | ||
29 | + uint32_t expired_scan_cycle; | ||
30 | |||
31 | /* | ||
32 | * Record the connection that through the NIC | ||
33 | @@ -XXX,XX +XXX,XX @@ static void check_old_packet_regular(void *opaque) | ||
34 | /* if have old packet we will notify checkpoint */ | ||
35 | colo_old_packet_check(s); | ||
36 | timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + | ||
37 | - REGULAR_PACKET_CHECK_MS); | ||
38 | + s->expired_scan_cycle); | ||
39 | } | 22 | } |
40 | 23 | ||
41 | /* Public API, Used for COLO frame to notify compare event */ | 24 | /** |
42 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_timer_init(CompareState *s) | 25 | + * Get the shadow vq vring address. |
43 | SCALE_MS, check_old_packet_regular, | 26 | + * @svq: Shadow virtqueue |
44 | s); | 27 | + * @addr: Destination to store address |
45 | timer_mod(s->packet_check_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + | 28 | + */ |
46 | - REGULAR_PACKET_CHECK_MS); | 29 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, |
47 | + s->expired_scan_cycle); | 30 | + struct vhost_vring_addr *addr) |
48 | } | ||
49 | |||
50 | static void colo_compare_timer_del(CompareState *s) | ||
51 | @@ -XXX,XX +XXX,XX @@ out: | ||
52 | error_propagate(errp, local_err); | ||
53 | } | ||
54 | |||
55 | +static void compare_get_expired_scan_cycle(Object *obj, Visitor *v, | ||
56 | + const char *name, void *opaque, | ||
57 | + Error **errp) | ||
58 | +{ | 31 | +{ |
59 | + CompareState *s = COLO_COMPARE(obj); | 32 | + addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; |
60 | + uint32_t value = s->expired_scan_cycle; | 33 | + addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; |
61 | + | 34 | + addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; |
62 | + visit_type_uint32(v, name, &value, errp); | ||
63 | +} | 35 | +} |
64 | + | 36 | + |
65 | +static void compare_set_expired_scan_cycle(Object *obj, Visitor *v, | 37 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) |
66 | + const char *name, void *opaque, | ||
67 | + Error **errp) | ||
68 | +{ | 38 | +{ |
69 | + CompareState *s = COLO_COMPARE(obj); | 39 | + size_t desc_size = sizeof(vring_desc_t) * svq->vring.num; |
70 | + Error *local_err = NULL; | 40 | + size_t avail_size = offsetof(vring_avail_t, ring) + |
71 | + uint32_t value; | 41 | + sizeof(uint16_t) * svq->vring.num; |
72 | + | 42 | + |
73 | + visit_type_uint32(v, name, &value, &local_err); | 43 | + return ROUND_UP(desc_size + avail_size, qemu_real_host_page_size); |
74 | + if (local_err) { | ||
75 | + goto out; | ||
76 | + } | ||
77 | + if (!value) { | ||
78 | + error_setg(&local_err, "Property '%s.%s' requires a positive value", | ||
79 | + object_get_typename(obj), name); | ||
80 | + goto out; | ||
81 | + } | ||
82 | + s->expired_scan_cycle = value; | ||
83 | + | ||
84 | +out: | ||
85 | + error_propagate(errp, local_err); | ||
86 | +} | 44 | +} |
87 | + | 45 | + |
88 | static void compare_pri_rs_finalize(SocketReadState *pri_rs) | 46 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq) |
89 | { | 47 | +{ |
90 | CompareState *s = container_of(pri_rs, CompareState, pri_rs); | 48 | + size_t used_size = offsetof(vring_used_t, ring) + |
91 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) | 49 | + sizeof(vring_used_elem_t) * svq->vring.num; |
92 | s->compare_timeout = DEFAULT_TIME_OUT_MS; | 50 | + return ROUND_UP(used_size, qemu_real_host_page_size); |
93 | } | 51 | +} |
94 | |||
95 | + if (!s->expired_scan_cycle) { | ||
96 | + /* Set default value to 3000 MS */ | ||
97 | + s->expired_scan_cycle = REGULAR_PACKET_CHECK_MS; | ||
98 | + } | ||
99 | + | 52 | + |
100 | if (find_and_check_chardev(&chr, s->pri_indev, errp) || | 53 | +/** |
101 | !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) { | 54 | * Set a new file descriptor for the guest to kick the SVQ and notify for avail |
102 | return; | 55 | * |
103 | @@ -XXX,XX +XXX,XX @@ static void colo_compare_init(Object *obj) | 56 | * @svq: The svq |
104 | compare_get_timeout, | 57 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h |
105 | compare_set_timeout, NULL, NULL, NULL); | 58 | index XXXXXXX..XXXXXXX 100644 |
106 | 59 | --- a/hw/virtio/vhost-shadow-virtqueue.h | |
107 | + object_property_add(obj, "expired_scan_cycle", "uint32", | 60 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
108 | + compare_get_expired_scan_cycle, | 61 | @@ -XXX,XX +XXX,XX @@ |
109 | + compare_set_expired_scan_cycle, NULL, NULL, NULL); | 62 | #define VHOST_SHADOW_VIRTQUEUE_H |
63 | |||
64 | #include "qemu/event_notifier.h" | ||
65 | +#include "hw/virtio/virtio.h" | ||
66 | +#include "standard-headers/linux/vhost_types.h" | ||
67 | |||
68 | /* Shadow virtqueue to relay notifications */ | ||
69 | typedef struct VhostShadowVirtqueue { | ||
70 | + /* Shadow vring */ | ||
71 | + struct vring vring; | ||
110 | + | 72 | + |
111 | s->vnet_hdr = false; | 73 | /* Shadow kick notifier, sent to vhost */ |
112 | object_property_add_bool(obj, "vnet_hdr_support", compare_get_vnet_hdr, | 74 | EventNotifier hdev_kick; |
113 | compare_set_vnet_hdr, NULL); | 75 | /* Shadow call notifier, sent to vhost */ |
114 | diff --git a/qemu-options.hx b/qemu-options.hx | 76 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp); |
115 | index XXXXXXX..XXXXXXX 100644 | 77 | |
116 | --- a/qemu-options.hx | 78 | void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd); |
117 | +++ b/qemu-options.hx | 79 | void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd); |
118 | @@ -XXX,XX +XXX,XX @@ SRST | 80 | +void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, |
119 | stored. The file format is libpcap, so it can be analyzed with | 81 | + struct vhost_vring_addr *addr); |
120 | tools such as tcpdump or Wireshark. | 82 | +size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); |
121 | 83 | +size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | |
122 | - ``-object colo-compare,id=id,primary_in=chardevid,secondary_in=chardevid,outdev=chardevid,iothread=id[,vnet_hdr_support][,notify_dev=id][,compare_timeout=@var{ms}]`` | 84 | |
123 | + ``-object colo-compare,id=id,primary_in=chardevid,secondary_in=chardevid,outdev=chardevid,iothread=id[,vnet_hdr_support][,notify_dev=id][,compare_timeout=@var{ms}][,expired_scan_cycle=@var{ms}`` | 85 | void vhost_svq_stop(VhostShadowVirtqueue *svq); |
124 | Colo-compare gets packet from primary\_inchardevid and | ||
125 | secondary\_inchardevid, than compare primary packet with | ||
126 | secondary packet. If the packets are same, we will output | ||
127 | @@ -XXX,XX +XXX,XX @@ SRST | ||
128 | vnet\_hdr\_support flag, colo compare will send/recv packet with | ||
129 | vnet\_hdr\_len. Then compare\_timeout=@var{ms} determines the | ||
130 | maximum delay colo-compare wait for the packet. | ||
131 | + The expired\_scan\_cycle=@var{ms} to set the period of scanning | ||
132 | + expired primary node network packets. | ||
133 | If you want to use Xen COLO, will need the notify\_dev to | ||
134 | notify Xen colo-frame to do checkpoint. | ||
135 | 86 | ||
136 | -- | 87 | -- |
137 | 2.5.0 | 88 | 2.7.4 |
138 | 89 | ||
139 | 90 | diff view generated by jsdifflib |
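
For reference, the values returned by vhost_svq_driver_area_size() and vhost_svq_device_area_size() above follow directly from the split-ring layout: 16 bytes per descriptor, a 4-byte avail header plus 2 bytes per ring entry, and a 4-byte used header plus 8 bytes per used element, each area rounded up to the host page size. A stand-alone sketch of the arithmetic (assuming a 4 KiB host page size; not QEMU code):

#include <stddef.h>
#include <stdio.h>

#define EX_HOST_PAGE_SIZE 4096u
#define EX_ROUND_UP(n, d) ((((n) + (d) - 1) / (d)) * (d))

static size_t ex_driver_area_size(unsigned num)
{
    size_t desc  = 16u * num;        /* descriptor table */
    size_t avail = 4u + 2u * num;    /* avail: flags + idx + ring */
    return EX_ROUND_UP(desc + avail, EX_HOST_PAGE_SIZE);
}

static size_t ex_device_area_size(unsigned num)
{
    size_t used = 4u + 8u * num;     /* used: flags + idx + ring */
    return EX_ROUND_UP(used, EX_HOST_PAGE_SIZE);
}

int main(void)
{
    unsigned num = 256;
    /* Prints 8192 and 4096 for a 256-entry ring with 4 KiB pages. */
    printf("num=%u: driver area %zu, device area %zu\n",
           num, ex_driver_area_size(num), ex_device_area_size(num));
    return 0;
}
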
1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The smc91c111_can_receive() function simply returns a boolean value. | 3 | First half of the buffers forwarding part, preparing vhost-vdpa |
4 | callbacks to SVQ to offer it. QEMU cannot enable it at this moment, so | ||
5 | this is effectively dead code at the moment, but it helps to reduce | ||
6 | patch size. | ||
4 | 7 | ||
5 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 8 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
6 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | 9 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
7 | Reviewed-by: Cédric Le Goater <clg@kaod.org> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 10 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 11 | --- |
10 | hw/net/smc91c111.c | 8 ++++---- | 12 | hw/virtio/vhost-vdpa.c | 48 +++++++++++++++++++++++++++++++++++++++++------- |
11 | 1 file changed, 4 insertions(+), 4 deletions(-) | 13 | 1 file changed, 41 insertions(+), 7 deletions(-) |
12 | 14 | ||
13 | diff --git a/hw/net/smc91c111.c b/hw/net/smc91c111.c | 15 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
14 | index XXXXXXX..XXXXXXX 100644 | 16 | index XXXXXXX..XXXXXXX 100644 |
15 | --- a/hw/net/smc91c111.c | 17 | --- a/hw/virtio/vhost-vdpa.c |
16 | +++ b/hw/net/smc91c111.c | 18 | +++ b/hw/virtio/vhost-vdpa.c |
17 | @@ -XXX,XX +XXX,XX @@ static void smc91c111_update(smc91c111_state *s) | 19 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_get_config(struct vhost_dev *dev, uint8_t *config, |
18 | qemu_set_irq(s->irq, level); | 20 | return ret; |
21 | } | ||
22 | |||
23 | +static int vhost_vdpa_set_dev_vring_base(struct vhost_dev *dev, | ||
24 | + struct vhost_vring_state *ring) | ||
25 | +{ | ||
26 | + trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
27 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
28 | +} | ||
29 | + | ||
30 | static int vhost_vdpa_set_vring_dev_kick(struct vhost_dev *dev, | ||
31 | struct vhost_vring_file *file) | ||
32 | { | ||
33 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_call(struct vhost_dev *dev, | ||
34 | return vhost_vdpa_call(dev, VHOST_SET_VRING_CALL, file); | ||
19 | } | 35 | } |
20 | 36 | ||
21 | -static int smc91c111_can_receive(smc91c111_state *s) | 37 | +static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, |
22 | +static bool smc91c111_can_receive(smc91c111_state *s) | 38 | + struct vhost_vring_addr *addr) |
39 | +{ | ||
40 | + trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, | ||
41 | + addr->desc_user_addr, addr->used_user_addr, | ||
42 | + addr->avail_user_addr, | ||
43 | + addr->log_guest_addr); | ||
44 | + | ||
45 | + return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); | ||
46 | + | ||
47 | +} | ||
48 | + | ||
49 | /** | ||
50 | * Set the shadow virtqueue descriptors to the device | ||
51 | * | ||
52 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, | ||
53 | static int vhost_vdpa_set_vring_addr(struct vhost_dev *dev, | ||
54 | struct vhost_vring_addr *addr) | ||
23 | { | 55 | { |
24 | if ((s->rcr & RCR_RXEN) == 0 || (s->rcr & RCR_SOFT_RST)) { | 56 | - trace_vhost_vdpa_set_vring_addr(dev, addr->index, addr->flags, |
25 | - return 1; | 57 | - addr->desc_user_addr, addr->used_user_addr, |
26 | + return true; | 58 | - addr->avail_user_addr, |
27 | } | 59 | - addr->log_guest_addr); |
28 | if (s->allocated == (1 << NUM_PACKETS) - 1 || | 60 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_ADDR, addr); |
29 | s->rx_fifo_len == NUM_PACKETS) { | 61 | + struct vhost_vdpa *v = dev->opaque; |
30 | - return 0; | 62 | + |
31 | + return false; | 63 | + if (v->shadow_vqs_enabled) { |
32 | } | 64 | + /* |
33 | - return 1; | 65 | + * Device vring addr was set at device start. SVQ base is handled by |
34 | + return true; | 66 | + * VirtQueue code. |
67 | + */ | ||
68 | + return 0; | ||
69 | + } | ||
70 | + | ||
71 | + return vhost_vdpa_set_vring_dev_addr(dev, addr); | ||
35 | } | 72 | } |
36 | 73 | ||
37 | static inline void smc91c111_flush_queued_packets(smc91c111_state *s) | 74 | static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, |
75 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_num(struct vhost_dev *dev, | ||
76 | static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, | ||
77 | struct vhost_vring_state *ring) | ||
78 | { | ||
79 | - trace_vhost_vdpa_set_vring_base(dev, ring->index, ring->num); | ||
80 | - return vhost_vdpa_call(dev, VHOST_SET_VRING_BASE, ring); | ||
81 | + struct vhost_vdpa *v = dev->opaque; | ||
82 | + | ||
83 | + if (v->shadow_vqs_enabled) { | ||
84 | + /* | ||
85 | + * Device vring base was set at device start. SVQ base is handled by | ||
86 | + * VirtQueue code. | ||
87 | + */ | ||
88 | + return 0; | ||
89 | + } | ||
90 | + | ||
91 | + return vhost_vdpa_set_dev_vring_base(dev, ring); | ||
92 | } | ||
93 | |||
94 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, | ||
38 | -- | 95 | -- |
39 | 2.5.0 | 96 | 2.7.4 |
40 | 97 | ||
41 | 98 | diff view generated by jsdifflib |
1 | From: Prasad J Pandit <pjp@fedoraproject.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The Tulip network driver while copying tx/rx buffers does not check | 3 | Initial version of shadow virtqueue that actually forwards buffers. There |
4 | frame size against r/w data length. This may lead to OOB buffer | 4 | is no iommu support at the moment, and that will be addressed in future |
5 | access. Add check to avoid it. | 5 | patches of this series. Since all vhost-vdpa devices use forced IOMMU, |
6 | 6 | this means that SVQ is not usable at this point of the series on any | |
7 | Limit iterations over descriptors to avoid potential infinite | 7 | device. |
8 | loop issue in tulip_xmit_list_update. | 8 | |
9 | 9 | For simplicity it only supports modern devices, which expect a vring | |
10 | Reported-by: Li Qiang <pangpei.lq@antfin.com> | 10 | in little endian, with split ring and no event idx or indirect |
11 | Reported-by: Ziming Zhang <ezrakiez@gmail.com> | 11 | descriptors. Support for them will not be added in this series. |
12 | Reported-by: Jason Wang <jasowang@redhat.com> | 12 | |
13 | Tested-by: Li Qiang <liq3ea@gmail.com> | 13 | It reuses the VirtQueue code for the device part. The driver part is |
14 | Reviewed-by: Li Qiang <liq3ea@gmail.com> | 14 | based on Linux's virtio_ring driver, but with stripped functionality |
15 | Signed-off-by: Prasad J Pandit <pjp@fedoraproject.org> | 15 | and optimizations so it's easier to review. |
16 | |||
17 | However, forwarding buffers have some particular pieces: One of the most | ||
18 | unexpected ones is that a guest's buffer can expand through more than | ||
19 | one descriptor in SVQ. While this is handled gracefully by qemu's | ||
20 | emulated virtio devices, it may cause unexpected SVQ queue full. This | ||
21 | patch also solves it by checking for this condition at both guest's | ||
22 | kicks and device's calls. The code may be more elegant in the future if | ||
23 | SVQ code runs in its own iocontext. | ||
24 | |||
25 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
26 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
16 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 27 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
17 | --- | 28 | --- |
18 | hw/net/tulip.c | 36 +++++++++++++++++++++++++++--------- | 29 | hw/virtio/vhost-shadow-virtqueue.c | 352 ++++++++++++++++++++++++++++++++++++- |
19 | 1 file changed, 27 insertions(+), 9 deletions(-) | 30 | hw/virtio/vhost-shadow-virtqueue.h | 26 +++ |
20 | 31 | hw/virtio/vhost-vdpa.c | 155 +++++++++++++++- | |
21 | diff --git a/hw/net/tulip.c b/hw/net/tulip.c | 32 | 3 files changed, 522 insertions(+), 11 deletions(-) |
33 | |||
34 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | ||
22 | index XXXXXXX..XXXXXXX 100644 | 35 | index XXXXXXX..XXXXXXX 100644 |
23 | --- a/hw/net/tulip.c | 36 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
24 | +++ b/hw/net/tulip.c | 37 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
25 | @@ -XXX,XX +XXX,XX @@ static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc) | 38 | @@ -XXX,XX +XXX,XX @@ |
26 | } else { | 39 | #include "qemu/error-report.h" |
27 | len = s->rx_frame_len; | 40 | #include "qapi/error.h" |
28 | } | 41 | #include "qemu/main-loop.h" |
29 | + | 42 | +#include "qemu/log.h" |
30 | + if (s->rx_frame_len + len > sizeof(s->rx_frame)) { | 43 | +#include "qemu/memalign.h" |
31 | + return; | 44 | #include "linux-headers/linux/vhost.h" |
32 | + } | 45 | |
33 | pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame + | 46 | /** |
34 | (s->rx_frame_size - s->rx_frame_len), len); | 47 | @@ -XXX,XX +XXX,XX @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) |
35 | s->rx_frame_len -= len; | ||
36 | @@ -XXX,XX +XXX,XX @@ static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc) | ||
37 | } else { | ||
38 | len = s->rx_frame_len; | ||
39 | } | ||
40 | + | ||
41 | + if (s->rx_frame_len + len > sizeof(s->rx_frame)) { | ||
42 | + return; | ||
43 | + } | ||
44 | pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame + | ||
45 | (s->rx_frame_size - s->rx_frame_len), len); | ||
46 | s->rx_frame_len -= len; | ||
47 | @@ -XXX,XX +XXX,XX @@ static ssize_t tulip_receive(TULIPState *s, const uint8_t *buf, size_t size) | ||
48 | |||
49 | trace_tulip_receive(buf, size); | ||
50 | |||
51 | - if (size < 14 || size > 2048 || s->rx_frame_len || tulip_rx_stopped(s)) { | ||
52 | + if (size < 14 || size > sizeof(s->rx_frame) - 4 | ||
53 | + || s->rx_frame_len || tulip_rx_stopped(s)) { | ||
54 | return 0; | ||
55 | } | ||
56 | |||
57 | @@ -XXX,XX +XXX,XX @@ static ssize_t tulip_receive_nc(NetClientState *nc, | ||
58 | return tulip_receive(qemu_get_nic_opaque(nc), buf, size); | ||
59 | } | 48 | } |
60 | 49 | ||
61 | - | 50 | /** |
62 | static NetClientInfo net_tulip_info = { | 51 | - * Forward guest notifications. |
63 | .type = NET_CLIENT_DRIVER_NIC, | 52 | + * Number of descriptors that the SVQ can make available from the guest. |
64 | .size = sizeof(NICState), | 53 | + * |
65 | @@ -XXX,XX +XXX,XX @@ static void tulip_tx(TULIPState *s, struct tulip_descriptor *desc) | 54 | + * @svq: The svq |
66 | if ((s->csr[6] >> CSR6_OM_SHIFT) & CSR6_OM_MASK) { | 55 | + */ |
67 | /* Internal or external Loopback */ | 56 | +static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
68 | tulip_receive(s, s->tx_frame, s->tx_frame_len); | 57 | +{ |
69 | - } else { | 58 | + return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); |
70 | + } else if (s->tx_frame_len <= sizeof(s->tx_frame)) { | 59 | +} |
71 | qemu_send_packet(qemu_get_queue(s->nic), | 60 | + |
72 | s->tx_frame, s->tx_frame_len); | 61 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
73 | } | 62 | + const struct iovec *iovec, size_t num, |
74 | @@ -XXX,XX +XXX,XX @@ static void tulip_tx(TULIPState *s, struct tulip_descriptor *desc) | 63 | + bool more_descs, bool write) |
64 | +{ | ||
65 | + uint16_t i = svq->free_head, last = svq->free_head; | ||
66 | + unsigned n; | ||
67 | + uint16_t flags = write ? cpu_to_le16(VRING_DESC_F_WRITE) : 0; | ||
68 | + vring_desc_t *descs = svq->vring.desc; | ||
69 | + | ||
70 | + if (num == 0) { | ||
71 | + return; | ||
72 | + } | ||
73 | + | ||
74 | + for (n = 0; n < num; n++) { | ||
75 | + if (more_descs || (n + 1 < num)) { | ||
76 | + descs[i].flags = flags | cpu_to_le16(VRING_DESC_F_NEXT); | ||
77 | + } else { | ||
78 | + descs[i].flags = flags; | ||
79 | + } | ||
80 | + descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
81 | + descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
82 | + | ||
83 | + last = i; | ||
84 | + i = cpu_to_le16(descs[i].next); | ||
85 | + } | ||
86 | + | ||
87 | + svq->free_head = le16_to_cpu(descs[last].next); | ||
88 | +} | ||
89 | + | ||
90 | +static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
91 | + VirtQueueElement *elem, unsigned *head) | ||
92 | +{ | ||
93 | + unsigned avail_idx; | ||
94 | + vring_avail_t *avail = svq->vring.avail; | ||
95 | + | ||
96 | + *head = svq->free_head; | ||
97 | + | ||
98 | + /* We need some descriptors here */ | ||
99 | + if (unlikely(!elem->out_num && !elem->in_num)) { | ||
100 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
101 | + "Guest provided element with no descriptors"); | ||
102 | + return false; | ||
103 | + } | ||
104 | + | ||
105 | + vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
106 | + false); | ||
107 | + vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
108 | + | ||
109 | + /* | ||
110 | + * Put the entry in the available array (but don't update avail->idx until | ||
111 | + * they do sync). | ||
112 | + */ | ||
113 | + avail_idx = svq->shadow_avail_idx & (svq->vring.num - 1); | ||
114 | + avail->ring[avail_idx] = cpu_to_le16(*head); | ||
115 | + svq->shadow_avail_idx++; | ||
116 | + | ||
117 | + /* Update the avail index after write the descriptor */ | ||
118 | + smp_wmb(); | ||
119 | + avail->idx = cpu_to_le16(svq->shadow_avail_idx); | ||
120 | + | ||
121 | + return true; | ||
122 | +} | ||
123 | + | ||
124 | +static bool vhost_svq_add(VhostShadowVirtqueue *svq, VirtQueueElement *elem) | ||
125 | +{ | ||
126 | + unsigned qemu_head; | ||
127 | + bool ok = vhost_svq_add_split(svq, elem, &qemu_head); | ||
128 | + if (unlikely(!ok)) { | ||
129 | + return false; | ||
130 | + } | ||
131 | + | ||
132 | + svq->ring_id_maps[qemu_head] = elem; | ||
133 | + return true; | ||
134 | +} | ||
135 | + | ||
136 | +static void vhost_svq_kick(VhostShadowVirtqueue *svq) | ||
137 | +{ | ||
138 | + /* | ||
139 | + * We need to expose the available array entries before checking the used | ||
140 | + * flags | ||
141 | + */ | ||
142 | + smp_mb(); | ||
143 | + if (svq->vring.used->flags & VRING_USED_F_NO_NOTIFY) { | ||
144 | + return; | ||
145 | + } | ||
146 | + | ||
147 | + event_notifier_set(&svq->hdev_kick); | ||
148 | +} | ||
149 | + | ||
150 | +/** | ||
151 | + * Forward available buffers. | ||
152 | + * | ||
153 | + * @svq: Shadow VirtQueue | ||
154 | + * | ||
155 | + * Note that this function does not guarantee that all guest's available | ||
156 | + * buffers are available to the device in SVQ avail ring. The guest may have | ||
157 | + * exposed a GPA / GIOVA contiguous buffer, but it may not be contiguous in | ||
158 | + * qemu vaddr. | ||
159 | + * | ||
160 | + * If that happens, guest's kick notifications will be disabled until the | ||
161 | + * device uses some buffers. | ||
162 | + */ | ||
163 | +static void vhost_handle_guest_kick(VhostShadowVirtqueue *svq) | ||
164 | +{ | ||
165 | + /* Clear event notifier */ | ||
166 | + event_notifier_test_and_clear(&svq->svq_kick); | ||
167 | + | ||
168 | + /* Forward to the device as many available buffers as possible */ | ||
169 | + do { | ||
170 | + virtio_queue_set_notification(svq->vq, false); | ||
171 | + | ||
172 | + while (true) { | ||
173 | + VirtQueueElement *elem; | ||
174 | + bool ok; | ||
175 | + | ||
176 | + if (svq->next_guest_avail_elem) { | ||
177 | + elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
178 | + } else { | ||
179 | + elem = virtqueue_pop(svq->vq, sizeof(*elem)); | ||
180 | + } | ||
181 | + | ||
182 | + if (!elem) { | ||
183 | + break; | ||
184 | + } | ||
185 | + | ||
186 | + if (elem->out_num + elem->in_num > vhost_svq_available_slots(svq)) { | ||
187 | + /* | ||
188 | + * This condition is possible since a contiguous buffer in GPA | ||
189 | + * does not imply a contiguous buffer in qemu's VA | ||
190 | + * scatter-gather segments. If that happens, the buffer exposed | ||
191 | + * to the device needs to be a chain of descriptors at this | ||
192 | + * moment. | ||
193 | + * | ||
194 | + * SVQ cannot hold more available buffers if we are here: | ||
195 | + * queue the current guest descriptor and ignore further kicks | ||
196 | + * until some elements are used. | ||
197 | + */ | ||
198 | + svq->next_guest_avail_elem = elem; | ||
199 | + return; | ||
200 | + } | ||
201 | + | ||
202 | + ok = vhost_svq_add(svq, elem); | ||
203 | + if (unlikely(!ok)) { | ||
204 | + /* VQ is broken, just return and ignore any other kicks */ | ||
205 | + return; | ||
206 | + } | ||
207 | + vhost_svq_kick(svq); | ||
208 | + } | ||
209 | + | ||
210 | + virtio_queue_set_notification(svq->vq, true); | ||
211 | + } while (!virtio_queue_empty(svq->vq)); | ||
212 | +} | ||
213 | + | ||
214 | +/** | ||
215 | + * Handle guest's kick. | ||
216 | * | ||
217 | * @n: guest kick event notifier, the one that guest set to notify svq. | ||
218 | */ | ||
219 | -static void vhost_handle_guest_kick(EventNotifier *n) | ||
220 | +static void vhost_handle_guest_kick_notifier(EventNotifier *n) | ||
221 | { | ||
222 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, svq_kick); | ||
223 | event_notifier_test_and_clear(n); | ||
224 | - event_notifier_set(&svq->hdev_kick); | ||
225 | + vhost_handle_guest_kick(svq); | ||
226 | +} | ||
227 | + | ||
228 | +static bool vhost_svq_more_used(VhostShadowVirtqueue *svq) | ||
229 | +{ | ||
230 | + if (svq->last_used_idx != svq->shadow_used_idx) { | ||
231 | + return true; | ||
232 | + } | ||
233 | + | ||
234 | + svq->shadow_used_idx = cpu_to_le16(svq->vring.used->idx); | ||
235 | + | ||
236 | + return svq->last_used_idx != svq->shadow_used_idx; | ||
237 | } | ||
238 | |||
239 | /** | ||
240 | - * Forward vhost notifications | ||
241 | + * Enable vhost device calls after disable them. | ||
242 | + * | ||
243 | + * @svq: The svq | ||
244 | + * | ||
245 | + * It returns false if there are pending used buffers from the vhost device, | ||
246 | + * avoiding the possible races between SVQ checking for more work and enabling | ||
247 | + * callbacks. True if SVQ used vring has no more pending buffers. | ||
248 | + */ | ||
249 | +static bool vhost_svq_enable_notification(VhostShadowVirtqueue *svq) | ||
250 | +{ | ||
251 | + svq->vring.avail->flags &= ~cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
252 | + /* Make sure the flag is written before the read of used_idx */ | ||
253 | + smp_mb(); | ||
254 | + return !vhost_svq_more_used(svq); | ||
255 | +} | ||
256 | + | ||
257 | +static void vhost_svq_disable_notification(VhostShadowVirtqueue *svq) | ||
258 | +{ | ||
259 | + svq->vring.avail->flags |= cpu_to_le16(VRING_AVAIL_F_NO_INTERRUPT); | ||
260 | +} | ||
261 | + | ||
262 | +static VirtQueueElement *vhost_svq_get_buf(VhostShadowVirtqueue *svq, | ||
263 | + uint32_t *len) | ||
264 | +{ | ||
265 | + vring_desc_t *descs = svq->vring.desc; | ||
266 | + const vring_used_t *used = svq->vring.used; | ||
267 | + vring_used_elem_t used_elem; | ||
268 | + uint16_t last_used; | ||
269 | + | ||
270 | + if (!vhost_svq_more_used(svq)) { | ||
271 | + return NULL; | ||
272 | + } | ||
273 | + | ||
274 | + /* Only get used array entries after they have been exposed by dev */ | ||
275 | + smp_rmb(); | ||
276 | + last_used = svq->last_used_idx & (svq->vring.num - 1); | ||
277 | + used_elem.id = le32_to_cpu(used->ring[last_used].id); | ||
278 | + used_elem.len = le32_to_cpu(used->ring[last_used].len); | ||
279 | + | ||
280 | + svq->last_used_idx++; | ||
281 | + if (unlikely(used_elem.id >= svq->vring.num)) { | ||
282 | + qemu_log_mask(LOG_GUEST_ERROR, "Device %s says index %u is used", | ||
283 | + svq->vdev->name, used_elem.id); | ||
284 | + return NULL; | ||
285 | + } | ||
286 | + | ||
287 | + if (unlikely(!svq->ring_id_maps[used_elem.id])) { | ||
288 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
289 | + "Device %s says index %u is used, but it was not available", | ||
290 | + svq->vdev->name, used_elem.id); | ||
291 | + return NULL; | ||
292 | + } | ||
293 | + | ||
294 | + descs[used_elem.id].next = svq->free_head; | ||
295 | + svq->free_head = used_elem.id; | ||
296 | + | ||
297 | + *len = used_elem.len; | ||
298 | + return g_steal_pointer(&svq->ring_id_maps[used_elem.id]); | ||
299 | +} | ||
300 | + | ||
301 | +static void vhost_svq_flush(VhostShadowVirtqueue *svq, | ||
302 | + bool check_for_avail_queue) | ||
303 | +{ | ||
304 | + VirtQueue *vq = svq->vq; | ||
305 | + | ||
306 | + /* Forward as many used buffers as possible. */ | ||
307 | + do { | ||
308 | + unsigned i = 0; | ||
309 | + | ||
310 | + vhost_svq_disable_notification(svq); | ||
311 | + while (true) { | ||
312 | + uint32_t len; | ||
313 | + g_autofree VirtQueueElement *elem = vhost_svq_get_buf(svq, &len); | ||
314 | + if (!elem) { | ||
315 | + break; | ||
316 | + } | ||
317 | + | ||
318 | + if (unlikely(i >= svq->vring.num)) { | ||
319 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
320 | + "More than %u used buffers obtained in a %u size SVQ", | ||
321 | + i, svq->vring.num); | ||
322 | + virtqueue_fill(vq, elem, len, i); | ||
323 | + virtqueue_flush(vq, i); | ||
324 | + return; | ||
325 | + } | ||
326 | + virtqueue_fill(vq, elem, len, i++); | ||
327 | + } | ||
328 | + | ||
329 | + virtqueue_flush(vq, i); | ||
330 | + event_notifier_set(&svq->svq_call); | ||
331 | + | ||
332 | + if (check_for_avail_queue && svq->next_guest_avail_elem) { | ||
333 | + /* | ||
334 | + * Avail ring was full when vhost_svq_flush was called, so it's a | ||
335 | + * good moment to make more descriptors available if possible. | ||
336 | + */ | ||
337 | + vhost_handle_guest_kick(svq); | ||
338 | + } | ||
339 | + } while (!vhost_svq_enable_notification(svq)); | ||
340 | +} | ||
341 | + | ||
342 | +/** | ||
343 | + * Forward used buffers. | ||
344 | * | ||
345 | * @n: hdev call event notifier, the one that device set to notify svq. | ||
346 | + * | ||
347 | + * Note that we are not making any buffers available in the loop, there is no | ||
348 | + * way that it runs more than virtqueue size times. | ||
349 | */ | ||
350 | static void vhost_svq_handle_call(EventNotifier *n) | ||
351 | { | ||
352 | VhostShadowVirtqueue *svq = container_of(n, VhostShadowVirtqueue, | ||
353 | hdev_call); | ||
354 | event_notifier_test_and_clear(n); | ||
355 | - event_notifier_set(&svq->svq_call); | ||
356 | + vhost_svq_flush(svq, true); | ||
357 | } | ||
358 | |||
359 | /** | ||
360 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) | ||
361 | if (poll_start) { | ||
362 | event_notifier_init_fd(svq_kick, svq_kick_fd); | ||
363 | event_notifier_set(svq_kick); | ||
364 | - event_notifier_set_handler(svq_kick, vhost_handle_guest_kick); | ||
365 | + event_notifier_set_handler(svq_kick, vhost_handle_guest_kick_notifier); | ||
366 | + } | ||
367 | +} | ||
368 | + | ||
369 | +/** | ||
370 | + * Start the shadow virtqueue operation. | ||
371 | + * | ||
372 | + * @svq: Shadow Virtqueue | ||
373 | + * @vdev: VirtIO device | ||
374 | + * @vq: Virtqueue to shadow | ||
375 | + */ | ||
376 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
377 | + VirtQueue *vq) | ||
378 | +{ | ||
379 | + size_t desc_size, driver_size, device_size; | ||
380 | + | ||
381 | + svq->next_guest_avail_elem = NULL; | ||
382 | + svq->shadow_avail_idx = 0; | ||
383 | + svq->shadow_used_idx = 0; | ||
384 | + svq->last_used_idx = 0; | ||
385 | + svq->vdev = vdev; | ||
386 | + svq->vq = vq; | ||
387 | + | ||
388 | + svq->vring.num = virtio_queue_get_num(vdev, virtio_get_queue_index(vq)); | ||
389 | + driver_size = vhost_svq_driver_area_size(svq); | ||
390 | + device_size = vhost_svq_device_area_size(svq); | ||
391 | + svq->vring.desc = qemu_memalign(qemu_real_host_page_size, driver_size); | ||
392 | + desc_size = sizeof(vring_desc_t) * svq->vring.num; | ||
393 | + svq->vring.avail = (void *)((char *)svq->vring.desc + desc_size); | ||
394 | + memset(svq->vring.desc, 0, driver_size); | ||
395 | + svq->vring.used = qemu_memalign(qemu_real_host_page_size, device_size); | ||
396 | + memset(svq->vring.used, 0, device_size); | ||
397 | + svq->ring_id_maps = g_new0(VirtQueueElement *, svq->vring.num); | ||
398 | + for (unsigned i = 0; i < svq->vring.num - 1; i++) { | ||
399 | + svq->vring.desc[i].next = cpu_to_le16(i + 1); | ||
75 | } | 400 | } |
76 | } | 401 | } |
77 | 402 | ||
78 | -static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc) | 403 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_kick_fd(VhostShadowVirtqueue *svq, int svq_kick_fd) |
79 | +static int tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc) | 404 | void vhost_svq_stop(VhostShadowVirtqueue *svq) |
80 | { | 405 | { |
81 | int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK; | 406 | event_notifier_set_handler(&svq->svq_kick, NULL); |
82 | int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK; | 407 | + g_autofree VirtQueueElement *next_avail_elem = NULL; |
83 | 408 | + | |
84 | + if (s->tx_frame_len + len1 > sizeof(s->tx_frame)) { | 409 | + if (!svq->vq) { |
85 | + return -1; | 410 | + return; |
86 | + } | 411 | + } |
87 | if (len1) { | 412 | + |
88 | pci_dma_read(&s->dev, desc->buf_addr1, | 413 | + /* Send all pending used descriptors to guest */ |
89 | s->tx_frame + s->tx_frame_len, len1); | 414 | + vhost_svq_flush(svq, false); |
90 | s->tx_frame_len += len1; | 415 | + |
416 | + for (unsigned i = 0; i < svq->vring.num; ++i) { | ||
417 | + g_autofree VirtQueueElement *elem = NULL; | ||
418 | + elem = g_steal_pointer(&svq->ring_id_maps[i]); | ||
419 | + if (elem) { | ||
420 | + virtqueue_detach_element(svq->vq, elem, 0); | ||
421 | + } | ||
422 | + } | ||
423 | + | ||
424 | + next_avail_elem = g_steal_pointer(&svq->next_guest_avail_elem); | ||
425 | + if (next_avail_elem) { | ||
426 | + virtqueue_detach_element(svq->vq, next_avail_elem, 0); | ||
427 | + } | ||
428 | + svq->vq = NULL; | ||
429 | + g_free(svq->ring_id_maps); | ||
430 | + qemu_vfree(svq->vring.desc); | ||
431 | + qemu_vfree(svq->vring.used); | ||
432 | } | ||
433 | |||
434 | /** | ||
435 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
436 | index XXXXXXX..XXXXXXX 100644 | ||
437 | --- a/hw/virtio/vhost-shadow-virtqueue.h | ||
438 | +++ b/hw/virtio/vhost-shadow-virtqueue.h | ||
439 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
440 | |||
441 | /* Guest's call notifier, where the SVQ calls guest. */ | ||
442 | EventNotifier svq_call; | ||
443 | + | ||
444 | + /* Virtio queue shadowing */ | ||
445 | + VirtQueue *vq; | ||
446 | + | ||
447 | + /* Virtio device */ | ||
448 | + VirtIODevice *vdev; | ||
449 | + | ||
450 | + /* Map for use the guest's descriptors */ | ||
451 | + VirtQueueElement **ring_id_maps; | ||
452 | + | ||
453 | + /* Next VirtQueue element that guest made available */ | ||
454 | + VirtQueueElement *next_guest_avail_elem; | ||
455 | + | ||
456 | + /* Next head to expose to the device */ | ||
457 | + uint16_t shadow_avail_idx; | ||
458 | + | ||
459 | + /* Next free descriptor */ | ||
460 | + uint16_t free_head; | ||
461 | + | ||
462 | + /* Last seen used idx */ | ||
463 | + uint16_t shadow_used_idx; | ||
464 | + | ||
465 | + /* Next head to consume from the device */ | ||
466 | + uint16_t last_used_idx; | ||
467 | } VhostShadowVirtqueue; | ||
468 | |||
469 | bool vhost_svq_valid_features(uint64_t features, Error **errp); | ||
470 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
471 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq); | ||
472 | size_t vhost_svq_device_area_size(const VhostShadowVirtqueue *svq); | ||
473 | |||
474 | +void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
475 | + VirtQueue *vq); | ||
476 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
477 | |||
478 | VhostShadowVirtqueue *vhost_svq_new(void); | ||
479 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
480 | index XXXXXXX..XXXXXXX 100644 | ||
481 | --- a/hw/virtio/vhost-vdpa.c | ||
482 | +++ b/hw/virtio/vhost-vdpa.c | ||
483 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_dev_addr(struct vhost_dev *dev, | ||
484 | * Note that this function does not rewind kick file descriptor if cannot set | ||
485 | * call one. | ||
486 | */ | ||
487 | -static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
488 | - VhostShadowVirtqueue *svq, unsigned idx, | ||
489 | - Error **errp) | ||
490 | +static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
491 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
492 | + Error **errp) | ||
493 | { | ||
494 | struct vhost_vring_file file = { | ||
495 | .index = dev->vq_index + idx, | ||
496 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
497 | r = vhost_vdpa_set_vring_dev_kick(dev, &file); | ||
498 | if (unlikely(r != 0)) { | ||
499 | error_setg_errno(errp, -r, "Can't set device kick fd"); | ||
500 | - return false; | ||
501 | + return r; | ||
91 | } | 502 | } |
92 | 503 | ||
93 | + if (s->tx_frame_len + len2 > sizeof(s->tx_frame)) { | 504 | event_notifier = &svq->hdev_call; |
94 | + return -1; | 505 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, |
95 | + } | 506 | error_setg_errno(errp, -r, "Can't set device call fd"); |
96 | if (len2) { | ||
97 | pci_dma_read(&s->dev, desc->buf_addr2, | ||
98 | s->tx_frame + s->tx_frame_len, len2); | ||
99 | s->tx_frame_len += len2; | ||
100 | } | 507 | } |
101 | desc->status = (len1 + len2) ? 0 : 0x7fffffff; | 508 | |
102 | + | 509 | + return r; |
103 | + return 0; | 510 | +} |
511 | + | ||
512 | +/** | ||
513 | + * Unmap a SVQ area in the device | ||
514 | + */ | ||
515 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
516 | + hwaddr size) | ||
517 | +{ | ||
518 | + int r; | ||
519 | + | ||
520 | + size = ROUND_UP(size, qemu_real_host_page_size); | ||
521 | + r = vhost_vdpa_dma_unmap(v, iova, size); | ||
522 | + return r == 0; | ||
523 | +} | ||
524 | + | ||
525 | +static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, | ||
526 | + const VhostShadowVirtqueue *svq) | ||
527 | +{ | ||
528 | + struct vhost_vdpa *v = dev->opaque; | ||
529 | + struct vhost_vring_addr svq_addr; | ||
530 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
531 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
532 | + bool ok; | ||
533 | + | ||
534 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
535 | + | ||
536 | + ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); | ||
537 | + if (unlikely(!ok)) { | ||
538 | + return false; | ||
539 | + } | ||
540 | + | ||
541 | + return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); | ||
542 | +} | ||
543 | + | ||
544 | +/** | ||
545 | + * Map the shadow virtqueue rings in the device | ||
546 | + * | ||
547 | + * @dev: The vhost device | ||
548 | + * @svq: The shadow virtqueue | ||
549 | + * @addr: Assigned IOVA addresses | ||
550 | + * @errp: Error pointer | ||
551 | + */ | ||
552 | +static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, | ||
553 | + const VhostShadowVirtqueue *svq, | ||
554 | + struct vhost_vring_addr *addr, | ||
555 | + Error **errp) | ||
556 | +{ | ||
557 | + struct vhost_vdpa *v = dev->opaque; | ||
558 | + size_t device_size = vhost_svq_device_area_size(svq); | ||
559 | + size_t driver_size = vhost_svq_driver_area_size(svq); | ||
560 | + int r; | ||
561 | + | ||
562 | + ERRP_GUARD(); | ||
563 | + vhost_svq_get_vring_addr(svq, addr); | ||
564 | + | ||
565 | + r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
566 | + (void *)(uintptr_t)addr->desc_user_addr, true); | ||
567 | + if (unlikely(r != 0)) { | ||
568 | + error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
569 | + return false; | ||
570 | + } | ||
571 | + | ||
572 | + r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
573 | + (void *)(intptr_t)addr->used_user_addr, false); | ||
574 | + if (unlikely(r != 0)) { | ||
575 | + error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
576 | + } | ||
577 | + | ||
578 | + return r == 0; | ||
579 | +} | ||
580 | + | ||
581 | +static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
582 | + VhostShadowVirtqueue *svq, unsigned idx, | ||
583 | + Error **errp) | ||
584 | +{ | ||
585 | + uint16_t vq_index = dev->vq_index + idx; | ||
586 | + struct vhost_vring_state s = { | ||
587 | + .index = vq_index, | ||
588 | + }; | ||
589 | + int r; | ||
590 | + | ||
591 | + r = vhost_vdpa_set_dev_vring_base(dev, &s); | ||
592 | + if (unlikely(r)) { | ||
593 | + error_setg_errno(errp, -r, "Cannot set vring base"); | ||
594 | + return false; | ||
595 | + } | ||
596 | + | ||
597 | + r = vhost_vdpa_svq_set_fds(dev, svq, idx, errp); | ||
598 | return r == 0; | ||
104 | } | 599 | } |
105 | 600 | ||
106 | static void tulip_setup_filter_addr(TULIPState *s, uint8_t *buf, int n) | 601 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svqs_start(struct vhost_dev *dev) |
107 | @@ -XXX,XX +XXX,XX @@ static uint32_t tulip_ts(TULIPState *s) | ||
108 | |||
109 | static void tulip_xmit_list_update(TULIPState *s) | ||
110 | { | ||
111 | +#define TULIP_DESC_MAX 128 | ||
112 | + uint8_t i = 0; | ||
113 | struct tulip_descriptor desc; | ||
114 | |||
115 | if (tulip_ts(s) != CSR5_TS_SUSPENDED) { | ||
116 | return; | ||
117 | } | 602 | } |
118 | 603 | ||
119 | - for (;;) { | 604 | for (i = 0; i < v->shadow_vqs->len; ++i) { |
120 | + for (i = 0; i < TULIP_DESC_MAX; i++) { | 605 | + VirtQueue *vq = virtio_get_queue(dev->vdev, dev->vq_index + i); |
121 | tulip_desc_read(s, s->current_tx_desc, &desc); | 606 | VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); |
122 | tulip_dump_tx_descriptor(s, &desc); | 607 | + struct vhost_vring_addr addr = { |
123 | 608 | + .index = i, | |
124 | @@ -XXX,XX +XXX,XX @@ static void tulip_xmit_list_update(TULIPState *s) | 609 | + }; |
125 | s->tx_frame_len = 0; | 610 | + int r; |
126 | } | 611 | bool ok = vhost_vdpa_svq_setup(dev, svq, i, &err); |
127 | 612 | if (unlikely(!ok)) { | |
128 | - tulip_copy_tx_buffers(s, &desc); | 613 | - error_reportf_err(err, "Cannot setup SVQ %u: ", i); |
129 | - | 614 | + goto err; |
130 | - if (desc.control & TDES1_LS) { | 615 | + } |
131 | - tulip_tx(s, &desc); | 616 | + |
132 | + if (!tulip_copy_tx_buffers(s, &desc)) { | 617 | + vhost_svq_start(svq, dev->vdev, vq); |
133 | + if (desc.control & TDES1_LS) { | 618 | + ok = vhost_vdpa_svq_map_rings(dev, svq, &addr, &err); |
134 | + tulip_tx(s, &desc); | 619 | + if (unlikely(!ok)) { |
135 | + } | 620 | + goto err_map; |
136 | } | 621 | + } |
622 | + | ||
623 | + /* Override vring GPA set by vhost subsystem */ | ||
624 | + r = vhost_vdpa_set_vring_dev_addr(dev, &addr); | ||
625 | + if (unlikely(r != 0)) { | ||
626 | + error_setg_errno(&err, -r, "Cannot set device address"); | ||
627 | + goto err_set_addr; | ||
628 | + } | ||
629 | + } | ||
630 | + | ||
631 | + return true; | ||
632 | + | ||
633 | +err_set_addr: | ||
634 | + vhost_vdpa_svq_unmap_rings(dev, g_ptr_array_index(v->shadow_vqs, i)); | ||
635 | + | ||
636 | +err_map: | ||
637 | + vhost_svq_stop(g_ptr_array_index(v->shadow_vqs, i)); | ||
638 | + | ||
639 | +err: | ||
640 | + error_reportf_err(err, "Cannot setup SVQ %u: ", i); | ||
641 | + for (unsigned j = 0; j < i; ++j) { | ||
642 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, j); | ||
643 | + vhost_vdpa_svq_unmap_rings(dev, svq); | ||
644 | + vhost_svq_stop(svq); | ||
645 | + } | ||
646 | + | ||
647 | + return false; | ||
648 | +} | ||
649 | + | ||
650 | +static bool vhost_vdpa_svqs_stop(struct vhost_dev *dev) | ||
651 | +{ | ||
652 | + struct vhost_vdpa *v = dev->opaque; | ||
653 | + | ||
654 | + if (!v->shadow_vqs) { | ||
655 | + return true; | ||
656 | + } | ||
657 | + | ||
658 | + for (unsigned i = 0; i < v->shadow_vqs->len; ++i) { | ||
659 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, i); | ||
660 | + bool ok = vhost_vdpa_svq_unmap_rings(dev, svq); | ||
661 | + if (unlikely(!ok)) { | ||
662 | return false; | ||
137 | } | 663 | } |
138 | tulip_desc_write(s, s->current_tx_desc, &desc); | 664 | } |
665 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) | ||
666 | } | ||
667 | vhost_vdpa_set_vring_ready(dev); | ||
668 | } else { | ||
669 | + ok = vhost_vdpa_svqs_stop(dev); | ||
670 | + if (unlikely(!ok)) { | ||
671 | + return -1; | ||
672 | + } | ||
673 | vhost_vdpa_host_notifiers_uninit(dev, dev->nvqs); | ||
674 | } | ||
675 | |||
139 | -- | 676 | -- |
140 | 2.5.0 | 677 | 2.7.4 |
141 | 678 | ||
142 | 679 | diff view generated by jsdifflib |
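
One detail of the buffer-forwarding patch above that is easy to miss is how vhost_svq_available_slots() decides whether a guest element still fits: shadow_avail_idx and shadow_used_idx are free-running 16-bit counters, so their difference counts in-flight descriptors correctly even after they wrap. A stand-alone sketch of that accounting (invented names, not QEMU code):

#include <stdint.h>
#include <stdio.h>

static uint16_t ex_available_slots(uint16_t num, uint16_t shadow_avail_idx,
                                   uint16_t shadow_used_idx)
{
    /* Wraparound-safe: the subtraction is done modulo 2^16. */
    return num - (uint16_t)(shadow_avail_idx - shadow_used_idx);
}

int main(void)
{
    uint16_t num = 256;

    /* Nothing in flight yet: all 256 slots are free. */
    printf("%u\n", (unsigned)ex_available_slots(num, 0, 0));

    /* Counters wrapped past 65535 with 3 descriptors still in flight. */
    printf("%u\n", (unsigned)ex_available_slots(num, 2, 65535));

    return 0;
}

When a guest element needs more slots than this (its scatter-gather list can grow when GPA-contiguous buffers are not contiguous in qemu's VA), the element is parked in next_guest_avail_elem and further guest kicks are ignored until the device uses some buffers, exactly as the commit message describes.
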
1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | Rewrite: | 3 | This iova tree function allows it to look for a hole in allocated |
4 | 4 | regions and return a totally new translation for a given translated | |
5 | if (E) { | 5 | address. |
6 | return A; | 6 | |
7 | } else { | 7 | Its usage is mainly to allow devices to access qemu address space, |
8 | return B; | 8 | remapping guest's one into a new iova space where qemu can add chunks of |
9 | } | 9 | addresses. |
10 | /* EOF */ | 10 | |
11 | } | 11 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
12 | 12 | Reviewed-by: Peter Xu <peterx@redhat.com> | |
13 | as: | 13 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
14 | |||
15 | if (E) { | ||
16 | return A; | ||
17 | } | ||
18 | return B; | ||
19 | } | ||
20 | |||
21 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> | ||
22 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | ||
23 | Reviewed-by: Cédric Le Goater <clg@kaod.org> | ||
24 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 14 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
25 | --- | 15 | --- |
26 | hw/net/rtl8139.c | 8 ++++---- | 16 | include/qemu/iova-tree.h | 18 +++++++ |
27 | 1 file changed, 4 insertions(+), 4 deletions(-) | 17 | util/iova-tree.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++ |
28 | 18 | 2 files changed, 154 insertions(+) | |
29 | diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c | 19 | |
20 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h | ||
30 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
31 | --- a/hw/net/rtl8139.c | 22 | --- a/include/qemu/iova-tree.h |
32 | +++ b/hw/net/rtl8139.c | 23 | +++ b/include/qemu/iova-tree.h |
33 | @@ -XXX,XX +XXX,XX @@ static int rtl8139_can_receive(NetClientState *nc) | 24 | @@ -XXX,XX +XXX,XX @@ |
34 | /* ??? Flow control not implemented in c+ mode. | 25 | #define IOVA_OK (0) |
35 | This is a hack to work around slirp deficiencies anyway. */ | 26 | #define IOVA_ERR_INVALID (-1) /* Invalid parameters */ |
36 | return 1; | 27 | #define IOVA_ERR_OVERLAP (-2) /* IOVA range overlapped */ |
37 | - } else { | 28 | +#define IOVA_ERR_NOMEM (-3) /* Cannot allocate */ |
38 | - avail = MOD2(s->RxBufferSize + s->RxBufPtr - s->RxBufAddr, | 29 | |
39 | - s->RxBufferSize); | 30 | typedef struct IOVATree IOVATree; |
40 | - return (avail == 0 || avail >= 1514 || (s->IntrMask & RxOverflow)); | 31 | typedef struct DMAMap { |
41 | } | 32 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova); |
42 | + | 33 | void iova_tree_foreach(IOVATree *tree, iova_tree_iterator iterator); |
43 | + avail = MOD2(s->RxBufferSize + s->RxBufPtr - s->RxBufAddr, | 34 | |
44 | + s->RxBufferSize); | 35 | /** |
45 | + return avail == 0 || avail >= 1514 || (s->IntrMask & RxOverflow); | 36 | + * iova_tree_alloc_map: |
37 | + * | ||
38 | + * @tree: the iova tree to allocate from | ||
39 | + * @map: the new map (as translated addr & size) to allocate in the iova region | ||
40 | + * @iova_begin: the minimum address of the allocation | ||
41 | + * @iova_end: the maximum addressable direction of the allocation | ||
42 | + * | ||
43 | + * Allocates a new region of a given size, between iova_min and iova_max. | ||
44 | + * | ||
45 | + * Return: Same as iova_tree_insert, but cannot overlap and can return error if | ||
46 | + * iova tree is out of free contiguous range. The caller gets the assigned iova | ||
47 | + * in map->iova. | ||
48 | + */ | ||
49 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
50 | + hwaddr iova_end); | ||
51 | + | ||
52 | +/** | ||
53 | * iova_tree_destroy: | ||
54 | * | ||
55 | * @tree: the iova tree to destroy | ||
56 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
57 | index XXXXXXX..XXXXXXX 100644 | ||
58 | --- a/util/iova-tree.c | ||
59 | +++ b/util/iova-tree.c | ||
60 | @@ -XXX,XX +XXX,XX @@ struct IOVATree { | ||
61 | GTree *tree; | ||
62 | }; | ||
63 | |||
64 | +/* Args to pass to iova_tree_alloc foreach function. */ | ||
65 | +struct IOVATreeAllocArgs { | ||
66 | + /* Size of the desired allocation */ | ||
67 | + size_t new_size; | ||
68 | + | ||
69 | + /* The minimum address allowed in the allocation */ | ||
70 | + hwaddr iova_begin; | ||
71 | + | ||
72 | + /* Map at the left of the hole, can be NULL if "this" is first one */ | ||
73 | + const DMAMap *prev; | ||
74 | + | ||
75 | + /* Map at the right of the hole, can be NULL if "prev" is the last one */ | ||
76 | + const DMAMap *this; | ||
77 | + | ||
78 | + /* If found, we fill in the IOVA here */ | ||
79 | + hwaddr iova_result; | ||
80 | + | ||
81 | + /* Whether have we found a valid IOVA */ | ||
82 | + bool iova_found; | ||
83 | +}; | ||
84 | + | ||
85 | +/** | ||
86 | + * Iterate args to the next hole | ||
87 | + * | ||
88 | + * @args: The alloc arguments | ||
89 | + * @next: The next mapping in the tree. Can be NULL to signal the last one | ||
90 | + */ | ||
91 | +static void iova_tree_alloc_args_iterate(struct IOVATreeAllocArgs *args, | ||
92 | + const DMAMap *next) | ||
93 | +{ | ||
94 | + args->prev = args->this; | ||
95 | + args->this = next; | ||
96 | +} | ||
97 | + | ||
98 | static int iova_tree_compare(gconstpointer a, gconstpointer b, gpointer data) | ||
99 | { | ||
100 | const DMAMap *m1 = a, *m2 = b; | ||
101 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map) | ||
102 | return IOVA_OK; | ||
46 | } | 103 | } |
47 | 104 | ||
48 | static ssize_t rtl8139_do_receive(NetClientState *nc, const uint8_t *buf, size_t size_, int do_interrupt) | 105 | +/** |
106 | + * Try to find an unallocated IOVA range between prev and this elements. | ||
107 | + * | ||
108 | + * @args: Arguments to allocation | ||
109 | + * | ||
110 | + * Cases: | ||
111 | + * | ||
112 | + * (1) !prev, !this: No entries allocated, always succeed | ||
113 | + * | ||
114 | + * (2) !prev, this: We're iterating at the 1st element. | ||
115 | + * | ||
116 | + * (3) prev, !this: We're iterating at the last element. | ||
117 | + * | ||
118 | + * (4) prev, this: this is the most common case, we'll try to find a hole | ||
119 | + * between "prev" and "this" mapping. | ||
120 | + * | ||
121 | + * Note that this function assumes the last valid iova is HWADDR_MAX, but it | ||
122 | + * searches linearly so it's easy to discard the result if it's not the case. | ||
123 | + */ | ||
124 | +static void iova_tree_alloc_map_in_hole(struct IOVATreeAllocArgs *args) | ||
125 | +{ | ||
126 | + const DMAMap *prev = args->prev, *this = args->this; | ||
127 | + uint64_t hole_start, hole_last; | ||
128 | + | ||
129 | + if (this && this->iova + this->size < args->iova_begin) { | ||
130 | + return; | ||
131 | + } | ||
132 | + | ||
133 | + hole_start = MAX(prev ? prev->iova + prev->size + 1 : 0, args->iova_begin); | ||
134 | + hole_last = this ? this->iova : HWADDR_MAX; | ||
135 | + | ||
136 | + if (hole_last - hole_start > args->new_size) { | ||
137 | + args->iova_result = hole_start; | ||
138 | + args->iova_found = true; | ||
139 | + } | ||
140 | +} | ||
141 | + | ||
142 | +/** | ||
143 | + * Foreach dma node in the tree, compare if there is a hole with its previous | ||
144 | + * node (or minimum iova address allowed) and the node. | ||
145 | + * | ||
146 | + * @key: Node iterating | ||
147 | + * @value: Node iterating | ||
148 | + * @pargs: Struct to communicate with the outside world | ||
149 | + * | ||
150 | + * Return: false to keep iterating, true if needs break. | ||
151 | + */ | ||
152 | +static gboolean iova_tree_alloc_traverse(gpointer key, gpointer value, | ||
153 | + gpointer pargs) | ||
154 | +{ | ||
155 | + struct IOVATreeAllocArgs *args = pargs; | ||
156 | + DMAMap *node = value; | ||
157 | + | ||
158 | + assert(key == value); | ||
159 | + | ||
160 | + iova_tree_alloc_args_iterate(args, node); | ||
161 | + iova_tree_alloc_map_in_hole(args); | ||
162 | + return args->iova_found; | ||
163 | +} | ||
164 | + | ||
165 | +int iova_tree_alloc_map(IOVATree *tree, DMAMap *map, hwaddr iova_begin, | ||
166 | + hwaddr iova_last) | ||
167 | +{ | ||
168 | + struct IOVATreeAllocArgs args = { | ||
169 | + .new_size = map->size, | ||
170 | + .iova_begin = iova_begin, | ||
171 | + }; | ||
172 | + | ||
173 | + if (unlikely(iova_last < iova_begin)) { | ||
174 | + return IOVA_ERR_INVALID; | ||
175 | + } | ||
176 | + | ||
177 | + /* | ||
178 | + * Find a valid hole for the mapping | ||
179 | + * | ||
180 | + * Assuming low iova_begin, so no need to do a binary search to | ||
181 | + * locate the first node. | ||
182 | + * | ||
183 | + * TODO: Replace all this with g_tree_node_first/next/last when available | ||
184 | + * (from glib since 2.68). To do it with g_tree_foreach complicates the | ||
185 | + * code a lot. | ||
186 | + * | ||
187 | + */ | ||
188 | + g_tree_foreach(tree->tree, iova_tree_alloc_traverse, &args); | ||
189 | + if (!args.iova_found) { | ||
190 | + /* | ||
191 | + * Either tree is empty or the last hole is still not checked. | ||
192 | + * g_tree_foreach does not compare (last, iova_last] range, so we check | ||
193 | + * it here. | ||
194 | + */ | ||
195 | + iova_tree_alloc_args_iterate(&args, NULL); | ||
196 | + iova_tree_alloc_map_in_hole(&args); | ||
197 | + } | ||
198 | + | ||
199 | + if (!args.iova_found || args.iova_result + map->size > iova_last) { | ||
200 | + return IOVA_ERR_NOMEM; | ||
201 | + } | ||
202 | + | ||
203 | + map->iova = args.iova_result; | ||
204 | + return iova_tree_insert(tree, map); | ||
205 | +} | ||
206 | + | ||
207 | void iova_tree_destroy(IOVATree *tree) | ||
208 | { | ||
209 | g_tree_destroy(tree->tree); | ||
49 | -- | 210 | -- |
50 | 2.5.0 | 211 | 2.7.4 |
51 | 212 | ||
52 | 213 | diff view generated by jsdifflib |
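As a reader aid, not part of the series: a minimal usage sketch of the allocator added above. The helper name reserve_iova and the 0x1000 lower bound are illustrative assumptions; only iova_tree_new(), iova_tree_alloc_map() and the DMAMap fields shown in the patch are assumed.

    #include "qemu/osdep.h"
    #include "qemu/iova-tree.h"

    /* Hypothetical helper: reserve an IOVA range for a host buffer. */
    static int reserve_iova(IOVATree *tree, void *host, size_t len, hwaddr *iova)
    {
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)host,
            .size = len - 1,                 /* DMAMap sizes are inclusive */
            .perm = IOMMU_RW,
        };
        int r = iova_tree_alloc_map(tree, &map, 0x1000, HWADDR_MAX);

        if (r != IOVA_OK) {
            return r;                        /* e.g. IOVA_ERR_NOMEM: no hole big enough */
        }
        *iova = map.iova;                    /* the allocator filled in the assigned iova */
        return IOVA_OK;
    }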
1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The e1000e_can_receive() function simply returns a boolean value. | 3 | This function does the reverse operation of iova_tree_find: To look for |
4 | a mapping that matches a translated address so we can do the reverse.
4 | 5 | ||
5 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 6 | This has linear complexity instead of logarithmic, but it supports
6 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | 7 | overlapping HVA. Future developments could reduce it. |
7 | Reviewed-by: Paolo Bonzini <pbonzini@redhat.com> | 8 | |
9 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
10 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
8 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 11 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
9 | --- | 12 | --- |
10 | hw/net/e1000e_core.c | 2 +- | 13 | include/qemu/iova-tree.h | 20 +++++++++++++++++++- |
11 | hw/net/e1000e_core.h | 2 +- | 14 | util/iova-tree.c | 34 ++++++++++++++++++++++++++++++++++ |
12 | 2 files changed, 2 insertions(+), 2 deletions(-) | 15 | 2 files changed, 53 insertions(+), 1 deletion(-) |
13 | 16 | ||
14 | diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c | 17 | diff --git a/include/qemu/iova-tree.h b/include/qemu/iova-tree.h |
15 | index XXXXXXX..XXXXXXX 100644 | 18 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/net/e1000e_core.c | 19 | --- a/include/qemu/iova-tree.h |
17 | +++ b/hw/net/e1000e_core.c | 20 | +++ b/include/qemu/iova-tree.h |
18 | @@ -XXX,XX +XXX,XX @@ e1000e_start_recv(E1000ECore *core) | 21 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); |
19 | } | 22 | * @tree: the iova tree to search from |
23 | * @map: the mapping to search | ||
24 | * | ||
25 | - * Search for a mapping in the iova tree that overlaps with the | ||
26 | + * Search for a mapping in the iova tree that iova overlaps with the | ||
27 | * mapping range specified. Only the first found mapping will be | ||
28 | * returned. | ||
29 | * | ||
30 | @@ -XXX,XX +XXX,XX @@ int iova_tree_remove(IOVATree *tree, const DMAMap *map); | ||
31 | const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map); | ||
32 | |||
33 | /** | ||
34 | + * iova_tree_find_iova: | ||
35 | + * | ||
36 | + * @tree: the iova tree to search from | ||
37 | + * @map: the mapping to search | ||
38 | + * | ||
39 | + * Search for a mapping in the iova tree that translated_addr overlaps with the | ||
40 | + * mapping range specified. Only the first found mapping will be | ||
41 | + * returned. | ||
42 | + * | ||
43 | + * Return: DMAMap pointer if found, or NULL if not found. Note that | ||
44 | + * the returned DMAMap pointer is maintained internally. User should | ||
45 | + * only read the content but never modify or free the content. Also, | ||
46 | + * user is responsible to make sure the pointer is valid (say, no | ||
47 | + * concurrent deletion in progress). | ||
48 | + */ | ||
49 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map); | ||
50 | + | ||
51 | +/** | ||
52 | * iova_tree_find_address: | ||
53 | * | ||
54 | * @tree: the iova tree to search from | ||
55 | diff --git a/util/iova-tree.c b/util/iova-tree.c | ||
56 | index XXXXXXX..XXXXXXX 100644 | ||
57 | --- a/util/iova-tree.c | ||
58 | +++ b/util/iova-tree.c | ||
59 | @@ -XXX,XX +XXX,XX @@ struct IOVATreeAllocArgs { | ||
60 | bool iova_found; | ||
61 | }; | ||
62 | |||
63 | +typedef struct IOVATreeFindIOVAArgs { | ||
64 | + const DMAMap *needle; | ||
65 | + const DMAMap *result; | ||
66 | +} IOVATreeFindIOVAArgs; | ||
67 | + | ||
68 | /** | ||
69 | * Iterate args to the next hole | ||
70 | * | ||
71 | @@ -XXX,XX +XXX,XX @@ const DMAMap *iova_tree_find(const IOVATree *tree, const DMAMap *map) | ||
72 | return g_tree_lookup(tree->tree, map); | ||
20 | } | 73 | } |
21 | 74 | ||
22 | -int | 75 | +static gboolean iova_tree_find_address_iterator(gpointer key, gpointer value, |
23 | +bool | 76 | + gpointer data) |
24 | e1000e_can_receive(E1000ECore *core) | 77 | +{ |
78 | + const DMAMap *map = key; | ||
79 | + IOVATreeFindIOVAArgs *args = data; | ||
80 | + const DMAMap *needle; | ||
81 | + | ||
82 | + g_assert(key == value); | ||
83 | + | ||
84 | + needle = args->needle; | ||
85 | + if (map->translated_addr + map->size < needle->translated_addr || | ||
86 | + needle->translated_addr + needle->size < map->translated_addr) { | ||
87 | + return false; | ||
88 | + } | ||
89 | + | ||
90 | + args->result = map; | ||
91 | + return true; | ||
92 | +} | ||
93 | + | ||
94 | +const DMAMap *iova_tree_find_iova(const IOVATree *tree, const DMAMap *map) | ||
95 | +{ | ||
96 | + IOVATreeFindIOVAArgs args = { | ||
97 | + .needle = map, | ||
98 | + }; | ||
99 | + | ||
100 | + g_tree_foreach(tree->tree, iova_tree_find_address_iterator, &args); | ||
101 | + return args.result; | ||
102 | +} | ||
103 | + | ||
104 | const DMAMap *iova_tree_find_address(const IOVATree *tree, hwaddr iova) | ||
25 | { | 105 | { |
26 | int i; | 106 | const DMAMap map = { .iova = iova, .size = 0 }; |
27 | diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h | ||
28 | index XXXXXXX..XXXXXXX 100644 | ||
29 | --- a/hw/net/e1000e_core.h | ||
30 | +++ b/hw/net/e1000e_core.h | ||
31 | @@ -XXX,XX +XXX,XX @@ e1000e_core_set_link_status(E1000ECore *core); | ||
32 | void | ||
33 | e1000e_core_pci_uninit(E1000ECore *core); | ||
34 | |||
35 | -int | ||
36 | +bool | ||
37 | e1000e_can_receive(E1000ECore *core); | ||
38 | |||
39 | ssize_t | ||
40 | -- | 107 | -- |
41 | 2.5.0 | 108 | 2.7.4 |
42 | 109 | ||
43 | 110 | diff view generated by jsdifflib |
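Again as an aside rather than part of the series, a sketch of the reverse lookup this patch enables. The helper va_to_iova is hypothetical and assumes only iova_tree_find_iova() and the DMAMap layout above: the matching entry is found by translated_addr overlap and the IOVA is recovered from the offset inside the map.

    #include "qemu/osdep.h"
    #include "qemu/iova-tree.h"

    /* Hypothetical helper: translate a qemu VA back to its IOVA. */
    static bool va_to_iova(const IOVATree *tree, void *addr, hwaddr *iova)
    {
        const DMAMap needle = {
            .translated_addr = (hwaddr)(uintptr_t)addr,
            .size = 0,                       /* a single byte is enough to overlap */
        };
        const DMAMap *map = iova_tree_find_iova(tree, &needle);

        if (!map) {
            return false;                    /* address was never mapped */
        }
        *iova = map->iova + (needle.translated_addr - map->translated_addr);
        return true;
    }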
1 | From: Li Qiang <liq3ea@163.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The tulip networking card emulation has an OOB issue in | 3 | This tree is able to look for a translated address from an IOVA address. |
4 | 'tulip_copy_tx_buffers' when the guest provides a malformed descriptor. | 4 | |
5 | This test will trigger an ASAN heap overflow crash. To trigger this | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on
6 | issue we can construct the data as follows: | 6 | devices with limited IOVA space needs more capabilities, like allocating
7 | 4 | ||
8 | 1. construct a 'tulip_descriptor'. Its control is set to | 5 | At first glance it is similar to util/iova-tree. However, SVQ working on |
9 | '0x7ff | 0x7ff << 11', this will make the 'tulip_copy_tx_buffers's | 6 | devices with limited IOVA space need more capabilities, like allocating |
10 | 'len1' and 'len2' to 0x7ff(2047). So 'len1+len2' will overflow | 7 | IOVA chunks or performing reverse translations (qemu addresses to iova). |
11 | 'TULIPState's 'tx_frame' field. This descriptor's 'buf_addr1' and | ||
12 | 'buf_addr2' should be set to a guest address. | 12 | we can allocate shadow vqs vrings outside of it.
13 | 8 | ||
14 | 2. write this descriptor to tulip device's CSR4 register. This will | 9 | The allocation capability, as "assign a free IOVA address to this chunk |
15 | set the 'TULIPState's 'current_tx_desc' field. | 10 | of memory in qemu's address space" allows shadow virtqueue to create a |
11 | new address space that is not restricted by guest's addressable one, so | ||
12 | we can allocate shadow vqs vrings outside of it. | ||
16 | 13 | ||
17 | 3. write 'CSR6_ST' to tulip device's CSR6 register. This will trigger | 14 | It duplicates the tree so it can search efficiently in both directions, |
18 | 'tulip_xmit_list_update' and finally calls 'tulip_copy_tx_buffers'. | 15 | and it will signal overlap if iova or the translated address is present |
16 | in any tree. | ||
19 | 17 | ||
20 | Following shows the backtrack of crash: | 18 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
21 | 19 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | |
22 | ==31781==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x628000007cd0 at pc 0x7fe03c5a077a bp 0x7fff05b46770 sp 0x7fff05b45f18 | ||
23 | WRITE of size 2047 at 0x628000007cd0 thread T0 | ||
24 | #0 0x7fe03c5a0779 (/usr/lib/x86_64-linux-gnu/libasan.so.4+0x79779) | ||
25 | #1 0x5575fb6daa6a in flatview_read_continue /home/test/qemu/exec.c:3194 | ||
26 | #2 0x5575fb6daccb in flatview_read /home/test/qemu/exec.c:3227 | ||
27 | #3 0x5575fb6dae66 in address_space_read_full /home/test/qemu/exec.c:3240 | ||
28 | #4 0x5575fb6db0cb in address_space_rw /home/test/qemu/exec.c:3268 | ||
29 | #5 0x5575fbdfd460 in dma_memory_rw_relaxed /home/test/qemu/include/sysemu/dma.h:87 | ||
30 | #6 0x5575fbdfd4b5 in dma_memory_rw /home/test/qemu/include/sysemu/dma.h:110 | ||
31 | #7 0x5575fbdfd866 in pci_dma_rw /home/test/qemu/include/hw/pci/pci.h:787 | ||
32 | #8 0x5575fbdfd8a3 in pci_dma_read /home/test/qemu/include/hw/pci/pci.h:794 | ||
33 | #9 0x5575fbe02761 in tulip_copy_tx_buffers hw/net/tulip.c:585 | ||
34 | #10 0x5575fbe0366b in tulip_xmit_list_update hw/net/tulip.c:678 | ||
35 | #11 0x5575fbe04073 in tulip_write hw/net/tulip.c:783 | ||
36 | |||
37 | Signed-off-by: Li Qiang <liq3ea@163.com> | ||
38 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 20 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
39 | --- | 21 | --- |
40 | tests/qtest/Makefile.include | 1 + | 22 | hw/virtio/meson.build | 2 +- |
41 | tests/qtest/tulip-test.c | 91 ++++++++++++++++++++++++++++++++++++++++++++ | 23 | hw/virtio/vhost-iova-tree.c | 110 ++++++++++++++++++++++++++++++++++++++++++++ |
42 | 2 files changed, 92 insertions(+) | 24 | hw/virtio/vhost-iova-tree.h | 27 +++++++++++ |
43 | create mode 100644 tests/qtest/tulip-test.c | 25 | 3 files changed, 138 insertions(+), 1 deletion(-) |
26 | create mode 100644 hw/virtio/vhost-iova-tree.c | ||
27 | create mode 100644 hw/virtio/vhost-iova-tree.h | ||
44 | 28 | ||
45 | diff --git a/tests/qtest/Makefile.include b/tests/qtest/Makefile.include | 29 | diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build |
46 | index XXXXXXX..XXXXXXX 100644 | 30 | index XXXXXXX..XXXXXXX 100644 |
47 | --- a/tests/qtest/Makefile.include | 31 | --- a/hw/virtio/meson.build |
48 | +++ b/tests/qtest/Makefile.include | 32 | +++ b/hw/virtio/meson.build |
49 | @@ -XXX,XX +XXX,XX @@ qos-test-obj-y += tests/qtest/es1370-test.o | 33 | @@ -XXX,XX +XXX,XX @@ softmmu_ss.add(when: 'CONFIG_ALL', if_true: files('vhost-stub.c')) |
50 | qos-test-obj-y += tests/qtest/ipoctal232-test.o | 34 | |
51 | qos-test-obj-y += tests/qtest/megasas-test.o | 35 | virtio_ss = ss.source_set() |
52 | qos-test-obj-y += tests/qtest/ne2000-test.o | 36 | virtio_ss.add(files('virtio.c')) |
53 | +qos-test-obj-y += tests/qtest/tulip-test.o | 37 | -virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c')) |
54 | qos-test-obj-y += tests/qtest/nvme-test.o | 38 | +virtio_ss.add(when: 'CONFIG_VHOST', if_true: files('vhost.c', 'vhost-backend.c', 'vhost-shadow-virtqueue.c', 'vhost-iova-tree.c')) |
55 | qos-test-obj-y += tests/qtest/pca9552-test.o | 39 | virtio_ss.add(when: 'CONFIG_VHOST_USER', if_true: files('vhost-user.c')) |
56 | qos-test-obj-y += tests/qtest/pci-test.o | 40 | virtio_ss.add(when: 'CONFIG_VHOST_VDPA', if_true: files('vhost-vdpa.c')) |
57 | diff --git a/tests/qtest/tulip-test.c b/tests/qtest/tulip-test.c | 41 | virtio_ss.add(when: 'CONFIG_VIRTIO_BALLOON', if_true: files('virtio-balloon.c')) |
42 | diff --git a/hw/virtio/vhost-iova-tree.c b/hw/virtio/vhost-iova-tree.c | ||
58 | new file mode 100644 | 43 | new file mode 100644 |
59 | index XXXXXXX..XXXXXXX | 44 | index XXXXXXX..XXXXXXX |
60 | --- /dev/null | 45 | --- /dev/null |
61 | +++ b/tests/qtest/tulip-test.c | 46 | +++ b/hw/virtio/vhost-iova-tree.c |
62 | @@ -XXX,XX +XXX,XX @@ | 47 | @@ -XXX,XX +XXX,XX @@ |
63 | +/* | 48 | +/* |
64 | + * QTest testcase for DEC/Intel Tulip 21143 | 49 | + * vhost software live migration iova tree |
65 | + * | 50 | + * |
66 | + * Copyright (c) 2020 Li Qiang <liq3ea@gmail.com> | 51 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 |
52 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
67 | + * | 53 | + * |
68 | + * This work is licensed under the terms of the GNU GPL, version 2 or later. | 54 | + * SPDX-License-Identifier: GPL-2.0-or-later |
69 | + * See the COPYING file in the top-level directory. | ||
70 | + */ | 55 | + */ |
71 | + | 56 | + |
72 | +#include "qemu/osdep.h" | 57 | +#include "qemu/osdep.h" |
73 | +#include "libqtest.h" | 58 | +#include "qemu/iova-tree.h" |
74 | +#include "qemu/module.h" | 59 | +#include "vhost-iova-tree.h" |
75 | +#include "libqos/qgraph.h" | ||
76 | +#include "libqos/pci.h" | ||
77 | +#include "qemu/bitops.h" | ||
78 | +#include "hw/net/tulip.h" | ||
79 | + | 60 | + |
80 | +typedef struct QTulip_pci QTulip_pci; | 61 | +#define iova_min_addr qemu_real_host_page_size |
81 | + | 62 | + |
82 | +struct QTulip_pci { | 63 | +/** |
83 | + QOSGraphObject obj; | 64 | + * VhostIOVATree, able to: |
84 | + QPCIDevice dev; | 65 | + * - Translate iova address |
66 | + * - Reverse translate iova address (from translated to iova) | ||
67 | + * - Allocate IOVA regions for translated range (linear operation) | ||
68 | + */ | ||
69 | +struct VhostIOVATree { | ||
70 | + /* First addressable iova address in the device */ | ||
71 | + uint64_t iova_first; | ||
72 | + | ||
73 | + /* Last addressable iova address in the device */ | ||
74 | + uint64_t iova_last; | ||
75 | + | ||
76 | + /* IOVA address to qemu memory maps. */ | ||
77 | + IOVATree *iova_taddr_map; | ||
85 | +}; | 78 | +}; |
86 | + | 79 | + |
87 | +static void *tulip_pci_get_driver(void *obj, const char *interface) | 80 | +/** |
81 | + * Create a new IOVA tree | ||
82 | + * | ||
83 | + * Returns the new IOVA tree | ||
84 | + */ | ||
85 | +VhostIOVATree *vhost_iova_tree_new(hwaddr iova_first, hwaddr iova_last) | ||
88 | +{ | 86 | +{ |
89 | + QTulip_pci *tulip_pci = obj; | 87 | + VhostIOVATree *tree = g_new(VhostIOVATree, 1); |
90 | + | 88 | + |
91 | + if (!g_strcmp0(interface, "pci-device")) { | 89 | + /* Some devices do not like 0 addresses */ |
92 | + return &tulip_pci->dev; | 90 | + tree->iova_first = MAX(iova_first, iova_min_addr); |
91 | + tree->iova_last = iova_last; | ||
92 | + | ||
93 | + tree->iova_taddr_map = iova_tree_new(); | ||
94 | + return tree; | ||
95 | +} | ||
96 | + | ||
97 | +/** | ||
98 | + * Delete an iova tree | ||
99 | + */ | ||
100 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree) | ||
101 | +{ | ||
102 | + iova_tree_destroy(iova_tree->iova_taddr_map); | ||
103 | + g_free(iova_tree); | ||
104 | +} | ||
105 | + | ||
106 | +/** | ||
107 | + * Find the IOVA address stored from a memory address | ||
108 | + * | ||
109 | + * @tree: The iova tree | ||
110 | + * @map: The map with the memory address | ||
111 | + * | ||
112 | + * Return the stored mapping, or NULL if not found. | ||
113 | + */ | ||
114 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *tree, | ||
115 | + const DMAMap *map) | ||
116 | +{ | ||
117 | + return iova_tree_find_iova(tree->iova_taddr_map, map); | ||
118 | +} | ||
119 | + | ||
120 | +/** | ||
121 | + * Allocate a new mapping | ||
122 | + * | ||
123 | + * @tree: The iova tree | ||
124 | + * @map: The iova map | ||
125 | + * | ||
126 | + * Returns: | ||
127 | + * - IOVA_OK if the map fits in the container | ||
128 | + * - IOVA_ERR_INVALID if the map does not make sense (like size overflow) | ||
129 | + * - IOVA_ERR_NOMEM if tree cannot allocate more space. | ||
130 | + * | ||
131 | + * It returns assignated iova in map->iova if return value is VHOST_DMA_MAP_OK. | ||
132 | + */ | ||
133 | +int vhost_iova_tree_map_alloc(VhostIOVATree *tree, DMAMap *map) | ||
134 | +{ | ||
135 | + /* Some vhost devices do not like addr 0. Skip first page */ | ||
136 | + hwaddr iova_first = tree->iova_first ?: qemu_real_host_page_size; | ||
137 | + | ||
138 | + if (map->translated_addr + map->size < map->translated_addr || | ||
139 | + map->perm == IOMMU_NONE) { | ||
140 | + return IOVA_ERR_INVALID; | ||
93 | + } | 141 | + } |
94 | + | 142 | + |
95 | + fprintf(stderr, "%s not present in tulip_pci\n", interface); | 143 | + /* Allocate a node in IOVA address */ |
96 | + g_assert_not_reached(); | 144 | + return iova_tree_alloc_map(tree->iova_taddr_map, map, iova_first, |
145 | + tree->iova_last); | ||
97 | +} | 146 | +} |
98 | + | 147 | + |
99 | +static void *tulip_pci_create(void *pci_bus, QGuestAllocator *alloc, void *addr) | 148 | +/** |
149 | + * Remove existing mappings from iova tree | ||
150 | + * | ||
151 | + * @iova_tree: The vhost iova tree | ||
152 | + * @map: The map to remove | ||
153 | + */ | ||
154 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map) | ||
100 | +{ | 155 | +{ |
101 | + QTulip_pci *tulip_pci = g_new0(QTulip_pci, 1); | 156 | + iova_tree_remove(iova_tree->iova_taddr_map, map); |
102 | + QPCIBus *bus = pci_bus; | 157 | +} |
158 | diff --git a/hw/virtio/vhost-iova-tree.h b/hw/virtio/vhost-iova-tree.h | ||
159 | new file mode 100644 | ||
160 | index XXXXXXX..XXXXXXX | ||
161 | --- /dev/null | ||
162 | +++ b/hw/virtio/vhost-iova-tree.h | ||
163 | @@ -XXX,XX +XXX,XX @@ | ||
164 | +/* | ||
165 | + * vhost software live migration iova tree | ||
166 | + * | ||
167 | + * SPDX-FileCopyrightText: Red Hat, Inc. 2021 | ||
168 | + * SPDX-FileContributor: Author: Eugenio Pérez <eperezma@redhat.com> | ||
169 | + * | ||
170 | + * SPDX-License-Identifier: GPL-2.0-or-later | ||
171 | + */ | ||
103 | + | 172 | + |
104 | + qpci_device_init(&tulip_pci->dev, bus, addr); | 173 | +#ifndef HW_VIRTIO_VHOST_IOVA_TREE_H |
105 | + tulip_pci->obj.get_driver = tulip_pci_get_driver; | 174 | +#define HW_VIRTIO_VHOST_IOVA_TREE_H |
106 | + | 175 | + |
107 | + return &tulip_pci->obj; | 176 | +#include "qemu/iova-tree.h" |
108 | +} | 177 | +#include "exec/memory.h" |
109 | + | 178 | + |
110 | +static void tulip_large_tx(void *obj, void *data, QGuestAllocator *alloc) | 179 | +typedef struct VhostIOVATree VhostIOVATree; |
111 | +{ | ||
112 | + QTulip_pci *tulip_pci = obj; | ||
113 | + QPCIDevice *dev = &tulip_pci->dev; | ||
114 | + QPCIBar bar; | ||
115 | + struct tulip_descriptor context; | ||
116 | + char guest_data[4096]; | ||
117 | + uint64_t context_pa; | ||
118 | + uint64_t guest_pa; | ||
119 | + | 180 | + |
120 | + qpci_device_enable(dev); | 181 | +VhostIOVATree *vhost_iova_tree_new(uint64_t iova_first, uint64_t iova_last); |
121 | + bar = qpci_iomap(dev, 0, NULL); | 182 | +void vhost_iova_tree_delete(VhostIOVATree *iova_tree); |
122 | + context_pa = guest_alloc(alloc, sizeof(context)); | 183 | +G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostIOVATree, vhost_iova_tree_delete); |
123 | + guest_pa = guest_alloc(alloc, 4096); | ||
124 | + memset(guest_data, 'A', sizeof(guest_data)); | ||
125 | + context.status = TDES0_OWN; | ||
126 | + context.control = TDES1_BUF2_SIZE_MASK << TDES1_BUF2_SIZE_SHIFT | | ||
127 | + TDES1_BUF1_SIZE_MASK << TDES1_BUF1_SIZE_SHIFT; | ||
128 | + context.buf_addr2 = guest_pa; | ||
129 | + context.buf_addr1 = guest_pa; | ||
130 | + | 184 | + |
131 | + qtest_memwrite(dev->bus->qts, context_pa, &context, sizeof(context)); | 185 | +const DMAMap *vhost_iova_tree_find_iova(const VhostIOVATree *iova_tree, |
132 | + qtest_memwrite(dev->bus->qts, guest_pa, guest_data, sizeof(guest_data)); | 186 | + const DMAMap *map); |
133 | + qpci_io_writel(dev, bar, 0x20, context_pa); | 187 | +int vhost_iova_tree_map_alloc(VhostIOVATree *iova_tree, DMAMap *map); |
134 | + qpci_io_writel(dev, bar, 0x30, CSR6_ST); | 188 | +void vhost_iova_tree_remove(VhostIOVATree *iova_tree, const DMAMap *map); |
135 | + guest_free(alloc, context_pa); | ||
136 | + guest_free(alloc, guest_pa); | ||
137 | +} | ||
138 | + | 189 | + |
139 | +static void tulip_register_nodes(void) | 190 | +#endif |
140 | +{ | ||
141 | + QOSGraphEdgeOptions opts = { | ||
142 | + .extra_device_opts = "addr=04.0", | ||
143 | + }; | ||
144 | + add_qpci_address(&opts, &(QPCIAddress) { .devfn = QPCI_DEVFN(4, 0) }); | ||
145 | + | ||
146 | + qos_node_create_driver("tulip", tulip_pci_create); | ||
147 | + qos_node_consumes("tulip", "pci-bus", &opts); | ||
148 | + qos_node_produces("tulip", "pci-device"); | ||
149 | + | ||
150 | + qos_add_test("tulip_large_tx", "tulip", tulip_large_tx, NULL); | ||
151 | +} | ||
152 | + | ||
153 | +libqos_init(tulip_register_nodes); | ||
154 | -- | 191 | -- |
155 | 2.5.0 | 192 | 2.7.4 |
156 | 193 | ||
157 | 194 | diff view generated by jsdifflib |
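For orientation only, a hedged sketch of a round trip over the new wrapper; the function and its arguments are hypothetical and only the vhost-iova-tree.h API introduced above is assumed: allocate an IOVA for a qemu buffer, translate it back, then drop the entry.

    #include "qemu/osdep.h"
    #include "hw/virtio/vhost-iova-tree.h"

    /* Hypothetical round trip: map a qemu buffer, look it up, unmap it. */
    static void vhost_iova_tree_example(void *buf, size_t len)
    {
        VhostIOVATree *t = vhost_iova_tree_new(0, HWADDR_MAX);  /* device range, illustrative */
        DMAMap map = {
            .translated_addr = (hwaddr)(uintptr_t)buf,
            .size = len - 1,
            .perm = IOMMU_RW,
        };

        if (vhost_iova_tree_map_alloc(t, &map) == IOVA_OK) {
            /* map.iova now holds the address to program into the device */
            const DMAMap *found = vhost_iova_tree_find_iova(t, &map);
            g_assert(found && found->iova == map.iova);          /* reverse lookup agrees */
            vhost_iova_tree_remove(t, &map);
        }
        vhost_iova_tree_delete(t);
    }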
1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The CanBusClientInfo::can_receive handler returns whether the | 3 | Use translations added in VhostIOVATree in SVQ.
4 | device can or can not receive new frames. Make it obvious by | 4 | |
5 | returning a boolean type. | 5 | Only introduce usage here, not allocation and deallocation. As with |
6 | 6 | previous patches, we use the dead code paths of shadow_vqs_enabled to | |
7 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 7 | avoid committing too many changes at once. These are impossible to take
8 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | 8 | at the moment. |
9 | Reviewed-by: Cédric Le Goater <clg@kaod.org> | 9 | |
10 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
11 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
10 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 12 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
11 | --- | 13 | --- |
12 | hw/net/allwinner-sun8i-emac.c | 2 +- | 14 | hw/virtio/vhost-shadow-virtqueue.c | 86 +++++++++++++++++++++++--- |
13 | hw/net/can/can_sja1000.c | 8 ++++---- | 15 | hw/virtio/vhost-shadow-virtqueue.h | 6 +- |
14 | hw/net/can/can_sja1000.h | 2 +- | 16 | hw/virtio/vhost-vdpa.c | 122 +++++++++++++++++++++++++++++++------ |
15 | include/net/can_emu.h | 2 +- | 17 | include/hw/virtio/vhost-vdpa.h | 3 + |
16 | net/can/can_socketcan.c | 4 ++-- | 18 | 4 files changed, 187 insertions(+), 30 deletions(-) |
17 | 5 files changed, 9 insertions(+), 9 deletions(-) | 19 | |
18 | 20 | diff --git a/hw/virtio/vhost-shadow-virtqueue.c b/hw/virtio/vhost-shadow-virtqueue.c | |
19 | diff --git a/hw/net/allwinner-sun8i-emac.c b/hw/net/allwinner-sun8i-emac.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/hw/net/allwinner-sun8i-emac.c | 22 | --- a/hw/virtio/vhost-shadow-virtqueue.c |
22 | +++ b/hw/net/allwinner-sun8i-emac.c | 23 | +++ b/hw/virtio/vhost-shadow-virtqueue.c |
23 | @@ -XXX,XX +XXX,XX @@ static void allwinner_sun8i_emac_flush_desc(FrameDescriptor *desc, | 24 | @@ -XXX,XX +XXX,XX @@ static uint16_t vhost_svq_available_slots(const VhostShadowVirtqueue *svq) |
24 | cpu_physical_memory_write(phys_addr, desc, sizeof(*desc)); | 25 | return svq->vring.num - (svq->shadow_avail_idx - svq->shadow_used_idx); |
25 | } | 26 | } |
26 | 27 | ||
27 | -static int allwinner_sun8i_emac_can_receive(NetClientState *nc) | 28 | -static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, |
28 | +static bool allwinner_sun8i_emac_can_receive(NetClientState *nc) | 29 | +/** |
29 | { | 30 | + * Translate addresses between the qemu's virtual address and the SVQ IOVA |
30 | AwSun8iEmacState *s = qemu_get_nic_opaque(nc); | 31 | + * |
31 | FrameDescriptor desc; | 32 | + * @svq: Shadow VirtQueue |
32 | diff --git a/hw/net/can/can_sja1000.c b/hw/net/can/can_sja1000.c | 33 | + * @vaddr: Translated IOVA addresses |
34 | + * @iovec: Source qemu's VA addresses | ||
35 | + * @num: Length of iovec and minimum length of vaddr | ||
36 | + */ | ||
37 | +static bool vhost_svq_translate_addr(const VhostShadowVirtqueue *svq, | ||
38 | + hwaddr *addrs, const struct iovec *iovec, | ||
39 | + size_t num) | ||
40 | +{ | ||
41 | + if (num == 0) { | ||
42 | + return true; | ||
43 | + } | ||
44 | + | ||
45 | + for (size_t i = 0; i < num; ++i) { | ||
46 | + DMAMap needle = { | ||
47 | + .translated_addr = (hwaddr)(uintptr_t)iovec[i].iov_base, | ||
48 | + .size = iovec[i].iov_len, | ||
49 | + }; | ||
50 | + Int128 needle_last, map_last; | ||
51 | + size_t off; | ||
52 | + | ||
53 | + const DMAMap *map = vhost_iova_tree_find_iova(svq->iova_tree, &needle); | ||
54 | + /* | ||
55 | + * Map cannot be NULL since iova map contains all guest space and | ||
56 | + * qemu already has a physical address mapped | ||
57 | + */ | ||
58 | + if (unlikely(!map)) { | ||
59 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
60 | + "Invalid address 0x%"HWADDR_PRIx" given by guest", | ||
61 | + needle.translated_addr); | ||
62 | + return false; | ||
63 | + } | ||
64 | + | ||
65 | + off = needle.translated_addr - map->translated_addr; | ||
66 | + addrs[i] = map->iova + off; | ||
67 | + | ||
68 | + needle_last = int128_add(int128_make64(needle.translated_addr), | ||
69 | + int128_make64(iovec[i].iov_len)); | ||
70 | + map_last = int128_make64(map->translated_addr + map->size); | ||
71 | + if (unlikely(int128_gt(needle_last, map_last))) { | ||
72 | + qemu_log_mask(LOG_GUEST_ERROR, | ||
73 | + "Guest buffer expands over iova range"); | ||
74 | + return false; | ||
75 | + } | ||
76 | + } | ||
77 | + | ||
78 | + return true; | ||
79 | +} | ||
80 | + | ||
81 | +static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, hwaddr *sg, | ||
82 | const struct iovec *iovec, size_t num, | ||
83 | bool more_descs, bool write) | ||
84 | { | ||
85 | @@ -XXX,XX +XXX,XX @@ static void vhost_vring_write_descs(VhostShadowVirtqueue *svq, | ||
86 | } else { | ||
87 | descs[i].flags = flags; | ||
88 | } | ||
89 | - descs[i].addr = cpu_to_le64((hwaddr)(intptr_t)iovec[n].iov_base); | ||
90 | + descs[i].addr = cpu_to_le64(sg[n]); | ||
91 | descs[i].len = cpu_to_le32(iovec[n].iov_len); | ||
92 | |||
93 | last = i; | ||
94 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
95 | { | ||
96 | unsigned avail_idx; | ||
97 | vring_avail_t *avail = svq->vring.avail; | ||
98 | + bool ok; | ||
99 | + g_autofree hwaddr *sgs = g_new(hwaddr, MAX(elem->out_num, elem->in_num)); | ||
100 | |||
101 | *head = svq->free_head; | ||
102 | |||
103 | @@ -XXX,XX +XXX,XX @@ static bool vhost_svq_add_split(VhostShadowVirtqueue *svq, | ||
104 | return false; | ||
105 | } | ||
106 | |||
107 | - vhost_vring_write_descs(svq, elem->out_sg, elem->out_num, elem->in_num > 0, | ||
108 | - false); | ||
109 | - vhost_vring_write_descs(svq, elem->in_sg, elem->in_num, false, true); | ||
110 | + ok = vhost_svq_translate_addr(svq, sgs, elem->out_sg, elem->out_num); | ||
111 | + if (unlikely(!ok)) { | ||
112 | + return false; | ||
113 | + } | ||
114 | + vhost_vring_write_descs(svq, sgs, elem->out_sg, elem->out_num, | ||
115 | + elem->in_num > 0, false); | ||
116 | + | ||
117 | + | ||
118 | + ok = vhost_svq_translate_addr(svq, sgs, elem->in_sg, elem->in_num); | ||
119 | + if (unlikely(!ok)) { | ||
120 | + return false; | ||
121 | + } | ||
122 | + | ||
123 | + vhost_vring_write_descs(svq, sgs, elem->in_sg, elem->in_num, false, true); | ||
124 | |||
125 | /* | ||
126 | * Put the entry in the available array (but don't update avail->idx until | ||
127 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_set_svq_call_fd(VhostShadowVirtqueue *svq, int call_fd) | ||
128 | void vhost_svq_get_vring_addr(const VhostShadowVirtqueue *svq, | ||
129 | struct vhost_vring_addr *addr) | ||
130 | { | ||
131 | - addr->desc_user_addr = (uint64_t)(intptr_t)svq->vring.desc; | ||
132 | - addr->avail_user_addr = (uint64_t)(intptr_t)svq->vring.avail; | ||
133 | - addr->used_user_addr = (uint64_t)(intptr_t)svq->vring.used; | ||
134 | + addr->desc_user_addr = (uint64_t)(uintptr_t)svq->vring.desc; | ||
135 | + addr->avail_user_addr = (uint64_t)(uintptr_t)svq->vring.avail; | ||
136 | + addr->used_user_addr = (uint64_t)(uintptr_t)svq->vring.used; | ||
137 | } | ||
138 | |||
139 | size_t vhost_svq_driver_area_size(const VhostShadowVirtqueue *svq) | ||
140 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_stop(VhostShadowVirtqueue *svq) | ||
141 | * Creates vhost shadow virtqueue, and instructs the vhost device to use the | ||
142 | * shadow methods and file descriptors. | ||
143 | * | ||
144 | + * @iova_tree: Tree to perform descriptors translations | ||
145 | + * | ||
146 | * Returns the new virtqueue or NULL. | ||
147 | * | ||
148 | * In case of error, reason is reported through error_report. | ||
149 | */ | ||
150 | -VhostShadowVirtqueue *vhost_svq_new(void) | ||
151 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree) | ||
152 | { | ||
153 | g_autofree VhostShadowVirtqueue *svq = g_new0(VhostShadowVirtqueue, 1); | ||
154 | int r; | ||
155 | @@ -XXX,XX +XXX,XX @@ VhostShadowVirtqueue *vhost_svq_new(void) | ||
156 | |||
157 | event_notifier_init_fd(&svq->svq_kick, VHOST_FILE_UNBIND); | ||
158 | event_notifier_set_handler(&svq->hdev_call, vhost_svq_handle_call); | ||
159 | + svq->iova_tree = iova_tree; | ||
160 | return g_steal_pointer(&svq); | ||
161 | |||
162 | err_init_hdev_call: | ||
163 | diff --git a/hw/virtio/vhost-shadow-virtqueue.h b/hw/virtio/vhost-shadow-virtqueue.h | ||
33 | index XXXXXXX..XXXXXXX 100644 | 164 | index XXXXXXX..XXXXXXX 100644 |
34 | --- a/hw/net/can/can_sja1000.c | 165 | --- a/hw/virtio/vhost-shadow-virtqueue.h |
35 | +++ b/hw/net/can/can_sja1000.c | 166 | +++ b/hw/virtio/vhost-shadow-virtqueue.h |
36 | @@ -XXX,XX +XXX,XX @@ uint64_t can_sja_mem_read(CanSJA1000State *s, hwaddr addr, unsigned size) | 167 | @@ -XXX,XX +XXX,XX @@ |
37 | return temp; | 168 | #include "qemu/event_notifier.h" |
169 | #include "hw/virtio/virtio.h" | ||
170 | #include "standard-headers/linux/vhost_types.h" | ||
171 | +#include "hw/virtio/vhost-iova-tree.h" | ||
172 | |||
173 | /* Shadow virtqueue to relay notifications */ | ||
174 | typedef struct VhostShadowVirtqueue { | ||
175 | @@ -XXX,XX +XXX,XX @@ typedef struct VhostShadowVirtqueue { | ||
176 | /* Virtio device */ | ||
177 | VirtIODevice *vdev; | ||
178 | |||
179 | + /* IOVA mapping */ | ||
180 | + VhostIOVATree *iova_tree; | ||
181 | + | ||
182 | /* Map for use the guest's descriptors */ | ||
183 | VirtQueueElement **ring_id_maps; | ||
184 | |||
185 | @@ -XXX,XX +XXX,XX @@ void vhost_svq_start(VhostShadowVirtqueue *svq, VirtIODevice *vdev, | ||
186 | VirtQueue *vq); | ||
187 | void vhost_svq_stop(VhostShadowVirtqueue *svq); | ||
188 | |||
189 | -VhostShadowVirtqueue *vhost_svq_new(void); | ||
190 | +VhostShadowVirtqueue *vhost_svq_new(VhostIOVATree *iova_tree); | ||
191 | |||
192 | void vhost_svq_free(gpointer vq); | ||
193 | G_DEFINE_AUTOPTR_CLEANUP_FUNC(VhostShadowVirtqueue, vhost_svq_free); | ||
194 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c | ||
195 | index XXXXXXX..XXXXXXX 100644 | ||
196 | --- a/hw/virtio/vhost-vdpa.c | ||
197 | +++ b/hw/virtio/vhost-vdpa.c | ||
198 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_add(MemoryListener *listener, | ||
199 | vaddr, section->readonly); | ||
200 | |||
201 | llsize = int128_sub(llend, int128_make64(iova)); | ||
202 | + if (v->shadow_vqs_enabled) { | ||
203 | + DMAMap mem_region = { | ||
204 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
205 | + .size = int128_get64(llsize) - 1, | ||
206 | + .perm = IOMMU_ACCESS_FLAG(true, section->readonly), | ||
207 | + }; | ||
208 | + | ||
209 | + int r = vhost_iova_tree_map_alloc(v->iova_tree, &mem_region); | ||
210 | + if (unlikely(r != IOVA_OK)) { | ||
211 | + error_report("Can't allocate a mapping (%d)", r); | ||
212 | + goto fail; | ||
213 | + } | ||
214 | + | ||
215 | + iova = mem_region.iova; | ||
216 | + } | ||
217 | |||
218 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
219 | ret = vhost_vdpa_dma_map(v, iova, int128_get64(llsize), | ||
220 | @@ -XXX,XX +XXX,XX @@ static void vhost_vdpa_listener_region_del(MemoryListener *listener, | ||
221 | |||
222 | llsize = int128_sub(llend, int128_make64(iova)); | ||
223 | |||
224 | + if (v->shadow_vqs_enabled) { | ||
225 | + const DMAMap *result; | ||
226 | + const void *vaddr = memory_region_get_ram_ptr(section->mr) + | ||
227 | + section->offset_within_region + | ||
228 | + (iova - section->offset_within_address_space); | ||
229 | + DMAMap mem_region = { | ||
230 | + .translated_addr = (hwaddr)(uintptr_t)vaddr, | ||
231 | + .size = int128_get64(llsize) - 1, | ||
232 | + }; | ||
233 | + | ||
234 | + result = vhost_iova_tree_find_iova(v->iova_tree, &mem_region); | ||
235 | + iova = result->iova; | ||
236 | + vhost_iova_tree_remove(v->iova_tree, &mem_region); | ||
237 | + } | ||
238 | vhost_vdpa_iotlb_batch_begin_once(v); | ||
239 | ret = vhost_vdpa_dma_unmap(v, iova, int128_get64(llsize)); | ||
240 | if (ret) { | ||
241 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
242 | |||
243 | shadow_vqs = g_ptr_array_new_full(hdev->nvqs, vhost_svq_free); | ||
244 | for (unsigned n = 0; n < hdev->nvqs; ++n) { | ||
245 | - g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(); | ||
246 | + g_autoptr(VhostShadowVirtqueue) svq = vhost_svq_new(v->iova_tree); | ||
247 | |||
248 | if (unlikely(!svq)) { | ||
249 | error_setg(errp, "Cannot create svq %u", n); | ||
250 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_svq_set_fds(struct vhost_dev *dev, | ||
251 | /** | ||
252 | * Unmap a SVQ area in the device | ||
253 | */ | ||
254 | -static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, hwaddr iova, | ||
255 | - hwaddr size) | ||
256 | +static bool vhost_vdpa_svq_unmap_ring(struct vhost_vdpa *v, | ||
257 | + const DMAMap *needle) | ||
258 | { | ||
259 | + const DMAMap *result = vhost_iova_tree_find_iova(v->iova_tree, needle); | ||
260 | + hwaddr size; | ||
261 | int r; | ||
262 | |||
263 | - size = ROUND_UP(size, qemu_real_host_page_size); | ||
264 | - r = vhost_vdpa_dma_unmap(v, iova, size); | ||
265 | + if (unlikely(!result)) { | ||
266 | + error_report("Unable to find SVQ address to unmap"); | ||
267 | + return false; | ||
268 | + } | ||
269 | + | ||
270 | + size = ROUND_UP(result->size, qemu_real_host_page_size); | ||
271 | + r = vhost_vdpa_dma_unmap(v, result->iova, size); | ||
272 | return r == 0; | ||
38 | } | 273 | } |
39 | 274 | ||
40 | -int can_sja_can_receive(CanBusClientState *client) | 275 | static bool vhost_vdpa_svq_unmap_rings(struct vhost_dev *dev, |
41 | +bool can_sja_can_receive(CanBusClientState *client) | 276 | const VhostShadowVirtqueue *svq) |
42 | { | 277 | { |
43 | CanSJA1000State *s = container_of(client, CanSJA1000State, bus_client); | 278 | + DMAMap needle = {}; |
44 | 279 | struct vhost_vdpa *v = dev->opaque; | |
45 | if (s->clock & 0x80) { /* PeliCAN Mode */ | 280 | struct vhost_vring_addr svq_addr; |
46 | if (s->mode & 0x01) { /* reset mode. */ | 281 | - size_t device_size = vhost_svq_device_area_size(svq); |
47 | - return 0; | 282 | - size_t driver_size = vhost_svq_driver_area_size(svq); |
48 | + return false; | 283 | bool ok; |
49 | } | 284 | |
50 | } else { /* BasicCAN mode */ | 285 | vhost_svq_get_vring_addr(svq, &svq_addr); |
51 | if (s->control & 0x01) { | 286 | |
52 | - return 0; | 287 | - ok = vhost_vdpa_svq_unmap_ring(v, svq_addr.desc_user_addr, driver_size); |
53 | + return false; | 288 | + needle.translated_addr = svq_addr.desc_user_addr; |
54 | } | 289 | + ok = vhost_vdpa_svq_unmap_ring(v, &needle); |
290 | if (unlikely(!ok)) { | ||
291 | return false; | ||
55 | } | 292 | } |
56 | 293 | ||
57 | - return 1; /* always return 1, when operation mode */ | 294 | - return vhost_vdpa_svq_unmap_ring(v, svq_addr.used_user_addr, device_size); |
58 | + return true; /* always return true, when operation mode */ | 295 | + needle.translated_addr = svq_addr.used_user_addr; |
296 | + return vhost_vdpa_svq_unmap_ring(v, &needle); | ||
297 | +} | ||
298 | + | ||
299 | +/** | ||
300 | + * Map the SVQ area in the device | ||
301 | + * | ||
302 | + * @v: Vhost-vdpa device | ||
303 | + * @needle: The area to search iova | ||
304 | + * @errorp: Error pointer | ||
305 | + */ | ||
306 | +static bool vhost_vdpa_svq_map_ring(struct vhost_vdpa *v, DMAMap *needle, | ||
307 | + Error **errp) | ||
308 | +{ | ||
309 | + int r; | ||
310 | + | ||
311 | + r = vhost_iova_tree_map_alloc(v->iova_tree, needle); | ||
312 | + if (unlikely(r != IOVA_OK)) { | ||
313 | + error_setg(errp, "Cannot allocate iova (%d)", r); | ||
314 | + return false; | ||
315 | + } | ||
316 | + | ||
317 | + r = vhost_vdpa_dma_map(v, needle->iova, needle->size + 1, | ||
318 | + (void *)(uintptr_t)needle->translated_addr, | ||
319 | + needle->perm == IOMMU_RO); | ||
320 | + if (unlikely(r != 0)) { | ||
321 | + error_setg_errno(errp, -r, "Cannot map region to device"); | ||
322 | + vhost_iova_tree_remove(v->iova_tree, needle); | ||
323 | + } | ||
324 | + | ||
325 | + return r == 0; | ||
59 | } | 326 | } |
60 | 327 | ||
61 | ssize_t can_sja_receive(CanBusClientState *client, const qemu_can_frame *frames, | 328 | /** |
62 | diff --git a/hw/net/can/can_sja1000.h b/hw/net/can/can_sja1000.h | 329 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_svq_map_rings(struct vhost_dev *dev, |
330 | struct vhost_vring_addr *addr, | ||
331 | Error **errp) | ||
332 | { | ||
333 | + DMAMap device_region, driver_region; | ||
334 | + struct vhost_vring_addr svq_addr; | ||
335 | struct vhost_vdpa *v = dev->opaque; | ||
336 | size_t device_size = vhost_svq_device_area_size(svq); | ||
337 | size_t driver_size = vhost_svq_driver_area_size(svq); | ||
338 | - int r; | ||
339 | + size_t avail_offset; | ||
340 | + bool ok; | ||
341 | |||
342 | ERRP_GUARD(); | ||
343 | - vhost_svq_get_vring_addr(svq, addr); | ||
344 | + vhost_svq_get_vring_addr(svq, &svq_addr); | ||
345 | |||
346 | - r = vhost_vdpa_dma_map(v, addr->desc_user_addr, driver_size, | ||
347 | - (void *)(uintptr_t)addr->desc_user_addr, true); | ||
348 | - if (unlikely(r != 0)) { | ||
349 | - error_setg_errno(errp, -r, "Cannot create vq driver region: "); | ||
350 | + driver_region = (DMAMap) { | ||
351 | + .translated_addr = svq_addr.desc_user_addr, | ||
352 | + .size = driver_size - 1, | ||
353 | + .perm = IOMMU_RO, | ||
354 | + }; | ||
355 | + ok = vhost_vdpa_svq_map_ring(v, &driver_region, errp); | ||
356 | + if (unlikely(!ok)) { | ||
357 | + error_prepend(errp, "Cannot create vq driver region: "); | ||
358 | return false; | ||
359 | } | ||
360 | + addr->desc_user_addr = driver_region.iova; | ||
361 | + avail_offset = svq_addr.avail_user_addr - svq_addr.desc_user_addr; | ||
362 | + addr->avail_user_addr = driver_region.iova + avail_offset; | ||
363 | |||
364 | - r = vhost_vdpa_dma_map(v, addr->used_user_addr, device_size, | ||
365 | - (void *)(intptr_t)addr->used_user_addr, false); | ||
366 | - if (unlikely(r != 0)) { | ||
367 | - error_setg_errno(errp, -r, "Cannot create vq device region: "); | ||
368 | + device_region = (DMAMap) { | ||
369 | + .translated_addr = svq_addr.used_user_addr, | ||
370 | + .size = device_size - 1, | ||
371 | + .perm = IOMMU_RW, | ||
372 | + }; | ||
373 | + ok = vhost_vdpa_svq_map_ring(v, &device_region, errp); | ||
374 | + if (unlikely(!ok)) { | ||
375 | + error_prepend(errp, "Cannot create vq device region: "); | ||
376 | + vhost_vdpa_svq_unmap_ring(v, &driver_region); | ||
377 | } | ||
378 | + addr->used_user_addr = device_region.iova; | ||
379 | |||
380 | - return r == 0; | ||
381 | + return ok; | ||
382 | } | ||
383 | |||
384 | static bool vhost_vdpa_svq_setup(struct vhost_dev *dev, | ||
385 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
63 | index XXXXXXX..XXXXXXX 100644 | 386 | index XXXXXXX..XXXXXXX 100644 |
64 | --- a/hw/net/can/can_sja1000.h | 387 | --- a/include/hw/virtio/vhost-vdpa.h |
65 | +++ b/hw/net/can/can_sja1000.h | 388 | +++ b/include/hw/virtio/vhost-vdpa.h |
66 | @@ -XXX,XX +XXX,XX @@ void can_sja_disconnect(CanSJA1000State *s); | 389 | @@ -XXX,XX +XXX,XX @@ |
67 | 390 | ||
68 | int can_sja_init(CanSJA1000State *s, qemu_irq irq); | 391 | #include <gmodule.h> |
69 | 392 | ||
70 | -int can_sja_can_receive(CanBusClientState *client); | 393 | +#include "hw/virtio/vhost-iova-tree.h" |
71 | +bool can_sja_can_receive(CanBusClientState *client); | 394 | #include "hw/virtio/virtio.h" |
72 | 395 | #include "standard-headers/linux/vhost_types.h" | |
73 | ssize_t can_sja_receive(CanBusClientState *client, | 396 | |
74 | const qemu_can_frame *frames, size_t frames_cnt); | 397 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { |
75 | diff --git a/include/net/can_emu.h b/include/net/can_emu.h | 398 | MemoryListener listener; |
76 | index XXXXXXX..XXXXXXX 100644 | 399 | struct vhost_vdpa_iova_range iova_range; |
77 | --- a/include/net/can_emu.h | 400 | bool shadow_vqs_enabled; |
78 | +++ b/include/net/can_emu.h | 401 | + /* IOVA mapping used by the Shadow Virtqueue */ |
79 | @@ -XXX,XX +XXX,XX @@ typedef struct CanBusClientState CanBusClientState; | 402 | + VhostIOVATree *iova_tree; |
80 | typedef struct CanBusState CanBusState; | 403 | GPtrArray *shadow_vqs; |
81 | 404 | struct vhost_dev *dev; | |
82 | typedef struct CanBusClientInfo { | 405 | VhostVDPAHostNotifier notifier[VIRTIO_QUEUE_MAX]; |
83 | - int (*can_receive)(CanBusClientState *); | ||
84 | + bool (*can_receive)(CanBusClientState *); | ||
85 | ssize_t (*receive)(CanBusClientState *, | ||
86 | const struct qemu_can_frame *frames, size_t frames_cnt); | ||
87 | } CanBusClientInfo; | ||
88 | diff --git a/net/can/can_socketcan.c b/net/can/can_socketcan.c | ||
89 | index XXXXXXX..XXXXXXX 100644 | ||
90 | --- a/net/can/can_socketcan.c | ||
91 | +++ b/net/can/can_socketcan.c | ||
92 | @@ -XXX,XX +XXX,XX @@ static void can_host_socketcan_read(void *opaque) | ||
93 | } | ||
94 | } | ||
95 | |||
96 | -static int can_host_socketcan_can_receive(CanBusClientState *client) | ||
97 | +static bool can_host_socketcan_can_receive(CanBusClientState *client) | ||
98 | { | ||
99 | - return 1; | ||
100 | + return true; | ||
101 | } | ||
102 | |||
103 | static ssize_t can_host_socketcan_receive(CanBusClientState *client, | ||
104 | -- | 406 | -- |
105 | 2.5.0 | 407 | 2.7.4 |
106 | 408 | ||
107 | 409 | diff view generated by jsdifflib |
1 | From: Philippe Mathieu-Daudé <f4bug@amsat.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The command is 32-bit, but we are loading the 16 upper bits with | 3 | This is needed to achieve migration, so the destination can restore its |
4 | the 'get_uint16(s->scb + 2)' call. | 4 | index. |
5 | 5 | ||
6 | Once shifted by 16, the command bits match the status bits: | 6 | Setting base as last used idx, so destination will see as available all |
7 | the entries that the device did not use, including the in-flight | ||
8 | processing ones. | ||
7 | 9 | ||
8 | - Command | 10 | This is ok for networking, but other kinds of devices might have |
9 | Bit 31 ACK-CX Acknowledges that the CU completed an Action Command. | 11 | problems with these retransmissions. |
10 | Bit 30 ACK-FR Acknowledges that the RU received a frame. | ||
11 | Bit 29 ACK-CNA Acknowledges that the Command Unit became not active. | ||
12 | Bit 28 ACK-RNR Acknowledges that the Receive Unit became not ready. | ||
13 | 12 | ||
14 | - Status | 13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
15 | Bit 15 CX The CU finished executing a command with its I(interrupt) bit set. | 14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
16 | Bit 14 FR The RU finished receiving a frame. | ||
17 | Bit 13 CNA The Command Unit left the Active state. | ||
18 | Bit 12 RNR The Receive Unit left the Ready state. | ||
19 | |||
20 | Add the SCB_COMMAND_ACK_MASK definition to simplify the code. | ||
21 | |||
22 | This fixes Coverity 1419392 (CONSTANT_EXPRESSION_RESULT): | ||
23 | |||
24 | /hw/net/i82596.c: 352 in examine_scb() | ||
25 | 346 cuc = (command >> 8) & 0x7; | ||
26 | 347 ruc = (command >> 4) & 0x7; | ||
27 | 348 DBG(printf("MAIN COMMAND %04x cuc %02x ruc %02x\n", command, cuc, ruc)); | ||
28 | 349 /* and clear the scb command word */ | ||
29 | 350 set_uint16(s->scb + 2, 0); | ||
30 | 351 | ||
31 | >>> CID 1419392: (CONSTANT_EXPRESSION_RESULT) | ||
32 | >>> "command & (2147483648UL /* 1UL << 31 */)" is always 0 regardless of the values of its operands. This occurs as the logical operand of "if". | ||
33 | 352 if (command & BIT(31)) /* ACK-CX */ | ||
34 | 353 s->scb_status &= ~SCB_STATUS_CX; | ||
35 | >>> CID 1419392: (CONSTANT_EXPRESSION_RESULT) | ||
36 | >>> "command & (1073741824UL /* 1UL << 30 */)" is always 0 regardless of the values of its operands. This occurs as the logical operand of "if". | ||
37 | 354 if (command & BIT(30)) /*ACK-FR */ | ||
38 | 355 s->scb_status &= ~SCB_STATUS_FR; | ||
39 | >>> CID 1419392: (CONSTANT_EXPRESSION_RESULT) | ||
40 | >>> "command & (536870912UL /* 1UL << 29 */)" is always 0 regardless of the values of its operands. This occurs as the logical operand of "if". | ||
41 | 356 if (command & BIT(29)) /*ACK-CNA */ | ||
42 | 357 s->scb_status &= ~SCB_STATUS_CNA; | ||
43 | >>> CID 1419392: (CONSTANT_EXPRESSION_RESULT) | ||
44 | >>> "command & (268435456UL /* 1UL << 28 */)" is always 0 regardless of the values of its operands. This occurs as the logical operand of "if". | ||
45 | 358 if (command & BIT(28)) /*ACK-RNR */ | ||
46 | 359 s->scb_status &= ~SCB_STATUS_RNR; | ||
47 | |||
48 | Fixes: Covertiy CID 1419392 (commit 376b851909) | ||
49 | Signed-off-by: Philippe Mathieu-Daudé <f4bug@amsat.org> | ||
50 | Reviewed-by: Peter Maydell <peter.maydell@linaro.org> | ||
51 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
52 | --- | 16 | --- |
53 | hw/net/i82596.c | 12 ++++-------- | 17 | hw/virtio/vhost-vdpa.c | 17 +++++++++++++++++ |
54 | 1 file changed, 4 insertions(+), 8 deletions(-) | 18 | 1 file changed, 17 insertions(+) |
55 | 19 | ||
56 | diff --git a/hw/net/i82596.c b/hw/net/i82596.c | 20 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
57 | index XXXXXXX..XXXXXXX 100644 | 21 | index XXXXXXX..XXXXXXX 100644 |
58 | --- a/hw/net/i82596.c | 22 | --- a/hw/virtio/vhost-vdpa.c |
59 | +++ b/hw/net/i82596.c | 23 | +++ b/hw/virtio/vhost-vdpa.c |
60 | @@ -XXX,XX +XXX,XX @@ | 24 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_base(struct vhost_dev *dev, |
61 | #define SCB_STATUS_CNA 0x2000 /* CU left active state */ | 25 | static int vhost_vdpa_get_vring_base(struct vhost_dev *dev, |
62 | #define SCB_STATUS_RNR 0x1000 /* RU left active state */ | 26 | struct vhost_vring_state *ring) |
63 | 27 | { | |
64 | +#define SCB_COMMAND_ACK_MASK \ | 28 | + struct vhost_vdpa *v = dev->opaque; |
65 | + (SCB_STATUS_CX | SCB_STATUS_FR | SCB_STATUS_CNA | SCB_STATUS_RNR) | 29 | int ret; |
30 | |||
31 | + if (v->shadow_vqs_enabled) { | ||
32 | + VhostShadowVirtqueue *svq = g_ptr_array_index(v->shadow_vqs, | ||
33 | + ring->index); | ||
66 | + | 34 | + |
67 | #define CU_IDLE 0 | 35 | + /* |
68 | #define CU_SUSPENDED 1 | 36 | + * Setting base as last used idx, so destination will see as available |
69 | #define CU_ACTIVE 2 | 37 | + * all the entries that the device did not use, including the in-flight |
70 | @@ -XXX,XX +XXX,XX @@ static void examine_scb(I82596State *s) | 38 | + * processing ones. |
71 | /* and clear the scb command word */ | 39 | + * |
72 | set_uint16(s->scb + 2, 0); | 40 | + * TODO: This is ok for networking, but other kinds of devices might |
73 | 41 | + * have problems with these retransmissions. | |
74 | - if (command & BIT(31)) /* ACK-CX */ | 42 | + */ |
75 | - s->scb_status &= ~SCB_STATUS_CX; | 43 | + ring->num = svq->last_used_idx; |
76 | - if (command & BIT(30)) /*ACK-FR */ | 44 | + return 0; |
77 | - s->scb_status &= ~SCB_STATUS_FR; | 45 | + } |
78 | - if (command & BIT(29)) /*ACK-CNA */ | 46 | + |
79 | - s->scb_status &= ~SCB_STATUS_CNA; | 47 | ret = vhost_vdpa_call(dev, VHOST_GET_VRING_BASE, ring); |
80 | - if (command & BIT(28)) /*ACK-RNR */ | 48 | trace_vhost_vdpa_get_vring_base(dev, ring->index, ring->num); |
81 | - s->scb_status &= ~SCB_STATUS_RNR; | 49 | return ret; |
82 | + s->scb_status &= ~(command & SCB_COMMAND_ACK_MASK); | ||
83 | |||
84 | switch (cuc) { | ||
85 | case 0: /* no change */ | ||
86 | -- | 50 | -- |
87 | 2.5.0 | 51 | 2.7.4 |
88 | 52 | ||
1 | From: Peter Maydell <peter.maydell@linaro.org> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | The i82596_receive() function attempts to pass the guest a buffer | 3 | Setting the log address would make the device start reporting invalid |
4 | which is effectively the concatenation of the data it is passed and a | 4 | dirty memory because the SVQ vrings are located in qemu's memory. |
5 | 4 byte CRC value. However, rather than implementing this as "write | ||
6 | the data; then write the CRC" it instead bumps the length value of | ||
7 | the data by 4, and writes 4 extra bytes from beyond the end of the | ||
8 | buffer, which it then overwrites with the CRC. It also assumed that | ||
9 | we could always fit all four bytes of the CRC into the final receive | ||
10 | buffer, which might not be true if the CRC needs to be split over two | ||
11 | receive buffers. | ||
12 | 5 | ||
13 | Calculate separately how many bytes we need to transfer into the | 6 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> |
14 | guest's receive buffer from the source buffer, and how many we need | 7 | Acked-by: Michael S. Tsirkin <mst@redhat.com> |
15 | to transfer from the CRC word. | ||
16 | |||
17 | We add a count 'bufsz' of the number of bytes left in the source | ||
18 | buffer, which we use purely to assert() that we don't overrun. | ||
19 | |||
20 | Spotted by Coverity (CID 1419396) for the specific case when we end | ||
21 | up using a local array as the source buffer. | ||
22 | |||
23 | Signed-off-by: Peter Maydell <peter.maydell@linaro.org> | ||
24 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 8 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
25 | --- | 9 | --- |
26 | hw/net/i82596.c | 44 +++++++++++++++++++++++++++++++++++--------- | 10 | hw/virtio/vhost-vdpa.c | 3 ++- |
27 | 1 file changed, 35 insertions(+), 9 deletions(-) | 11 | 1 file changed, 2 insertions(+), 1 deletion(-) |
28 | 12 | ||
29 | diff --git a/hw/net/i82596.c b/hw/net/i82596.c | 13 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
30 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
31 | --- a/hw/net/i82596.c | 15 | --- a/hw/virtio/vhost-vdpa.c |
32 | +++ b/hw/net/i82596.c | 16 | +++ b/hw/virtio/vhost-vdpa.c |
33 | @@ -XXX,XX +XXX,XX @@ ssize_t i82596_receive(NetClientState *nc, const uint8_t *buf, size_t sz) | 17 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_dev_start(struct vhost_dev *dev, bool started) |
34 | uint32_t rfd_p; | 18 | static int vhost_vdpa_set_log_base(struct vhost_dev *dev, uint64_t base, |
35 | uint32_t rbd; | 19 | struct vhost_log *log) |
36 | uint16_t is_broadcast = 0; | 20 | { |
37 | - size_t len = sz; | 21 | - if (vhost_vdpa_one_time_request(dev)) { |
38 | + size_t len = sz; /* length of data for guest (including CRC) */ | 22 | + struct vhost_vdpa *v = dev->opaque; |
39 | + size_t bufsz = sz; /* length of data in buf */ | 23 | + if (v->shadow_vqs_enabled || vhost_vdpa_one_time_request(dev)) { |
40 | uint32_t crc; | 24 | return 0; |
41 | uint8_t *crc_ptr; | ||
42 | uint8_t buf1[MIN_BUF_SIZE + VLAN_HLEN]; | ||
43 | @@ -XXX,XX +XXX,XX @@ ssize_t i82596_receive(NetClientState *nc, const uint8_t *buf, size_t sz) | ||
44 | if (len < MIN_BUF_SIZE) { | ||
45 | len = MIN_BUF_SIZE; | ||
46 | } | ||
47 | + bufsz = len; | ||
48 | } | 25 | } |
49 | 26 | ||
50 | /* Calculate the ethernet checksum (4 bytes) */ | ||
51 | @@ -XXX,XX +XXX,XX @@ ssize_t i82596_receive(NetClientState *nc, const uint8_t *buf, size_t sz) | ||
52 | while (len) { | ||
53 | uint16_t buffer_size, num; | ||
54 | uint32_t rba; | ||
55 | + size_t bufcount, crccount; | ||
56 | |||
57 | /* printf("Receive: rbd is %08x\n", rbd); */ | ||
58 | buffer_size = get_uint16(rbd + 12); | ||
59 | @@ -XXX,XX +XXX,XX @@ ssize_t i82596_receive(NetClientState *nc, const uint8_t *buf, size_t sz) | ||
60 | } | ||
61 | rba = get_uint32(rbd + 8); | ||
62 | /* printf("rba is 0x%x\n", rba); */ | ||
63 | - address_space_write(&address_space_memory, rba, | ||
64 | - MEMTXATTRS_UNSPECIFIED, buf, num); | ||
65 | - rba += num; | ||
66 | - buf += num; | ||
67 | - len -= num; | ||
68 | - if (len == 0) { /* copy crc */ | ||
69 | - address_space_write(&address_space_memory, rba - 4, | ||
70 | - MEMTXATTRS_UNSPECIFIED, crc_ptr, 4); | ||
71 | + /* | ||
72 | + * Calculate how many bytes we want from buf[] and how many | ||
73 | + * from the CRC. | ||
74 | + */ | ||
75 | + if ((len - num) >= 4) { | ||
76 | + /* The whole guest buffer, we haven't hit the CRC yet */ | ||
77 | + bufcount = num; | ||
78 | + } else { | ||
79 | + /* All that's left of buf[] */ | ||
80 | + bufcount = len - 4; | ||
81 | + } | ||
82 | + crccount = num - bufcount; | ||
83 | + | ||
84 | + if (bufcount > 0) { | ||
85 | + /* Still some of the actual data buffer to transfer */ | ||
86 | + assert(bufsz >= bufcount); | ||
87 | + bufsz -= bufcount; | ||
88 | + address_space_write(&address_space_memory, rba, | ||
89 | + MEMTXATTRS_UNSPECIFIED, buf, bufcount); | ||
90 | + rba += bufcount; | ||
91 | + buf += bufcount; | ||
92 | + len -= bufcount; | ||
93 | + } | ||
94 | + | ||
95 | + /* Write as much of the CRC as fits */ | ||
96 | + if (crccount > 0) { | ||
97 | + address_space_write(&address_space_memory, rba, | ||
98 | + MEMTXATTRS_UNSPECIFIED, crc_ptr, crccount); | ||
99 | + rba += crccount; | ||
100 | + crc_ptr += crccount; | ||
101 | + len -= crccount; | ||
102 | } | ||
103 | |||
104 | num |= 0x4000; /* set F BIT */ | ||
105 | -- | 27 | -- |
106 | 2.5.0 | 28 | 2.7.4 |
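To make the pointer arithmetic in the receive-loop hunk easier to follow, here is a minimal standalone sketch of how one guest buffer of num bytes is split between the remaining packet data and the trailing 4-byte CRC. The variable names mirror the patch (bufcount, crccount), but the helper function and the test data are made up for illustration.

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /*
     * Copy 'num' bytes into one receive buffer, taking as much as possible
     * from the packet data and the rest from the CRC, as the patch does.
     * Assumes num <= *len, where *len counts the data plus the 4 CRC bytes.
     */
    static void fill_rx_buffer(uint8_t *dst, size_t num,
                               const uint8_t **buf, size_t *len,
                               const uint8_t **crc_ptr)
    {
        size_t bufcount, crccount;

        if (*len - num >= 4) {
            bufcount = num;       /* the CRC has not been reached yet */
        } else {
            bufcount = *len - 4;  /* all that is left of the packet data */
        }
        crccount = num - bufcount;

        memcpy(dst, *buf, bufcount);
        *buf += bufcount;
        *len -= bufcount;

        memcpy(dst + bufcount, *crc_ptr, crccount);  /* as much CRC as fits */
        *crc_ptr += crccount;
        *len -= crccount;
    }

    int main(void)
    {
        uint8_t data[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
        uint8_t crc[4] = {0xaa, 0xbb, 0xcc, 0xdd};
        uint8_t rx1[8], rx2[8];
        const uint8_t *buf = data, *crc_ptr = crc;
        size_t len = sizeof(data) + sizeof(crc);       /* data plus CRC */

        fill_rx_buffer(rx1, 8, &buf, &len, &crc_ptr);  /* 8 data bytes */
        fill_rx_buffer(rx2, 6, &buf, &len, &crc_ptr);  /* 2 data bytes + 4 CRC bytes */
        assert(len == 0);
        return 0;
    }

The point of the split is that the CRC can now straddle two receive buffers, which the old "copy num bytes, then rewrite the last four" approach could not handle without reading past the end of the source buffer.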
107 | 29 | ||
1 | From: Philippe Mathieu-Daudé <philmd@redhat.com> | 1 | From: Eugenio Pérez <eperezma@redhat.com> |
---|---|---|---|
2 | 2 | ||
3 | We will modify this code in the next commit. Clean it up | 3 | SVQ is able to log the dirty bits by itself, so let's use it to not |
4 | first to avoid checkpatch.pl errors. | 4 | block migration. |
5 | 5 | ||
6 | Signed-off-by: Philippe Mathieu-Daudé <philmd@redhat.com> | 6 | Also, ignore set and clear of VHOST_F_LOG_ALL on set_features if SVQ is |
7 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | 7 | enabled. Even if the device supports it, the reports would be nonsense |
8 | Reviewed-by: Cédric Le Goater <clg@kaod.org> | 8 | because SVQ memory is in the qemu region. |
9 | |||
10 | The log region is still allocated. Future changes might skip that, but | ||
11 | this series is already long enough. | ||
12 | |||
13 | Signed-off-by: Eugenio Pérez <eperezma@redhat.com> | ||
14 | Acked-by: Michael S. Tsirkin <mst@redhat.com> | ||
9 | Signed-off-by: Jason Wang <jasowang@redhat.com> | 15 | Signed-off-by: Jason Wang <jasowang@redhat.com> |
10 | --- | 16 | --- |
11 | hw/net/rtl8139.c | 10 ++++++---- | 17 | hw/virtio/vhost-vdpa.c | 39 +++++++++++++++++++++++++++++++++++---- |
12 | 1 file changed, 6 insertions(+), 4 deletions(-) | 18 | include/hw/virtio/vhost-vdpa.h | 1 + |
19 | 2 files changed, 36 insertions(+), 4 deletions(-) | ||
13 | 20 | ||
14 | diff --git a/hw/net/rtl8139.c b/hw/net/rtl8139.c | 21 | diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c |
15 | index XXXXXXX..XXXXXXX 100644 | 22 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/hw/net/rtl8139.c | 23 | --- a/hw/virtio/vhost-vdpa.c |
17 | +++ b/hw/net/rtl8139.c | 24 | +++ b/hw/virtio/vhost-vdpa.c |
18 | @@ -XXX,XX +XXX,XX @@ static int rtl8139_can_receive(NetClientState *nc) | 25 | @@ -XXX,XX +XXX,XX @@ static bool vhost_vdpa_one_time_request(struct vhost_dev *dev) |
19 | int avail; | 26 | return v->index != 0; |
20 | 27 | } | |
21 | /* Receive (drop) packets if card is disabled. */ | 28 | |
22 | - if (!s->clock_enabled) | 29 | +static int vhost_vdpa_get_dev_features(struct vhost_dev *dev, |
23 | - return 1; | 30 | + uint64_t *features) |
24 | - if (!rtl8139_receiver_enabled(s)) | 31 | +{ |
25 | - return 1; | 32 | + int ret; |
26 | + if (!s->clock_enabled) { | 33 | + |
27 | + return 1; | 34 | + ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); |
35 | + trace_vhost_vdpa_get_features(dev, *features); | ||
36 | + return ret; | ||
37 | +} | ||
38 | + | ||
39 | static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
40 | Error **errp) | ||
41 | { | ||
42 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_init_svq(struct vhost_dev *hdev, struct vhost_vdpa *v, | ||
43 | return 0; | ||
44 | } | ||
45 | |||
46 | - r = hdev->vhost_ops->vhost_get_features(hdev, &dev_features); | ||
47 | + r = vhost_vdpa_get_dev_features(hdev, &dev_features); | ||
48 | if (r != 0) { | ||
49 | error_setg_errno(errp, -r, "Can't get vdpa device features"); | ||
50 | return r; | ||
51 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_mem_table(struct vhost_dev *dev, | ||
52 | static int vhost_vdpa_set_features(struct vhost_dev *dev, | ||
53 | uint64_t features) | ||
54 | { | ||
55 | + struct vhost_vdpa *v = dev->opaque; | ||
56 | int ret; | ||
57 | |||
58 | if (vhost_vdpa_one_time_request(dev)) { | ||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | + if (v->shadow_vqs_enabled) { | ||
63 | + if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) { | ||
64 | + /* | ||
65 | + * QEMU is just trying to enable or disable logging. SVQ handles | ||
66 | + * this sepparately, so no need to forward this. | ||
67 | + */ | ||
68 | + v->acked_features = features; | ||
69 | + return 0; | ||
70 | + } | ||
71 | + | ||
72 | + v->acked_features = features; | ||
73 | + | ||
74 | + /* We must not ack _F_LOG if SVQ is enabled */ | ||
75 | + features &= ~BIT_ULL(VHOST_F_LOG_ALL); | ||
28 | + } | 76 | + } |
29 | + if (!rtl8139_receiver_enabled(s)) { | 77 | + |
30 | + return 1; | 78 | trace_vhost_vdpa_set_features(dev, features); |
79 | ret = vhost_vdpa_call(dev, VHOST_SET_FEATURES, &features); | ||
80 | if (ret) { | ||
81 | @@ -XXX,XX +XXX,XX @@ static int vhost_vdpa_set_vring_call(struct vhost_dev *dev, | ||
82 | static int vhost_vdpa_get_features(struct vhost_dev *dev, | ||
83 | uint64_t *features) | ||
84 | { | ||
85 | - int ret; | ||
86 | + struct vhost_vdpa *v = dev->opaque; | ||
87 | + int ret = vhost_vdpa_get_dev_features(dev, features); | ||
88 | + | ||
89 | + if (ret == 0 && v->shadow_vqs_enabled) { | ||
90 | + /* Add SVQ logging capabilities */ | ||
91 | + *features |= BIT_ULL(VHOST_F_LOG_ALL); | ||
31 | + } | 92 | + } |
32 | 93 | ||
33 | if (rtl8139_cp_receiver_enabled(s) && rtl8139_cp_rx_valid(s)) { | 94 | - ret = vhost_vdpa_call(dev, VHOST_GET_FEATURES, features); |
34 | /* ??? Flow control not implemented in c+ mode. | 95 | - trace_vhost_vdpa_get_features(dev, *features); |
96 | return ret; | ||
97 | } | ||
98 | |||
99 | diff --git a/include/hw/virtio/vhost-vdpa.h b/include/hw/virtio/vhost-vdpa.h | ||
100 | index XXXXXXX..XXXXXXX 100644 | ||
101 | --- a/include/hw/virtio/vhost-vdpa.h | ||
102 | +++ b/include/hw/virtio/vhost-vdpa.h | ||
103 | @@ -XXX,XX +XXX,XX @@ typedef struct vhost_vdpa { | ||
104 | bool iotlb_batch_begin_sent; | ||
105 | MemoryListener listener; | ||
106 | struct vhost_vdpa_iova_range iova_range; | ||
107 | + uint64_t acked_features; | ||
108 | bool shadow_vqs_enabled; | ||
109 | /* IOVA mapping used by the Shadow Virtqueue */ | ||
110 | VhostIOVATree *iova_tree; | ||
35 | -- | 111 | -- |
36 | 2.5.0 | 112 | 2.7.4 |
37 | 113 | ||
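The feature handling added for SVQ above is mostly bit manipulation, so a tiny standalone sketch may help. The struct, function, and the extra feature bit below are made up for illustration; only the XOR test and the VHOST_F_LOG_ALL masking mirror what the patch does.

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define BIT_ULL(n)        (1ULL << (n))
    #define VHOST_F_LOG_ALL   26   /* bit number from the vhost uapi */

    /* Stand-in for the relevant parts of struct vhost_vdpa */
    struct fake_vdpa {
        bool shadow_vqs_enabled;
        uint64_t acked_features;    /* what qemu acked */
        uint64_t device_features;   /* what actually reached the device */
    };

    /* Sketch of the filtering in vhost_vdpa_set_features; not the real API */
    static void fake_set_features(struct fake_vdpa *v, uint64_t features)
    {
        if (v->shadow_vqs_enabled) {
            if ((v->acked_features ^ features) == BIT_ULL(VHOST_F_LOG_ALL)) {
                /* Only the logging bit changed: SVQ tracks dirty pages itself */
                v->acked_features = features;
                return;
            }
            v->acked_features = features;
            features &= ~BIT_ULL(VHOST_F_LOG_ALL);  /* never ack _F_LOG to the device */
        }
        v->device_features = features;
    }

    int main(void)
    {
        struct fake_vdpa v = { .shadow_vqs_enabled = true };
        uint64_t feats = BIT_ULL(33) | BIT_ULL(VHOST_F_LOG_ALL);  /* 33 is arbitrary */

        fake_set_features(&v, feats);
        assert(!(v.device_features & BIT_ULL(VHOST_F_LOG_ALL)));  /* stripped */

        /* Later, toggling only _F_LOG_ALL is absorbed; the device is not touched */
        fake_set_features(&v, feats & ~BIT_ULL(VHOST_F_LOG_ALL));
        assert(v.device_features == BIT_ULL(33));
        return 0;
    }

Together with the get_features side, which ORs VHOST_F_LOG_ALL back into the advertised features when SVQ is enabled, this lets migration negotiate dirty logging with qemu while the vdpa device itself never sees the flag.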