[v7] hw/net/virtio-net: add support for notification coalescing

[PATCH v7] hw/net/virtio-net: add support for notification coalescing

Posted by Koushik Dutta 2 weeks ago

Implement VirtIO Network Notification Coalescing (Bit 53).
This allows the guest to manage interrupt frequency using ethtool
-C for both RX and TX paths.

- Added VIRTIO_NET_F_NOTF_COAL to host features.
- Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
  virtio_net_handle_ctrl_iov.
- Added logic to store and apply rx/tx usecs and max_packets.
- Added packet counters and threshold logic for both RX and TX data paths.
- Dynamic Dispatcher: Implemented a dispatcher mechanism that
  dynamically switches/activates the notification callback logic
  only after the guest enables TX coalescing via ethtool.
- Coalescing parameters persist in the destination VM after live migration.

This reduces interrupt overhead by batching notifications based on
either a packet count or a time-based threshold.

Signed-off-by: Koushik Dutta <kdutta@redhat.com>
---
v7 changes:
 - Fixed time unit consistency: TX path now uses microseconds matching RX
 - Changed timer_new_ns to timer_new_us for TX timers
 - Changed qemu_clock_get_ns to qemu_clock_get_us in TX path
 - Convert txtimer from nanoseconds to microseconds (txtimer / 1000)
 - Removed unused rx_index_timer variable from struct and initialization
 - Fixed feature flag handling: removed conditional check in ctrl handler
---
 hw/net/virtio-net.c            | 193 +++++++++++++++++++++++++++++----
 include/hw/virtio/virtio-net.h |   9 +-
 net/passt.c                    |   1 +
 net/tap.c                      |   1 +
 net/vhost-user.c               |   1 +
 net/vhost-vdpa.c               |   1 +
 6 files changed, 183 insertions(+), 23 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 2a5d642a64..5a98b52ded 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
  * - we could suppress RX interrupt if we were so inclined.
  */
 
+static void virtio_net_rx_notify(void *opaque)
+{
+    VirtIONetQueue *q = opaque;
+    VirtIONet *n = q->n;
+    VirtIODevice *vdev = VIRTIO_DEVICE(n);
+
+    n->rx_pkt_cnt = 0;
+    virtio_notify(vdev, q->rx_vq);
+}
+
 static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
 {
     VirtIONet *n = VIRTIO_NET(vdev);
@@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
         if (queue_started) {
             if (q->tx_timer) {
                 timer_mod(q->tx_timer,
-                               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+                               qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
             } else {
                 replay_bh_schedule_event(q->tx_bh);
             }
@@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
     }
 }
 
+static void virtio_net_tx_timer(void *opaque);
+
+static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
+                                  struct iovec *iov, unsigned int iov_cnt)
+{
+    struct virtio_net_ctrl_coal coal;
+    VirtIONetQueue *q;
+    size_t s;
+    int i;
+
+    s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
+    if (s != sizeof(coal)) {
+        return VIRTIO_NET_ERR;
+    }
+
+    if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
+        n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
+        n->rx_coal_packets = le32_to_cpu(coal.max_packets);
+        if (n->rx_coal_usecs > 0) {
+            for (i = 0; i < n->max_queue_pairs; i++) {
+                q = &n->vqs[i];
+                if (!q->rx_timer) {
+                    q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+                                               virtio_net_rx_notify,
+                                               q);
+                }
+            }
+        }
+    } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
+        n->tx_coal_usecs = le32_to_cpu(coal.max_usecs);
+        n->tx_coal_packets = le32_to_cpu(coal.max_packets);
+        if (n->tx_coal_usecs > 0) {
+            for (i = 0; i < n->max_queue_pairs; i++) {
+                q = &n->vqs[i];
+                if (!q->tx_timer && n->tx_coal_usecs > 0) {
+                    q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+                                               virtio_net_tx_timer,
+                                               q);
+                }
+            }
+        }
+    }
+
+    return VIRTIO_NET_OK;
+}
+
 static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
                                  struct iovec *iov, unsigned int iov_cnt)
 {
@@ -1581,6 +1637,8 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
         status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
     } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
         status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
+    } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
+        status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
     }
 
     s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
@@ -2040,7 +2098,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
     }
 
     virtqueue_flush(q->rx_vq, i);
-    virtio_notify(vdev, q->rx_vq);
+
+    /* rx coalescing */
+    n->rx_pkt_cnt += i;
+    if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
+        if (q->rx_timer) {
+            timer_del(q->rx_timer);
+        }
+        virtio_net_rx_notify(q);
+    } else {
+        if (q->rx_timer) {
+            if (!timer_pending(q->rx_timer)) {
+                timer_mod(q->rx_timer,
+                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
+            }
+        }
+    }
 
     return size;
 
@@ -2708,7 +2781,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
             replay_bh_schedule_event(q->tx_bh);
         } else {
             timer_mod(q->tx_timer,
-                      qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+                      qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
         }
         q->tx_waiting = 1;
     }
@@ -2817,7 +2890,6 @@ detach:
     return -EINVAL;
 }
 
-static void virtio_net_tx_timer(void *opaque);
 
 static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
 {
@@ -2842,7 +2914,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
     } else {
         /* re-arm timer to flush it (and more) on next tick */
         timer_mod(q->tx_timer,
-                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
         q->tx_waiting = 1;
         virtio_queue_set_notification(vq, 0);
     }
@@ -2899,6 +2971,12 @@ static void virtio_net_tx_timer(void *opaque)
     if (ret == -EBUSY || ret == -EINVAL) {
         return;
     }
+    if (n->tx_pkt_cnt < ret) {
+        n->tx_pkt_cnt = 0;
+    } else {
+        n->tx_pkt_cnt -= ret;
+    }
+
     /*
      * If we flush a full burst of packets, assume there are
      * more coming and immediately rearm
@@ -2906,7 +2984,7 @@ static void virtio_net_tx_timer(void *opaque)
     if (ret >= n->tx_burst) {
         q->tx_waiting = 1;
         timer_mod(q->tx_timer,
-                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
         return;
     }
     /*
@@ -2918,9 +2996,10 @@ static void virtio_net_tx_timer(void *opaque)
     ret = virtio_net_flush_tx(q);
     if (ret > 0) {
         virtio_queue_set_notification(q->tx_vq, 0);
+        n->tx_pkt_cnt -= ret;
         q->tx_waiting = 1;
         timer_mod(q->tx_timer,
-                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
     }
 }
 
@@ -2973,6 +3052,32 @@ static void virtio_net_tx_bh(void *opaque)
     }
 }
 
+static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
+{
+    VirtIONet *n = VIRTIO_NET(vdev);
+    VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
+    bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
+                     n->tx_coal_packets > 0;
+    bool pkt_limit = (n->tx_coal_packets > 0);
+
+    if (use_timer) {
+        n->tx_pkt_cnt++;
+        if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {
+            if (q->tx_timer) {
+                virtio_net_handle_tx_timer(vdev, vq);
+                return;
+            }
+        }
+        n->tx_pkt_cnt = 0;
+        if (q->tx_timer) {
+            timer_del(q->tx_timer);
+        }
+        virtio_net_handle_tx_bh(vdev, vq);
+    } else {
+        virtio_net_handle_tx_bh(vdev, vq);
+    }
+}
+
 static void virtio_net_add_queue(VirtIONet *n, int index)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(n);
@@ -2980,20 +3085,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
     n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
                                            virtio_net_handle_rx);
 
-    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
-        n->vqs[index].tx_vq =
-            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
-                             virtio_net_handle_tx_timer);
-        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
-                                              virtio_net_tx_timer,
-                                              &n->vqs[index]);
-    } else {
-        n->vqs[index].tx_vq =
-            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
-                             virtio_net_handle_tx_bh);
-        n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
-                                                  &DEVICE(vdev)->mem_reentrancy_guard);
-    }
+    n->vqs[index].tx_vq =
+        virtio_add_queue(vdev,
+                         n->net_conf.tx_queue_size,
+                         virtio_net_handle_tx_dispatch);
+
+    n->vqs[index].tx_bh =
+        qemu_bh_new_guarded(virtio_net_tx_bh,
+                            &n->vqs[index],
+                            &DEVICE(vdev)->mem_reentrancy_guard);
 
     n->vqs[index].tx_waiting = 0;
     n->vqs[index].n = n;
@@ -3088,6 +3188,9 @@ static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features,
     virtio_features_or(features, features, n->host_features_ex);
 
     virtio_add_feature_ex(features, VIRTIO_NET_F_MAC);
+    if (n->tx_timer_activate) {
+        virtio_clear_feature_ex(features, VIRTIO_NET_F_NOTF_COAL);
+    }
 
     if (!peer_has_vnet_hdr(n)) {
         virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM);
@@ -3242,6 +3345,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
     }
 
     virtio_net_commit_rss_config(n);
+    if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
+
+        for (i = 0; i < n->max_queue_pairs; i++) {
+            VirtIONetQueue *q = &n->vqs[i];
+            if (!q->rx_timer && n->rx_coal_usecs > 0) {
+                q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+                                           virtio_net_rx_notify,
+                                           q);
+            }
+
+            if (!q->tx_timer && n->tx_coal_usecs > 0) {
+                q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+                                           virtio_net_tx_timer,
+                                           q);
+            }
+
+            if (n->tx_coal_usecs > 0 && q->tx_timer) {
+                n->tx_pkt_cnt = 0;
+                timer_mod(q->tx_timer,
+                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
+            }
+
+            if (n->rx_coal_usecs > 0 && q->rx_timer) {
+                timer_mod(q->rx_timer,
+                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
+            }
+        }
+    }
+
     return 0;
 }
 
@@ -3617,6 +3749,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
                          vmstate_virtio_net_tx_waiting),
         VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
                             has_ctrl_guest_offloads),
+        VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
+        VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
+        VMSTATE_UINT32(rx_coal_packets, VirtIONet),
+        VMSTATE_UINT32(tx_coal_packets, VirtIONet),
         VMSTATE_END_OF_LIST()
     },
     .subsections = (const VMStateDescription * const []) {
@@ -3960,7 +4096,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
     }
     n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
     n->curr_queue_pairs = 1;
-    n->tx_timeout = n->net_conf.txtimer;
 
     if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
                        && strcmp(n->net_conf.tx, "bh")) {
@@ -3970,6 +4105,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
         error_printf("Defaulting to \"bh\"");
     }
 
+    if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
+        n->tx_coal_usecs = n->net_conf.txtimer / 1000;
+        n->tx_timer_activate = true;
+    } else {
+        n->tx_coal_usecs = 0;
+    }
+
     n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
                                     n->net_conf.tx_queue_size);
 
@@ -4046,6 +4188,11 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
             n->rss_data.specified_hash_types.on_bits |
             n->rss_data.specified_hash_types.auto_bits;
     }
+    n->rx_pkt_cnt = 0;
+    n->tx_pkt_cnt = 0;
+    n->rx_coal_usecs = 0;
+    n->rx_coal_packets = 0;
+    n->tx_coal_packets = 0;
 }
 
 static void virtio_net_device_unrealize(DeviceState *dev)
@@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
                       VIRTIO_NET_F_GUEST_USO6, true),
     DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
                       VIRTIO_NET_F_HOST_USO, true),
+    DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
+                      VIRTIO_NET_F_NOTF_COAL, true),
     DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
                                   rss_data.specified_hash_types,
                                   VIRTIO_NET_HASH_REPORT_IPv4 - 1,
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index 371e376428..b3a7df5ad8 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
 typedef struct VirtIONetQueue {
     VirtQueue *rx_vq;
     VirtQueue *tx_vq;
+    QEMUTimer *rx_timer;
     QEMUTimer *tx_timer;
     QEMUBH *tx_bh;
     uint32_t tx_waiting;
@@ -177,7 +178,6 @@ struct VirtIONet {
     /* RSC Chains - temporary storage of coalesced data,
        all these data are lost in case of migration */
     QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
-    uint32_t tx_timeout;
     int32_t tx_burst;
     uint32_t has_vnet_hdr;
     size_t host_hdr_len;
@@ -230,6 +230,13 @@ struct VirtIONet {
     struct EBPFRSSContext ebpf_rss;
     uint32_t nr_ebpf_rss_fds;
     char **ebpf_rss_fds;
+    uint32_t rx_coal_usecs;
+    uint32_t rx_coal_packets;
+    uint32_t rx_pkt_cnt;
+    uint32_t tx_coal_usecs;
+    uint32_t tx_coal_packets;
+    uint32_t tx_pkt_cnt;
+    bool tx_timer_activate;
 };
 
 size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
diff --git a/net/passt.c b/net/passt.c
index 4ff94ee509..0b0d9e222a 100644
--- a/net/passt.c
+++ b/net/passt.c
@@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
     VIRTIO_NET_F_GUEST_USO4,
     VIRTIO_NET_F_GUEST_USO6,
     VIRTIO_NET_F_HOST_USO,
+    VIRTIO_NET_F_NOTF_COAL,
 
     /* This bit implies RARP isn't sent by QEMU out of band */
     VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/tap.c b/net/tap.c
index 8d7ab6ba6f..ea5987a3dc 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
     VIRTIO_F_NOTIFICATION_DATA,
     VIRTIO_NET_F_RSC_EXT,
     VIRTIO_NET_F_HASH_REPORT,
+    VIRTIO_NET_F_NOTF_COAL,
     VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
     VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
     VHOST_INVALID_FEATURE_BIT
diff --git a/net/vhost-user.c b/net/vhost-user.c
index a4bb49bbcf..f0b3752d7c 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
     VIRTIO_NET_F_GUEST_USO4,
     VIRTIO_NET_F_GUEST_USO6,
     VIRTIO_NET_F_HOST_USO,
+    VIRTIO_NET_F_NOTF_COAL,
 
     /* This bit implies RARP isn't sent by QEMU out of band */
     VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 3df6091274..4ab8f26ceb 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
     VIRTIO_NET_F_CTRL_RX,
     VIRTIO_NET_F_CTRL_RX_EXTRA,
     VIRTIO_NET_F_CTRL_VLAN,
+    VIRTIO_NET_F_NOTF_COAL,
     VIRTIO_NET_F_CTRL_VQ,
     VIRTIO_NET_F_GSO,
     VIRTIO_NET_F_GUEST_CSUM,
-- 
2.53.0

Re: [PATCH v7] hw/net/virtio-net: add support for notification coalescing

Posted by Eugenio Perez Martin 2 days, 3 hours ago

On Fri, May 15, 2026 at 8:16 PM Koushik Dutta <kdutta@redhat.com> wrote:
>
> Implement VirtIO Network Notification Coalescing (Bit 53).
> This allows the guest to manage interrupt frequency using ethtool
> -C for both RX and TX paths.
>
> - Added VIRTIO_NET_F_NOTF_COAL to host features.
> - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
>   virtio_net_handle_ctrl_iov.
> - Added logic to store and apply rx/tx usecs and max_packets.
> - Added packet counters and threshold logic for both RX and TX data paths.
> - Dynamic Dispatcher: Implemented a dispatcher mechanism that
>   dynamically switches/activates the notification callback logic
>   only after the guest enables TX coalescing via ethtool.
> - Coalescing parameters persist in the destination VM after live migration.
>
> This reduces interrupt overhead by batching notifications based on
> either a packet count or a time-based threshold.
>
> Signed-off-by: Koushik Dutta <kdutta@redhat.com>
> ---
> v7 changes:
>  - Fixed time unit consistency: TX path now uses microseconds matching RX
>  - Changed timer_new_ns to timer_new_us for TX timers
>  - Changed qemu_clock_get_ns to qemu_clock_get_us in TX path
>  - Convert txtimer from nanoseconds to microseconds (txtimer / 1000)
>  - Removed unused rx_index_timer variable from struct and initialization
>  - Fixed feature flag handling: removed conditional check in ctrl handler
> ---
>  hw/net/virtio-net.c            | 193 +++++++++++++++++++++++++++++----
>  include/hw/virtio/virtio-net.h |   9 +-
>  net/passt.c                    |   1 +
>  net/tap.c                      |   1 +
>  net/vhost-user.c               |   1 +
>  net/vhost-vdpa.c               |   1 +
>  6 files changed, 183 insertions(+), 23 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 2a5d642a64..5a98b52ded 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
>   * - we could suppress RX interrupt if we were so inclined.
>   */
>
> +static void virtio_net_rx_notify(void *opaque)
> +{
> +    VirtIONetQueue *q = opaque;
> +    VirtIONet *n = q->n;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +
> +    n->rx_pkt_cnt = 0;
> +    virtio_notify(vdev, q->rx_vq);
> +}
> +
>  static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIONet *n = VIRTIO_NET(vdev);
> @@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
>          if (queue_started) {
>              if (q->tx_timer) {
>                  timer_mod(q->tx_timer,
> -                               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                               qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>              } else {
>                  replay_bh_schedule_event(q->tx_bh);
>              }
> @@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
>      }
>  }
>
> +static void virtio_net_tx_timer(void *opaque);
> +
> +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
> +                                  struct iovec *iov, unsigned int iov_cnt)
> +{
> +    struct virtio_net_ctrl_coal coal;
> +    VirtIONetQueue *q;
> +    size_t s;
> +    int i;
> +
> +    s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
> +    if (s != sizeof(coal)) {
> +        return VIRTIO_NET_ERR;
> +    }
> +
> +    if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
> +        n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
> +        n->rx_coal_packets = le32_to_cpu(coal.max_packets);
> +        if (n->rx_coal_usecs > 0) {
> +            for (i = 0; i < n->max_queue_pairs; i++) {
> +                q = &n->vqs[i];
> +                if (!q->rx_timer) {
> +                    q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                               virtio_net_rx_notify,
> +                                               q);
> +                }
> +            }
> +        }

Should we remove the timer if n->rx_coal_usecs == 0? Same for tx.

> +    } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
> +        n->tx_coal_usecs = le32_to_cpu(coal.max_usecs);
> +        n->tx_coal_packets = le32_to_cpu(coal.max_packets);
> +        if (n->tx_coal_usecs > 0) {
> +            for (i = 0; i < n->max_queue_pairs; i++) {
> +                q = &n->vqs[i];
> +                if (!q->tx_timer && n->tx_coal_usecs > 0) {
> +                    q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                               virtio_net_tx_timer,
> +                                               q);
> +                }
> +            }
> +        }
> +    }
> +
> +    return VIRTIO_NET_OK;
> +}
> +
>  static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
>                                   struct iovec *iov, unsigned int iov_cnt)
>  {
> @@ -1581,6 +1637,8 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
>          status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
>      } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
>          status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
> +    } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
> +        status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
>      }
>
>      s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
> @@ -2040,7 +2098,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
>      }
>
>      virtqueue_flush(q->rx_vq, i);
> -    virtio_notify(vdev, q->rx_vq);
> +
> +    /* rx coalescing */
> +    n->rx_pkt_cnt += i;
> +    if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
> +        if (q->rx_timer) {
> +            timer_del(q->rx_timer);
> +        }
> +        virtio_net_rx_notify(q);
> +    } else {
> +        if (q->rx_timer) {
> +            if (!timer_pending(q->rx_timer)) {
> +                timer_mod(q->rx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> +            }
> +        }
> +    }
>
>      return size;
>
> @@ -2708,7 +2781,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
>              replay_bh_schedule_event(q->tx_bh);
>          } else {
>              timer_mod(q->tx_timer,
> -                      qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                      qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          }
>          q->tx_waiting = 1;
>      }
> @@ -2817,7 +2890,6 @@ detach:
>      return -EINVAL;
>  }
>
> -static void virtio_net_tx_timer(void *opaque);
>
>  static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>  {
> @@ -2842,7 +2914,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>      } else {
>          /* re-arm timer to flush it (and more) on next tick */
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          q->tx_waiting = 1;
>          virtio_queue_set_notification(vq, 0);
>      }
> @@ -2899,6 +2971,12 @@ static void virtio_net_tx_timer(void *opaque)
>      if (ret == -EBUSY || ret == -EINVAL) {
>          return;
>      }
> +    if (n->tx_pkt_cnt < ret) {
> +        n->tx_pkt_cnt = 0;
> +    } else {
> +        n->tx_pkt_cnt -= ret;
> +    }
> +
>      /*
>       * If we flush a full burst of packets, assume there are
>       * more coming and immediately rearm
> @@ -2906,7 +2984,7 @@ static void virtio_net_tx_timer(void *opaque)
>      if (ret >= n->tx_burst) {
>          q->tx_waiting = 1;
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          return;
>      }
>      /*
> @@ -2918,9 +2996,10 @@ static void virtio_net_tx_timer(void *opaque)
>      ret = virtio_net_flush_tx(q);
>      if (ret > 0) {
>          virtio_queue_set_notification(q->tx_vq, 0);
> +        n->tx_pkt_cnt -= ret;
>          q->tx_waiting = 1;
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>      }
>  }
>
> @@ -2973,6 +3052,32 @@ static void virtio_net_tx_bh(void *opaque)
>      }
>  }
>
> +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    VirtIONet *n = VIRTIO_NET(vdev);
> +    VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
> +    bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
> +                     n->tx_coal_packets > 0;

This is from V2, but why is use_timer true whenever
tx_coal_packets > 0? It should be renamed or handled independently.

I'd expect something like:

if (coal_pkts_reached) {
  clear timer if needed()
  notify()
} else if(timer is enabled) {
  do nothing, wait for the timer
}

> +    bool pkt_limit = (n->tx_coal_packets > 0);
> +
> +    if (use_timer) {
> +        n->tx_pkt_cnt++;
> +        if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {
> +            if (q->tx_timer) {
> +                virtio_net_handle_tx_timer(vdev, vq);
> +                return;
> +            }
> +        }
> +        n->tx_pkt_cnt = 0;
> +        if (q->tx_timer) {
> +            timer_del(q->tx_timer);
> +        }
> +        virtio_net_handle_tx_bh(vdev, vq);
> +    } else {
> +        virtio_net_handle_tx_bh(vdev, vq);
> +    }
> +}
> +
>  static void virtio_net_add_queue(VirtIONet *n, int index)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(n);
> @@ -2980,20 +3085,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
>      n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
>                                             virtio_net_handle_rx);
>
> -    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
> -        n->vqs[index].tx_vq =
> -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> -                             virtio_net_handle_tx_timer);
> -        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> -                                              virtio_net_tx_timer,
> -                                              &n->vqs[index]);
> -    } else {
> -        n->vqs[index].tx_vq =
> -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> -                             virtio_net_handle_tx_bh);
> -        n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
> -                                                  &DEVICE(vdev)->mem_reentrancy_guard);
> -    }
> +    n->vqs[index].tx_vq =
> +        virtio_add_queue(vdev,
> +                         n->net_conf.tx_queue_size,
> +                         virtio_net_handle_tx_dispatch);
> +
> +    n->vqs[index].tx_bh =
> +        qemu_bh_new_guarded(virtio_net_tx_bh,
> +                            &n->vqs[index],
> +                            &DEVICE(vdev)->mem_reentrancy_guard);
>
>      n->vqs[index].tx_waiting = 0;
>      n->vqs[index].n = n;
> @@ -3088,6 +3188,9 @@ static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features,
>      virtio_features_or(features, features, n->host_features_ex);
>
>      virtio_add_feature_ex(features, VIRTIO_NET_F_MAC);
> +    if (n->tx_timer_activate) {
> +        virtio_clear_feature_ex(features, VIRTIO_NET_F_NOTF_COAL);
> +    }
>
>      if (!peer_has_vnet_hdr(n)) {
>          virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM);
> @@ -3242,6 +3345,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
>      }
>
>      virtio_net_commit_rss_config(n);
> +    if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
> +
> +        for (i = 0; i < n->max_queue_pairs; i++) {
> +            VirtIONetQueue *q = &n->vqs[i];
> +            if (!q->rx_timer && n->rx_coal_usecs > 0) {
> +                q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                           virtio_net_rx_notify,
> +                                           q);
> +            }
> +
> +            if (!q->tx_timer && n->tx_coal_usecs > 0) {
> +                q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                           virtio_net_tx_timer,
> +                                           q);
> +            }
> +
> +            if (n->tx_coal_usecs > 0 && q->tx_timer) {
> +                n->tx_pkt_cnt = 0;
> +                timer_mod(q->tx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);

Why not do these two steps in the previous if (!q->rx_timer &&
n->rx_coal_usecs > 0) block?

But also, what is the reason to call timer_mod here? The guest will
kick QEMU at the first tx packet, won't it? Same in the rx path.

> +            }
> +
> +            if (n->rx_coal_usecs > 0 && q->rx_timer) {
> +                timer_mod(q->rx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> +            }
> +        }
> +    }
> +
>      return 0;
>  }
>
> @@ -3617,6 +3749,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
>                           vmstate_virtio_net_tx_waiting),
>          VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
>                              has_ctrl_guest_offloads),
> +        VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
> +        VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
> +        VMSTATE_UINT32(rx_coal_packets, VirtIONet),
> +        VMSTATE_UINT32(tx_coal_packets, VirtIONet),
>          VMSTATE_END_OF_LIST()
>      },
>      .subsections = (const VMStateDescription * const []) {
> @@ -3960,7 +4096,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>      }
>      n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
>      n->curr_queue_pairs = 1;
> -    n->tx_timeout = n->net_conf.txtimer;
>
>      if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
>                         && strcmp(n->net_conf.tx, "bh")) {
> @@ -3970,6 +4105,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>          error_printf("Defaulting to \"bh\"");
>      }
>
> +    if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
> +        n->tx_coal_usecs = n->net_conf.txtimer / 1000;
> +        n->tx_timer_activate = true;
> +    } else {
> +        n->tx_coal_usecs = 0;
> +    }
> +
>      n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
>                                      n->net_conf.tx_queue_size);
>
> @@ -4046,6 +4188,11 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>              n->rss_data.specified_hash_types.on_bits |
>              n->rss_data.specified_hash_types.auto_bits;
>      }
> +    n->rx_pkt_cnt = 0;
> +    n->tx_pkt_cnt = 0;
> +    n->rx_coal_usecs = 0;
> +    n->rx_coal_packets = 0;
> +    n->tx_coal_packets = 0;
>  }
>
>  static void virtio_net_device_unrealize(DeviceState *dev)
> @@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
>                        VIRTIO_NET_F_GUEST_USO6, true),
>      DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
>                        VIRTIO_NET_F_HOST_USO, true),
> +    DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
> +                      VIRTIO_NET_F_NOTF_COAL, true),
>      DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
>                                    rss_data.specified_hash_types,
>                                    VIRTIO_NET_HASH_REPORT_IPv4 - 1,
> diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> index 371e376428..b3a7df5ad8 100644
> --- a/include/hw/virtio/virtio-net.h
> +++ b/include/hw/virtio/virtio-net.h
> @@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
>  typedef struct VirtIONetQueue {
>      VirtQueue *rx_vq;
>      VirtQueue *tx_vq;
> +    QEMUTimer *rx_timer;
>      QEMUTimer *tx_timer;
>      QEMUBH *tx_bh;
>      uint32_t tx_waiting;
> @@ -177,7 +178,6 @@ struct VirtIONet {
>      /* RSC Chains - temporary storage of coalesced data,
>         all these data are lost in case of migration */
>      QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
> -    uint32_t tx_timeout;
>      int32_t tx_burst;
>      uint32_t has_vnet_hdr;
>      size_t host_hdr_len;
> @@ -230,6 +230,13 @@ struct VirtIONet {
>      struct EBPFRSSContext ebpf_rss;
>      uint32_t nr_ebpf_rss_fds;
>      char **ebpf_rss_fds;
> +    uint32_t rx_coal_usecs;
> +    uint32_t rx_coal_packets;
> +    uint32_t rx_pkt_cnt;
> +    uint32_t tx_coal_usecs;
> +    uint32_t tx_coal_packets;
> +    uint32_t tx_pkt_cnt;
> +    bool tx_timer_activate;
>  };
>
>  size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> diff --git a/net/passt.c b/net/passt.c
> index 4ff94ee509..0b0d9e222a 100644
> --- a/net/passt.c
> +++ b/net/passt.c
> @@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
>      VIRTIO_NET_F_GUEST_USO4,
>      VIRTIO_NET_F_GUEST_USO6,
>      VIRTIO_NET_F_HOST_USO,
> +    VIRTIO_NET_F_NOTF_COAL,
>
>      /* This bit implies RARP isn't sent by QEMU out of band */
>      VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/tap.c b/net/tap.c
> index 8d7ab6ba6f..ea5987a3dc 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
>      VIRTIO_F_NOTIFICATION_DATA,
>      VIRTIO_NET_F_RSC_EXT,
>      VIRTIO_NET_F_HASH_REPORT,
> +    VIRTIO_NET_F_NOTF_COAL,
>      VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
>      VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
>      VHOST_INVALID_FEATURE_BIT
> diff --git a/net/vhost-user.c b/net/vhost-user.c
> index a4bb49bbcf..f0b3752d7c 100644
> --- a/net/vhost-user.c
> +++ b/net/vhost-user.c
> @@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
>      VIRTIO_NET_F_GUEST_USO4,
>      VIRTIO_NET_F_GUEST_USO6,
>      VIRTIO_NET_F_HOST_USO,
> +    VIRTIO_NET_F_NOTF_COAL,
>
>      /* This bit implies RARP isn't sent by QEMU out of band */
>      VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 3df6091274..4ab8f26ceb 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
>      VIRTIO_NET_F_CTRL_RX,
>      VIRTIO_NET_F_CTRL_RX_EXTRA,
>      VIRTIO_NET_F_CTRL_VLAN,
> +    VIRTIO_NET_F_NOTF_COAL,
>      VIRTIO_NET_F_CTRL_VQ,
>      VIRTIO_NET_F_GSO,
>      VIRTIO_NET_F_GUEST_CSUM,
> --
> 2.53.0
>

Re: [PATCH v7] hw/net/virtio-net: add support for notification coalescing

Posted by Michael S. Tsirkin 6 days, 1 hour ago

On Fri, May 15, 2026 at 11:46:20PM +0530, Koushik Dutta wrote:
> Implement VirtIO Network Notification Coalescing (Bit 53).
> This allows the guest to manage interrupt frequency using ethtool
> -C for both RX and TX paths.
> 
> - Added VIRTIO_NET_F_NOTF_COAL to host features.
> - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
>   virtio_net_handle_ctrl_iov.
> - Added logic to store and apply rx/tx usecs and max_packets.
> - Added packet counters and threshold logic for both RX and TX data paths.
> - Dynamic Dispatcher: Implemented a dispatcher mechanism that
>   dynamically switches/activates the notification callback logic
>   only after the guest enables TX coalescing via ethtool.
> - After VM LM coalescing parameters persist in the destination VM.
> 
> This reduces interrupt overhead by batching notifications based on
> either a packet count or a time-based threshold.
> 
> Signed-off-by: Koushik Dutta <kdutta@redhat.com>
> ---
> v7 changes:
>  - Fixed time unit consistency: TX path now uses microseconds matching RX
>  - Changed timer_new_ns to timer_new_us for TX timers
>  - Changed qemu_clock_get_ns to qemu_clock_get_us in TX path
>  - Convert txtimer from nanoseconds to microseconds (txtimer / 1000)
>  - Removed unused rx_index_timer variable from struct and initialization
>  - Fixed feature flag handling: removed conditional check in ctrl handler

Thanks!
There are still a few things to improve:

> ---
>  hw/net/virtio-net.c            | 193 +++++++++++++++++++++++++++++----
>  include/hw/virtio/virtio-net.h |   9 +-
>  net/passt.c                    |   1 +
>  net/tap.c                      |   1 +
>  net/vhost-user.c               |   1 +
>  net/vhost-vdpa.c               |   1 +
>  6 files changed, 183 insertions(+), 23 deletions(-)
> 
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 2a5d642a64..5a98b52ded 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
>   * - we could suppress RX interrupt if we were so inclined.
>   */
>  
> +static void virtio_net_rx_notify(void *opaque)
> +{
> +    VirtIONetQueue *q = opaque;
> +    VirtIONet *n = q->n;
> +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +
> +    n->rx_pkt_cnt = 0;
> +    virtio_notify(vdev, q->rx_vq);
> +}
> +
>  static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
>  {
>      VirtIONet *n = VIRTIO_NET(vdev);
> @@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
>          if (queue_started) {
>              if (q->tx_timer) {
>                  timer_mod(q->tx_timer,
> -                               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                               qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>              } else {
>                  replay_bh_schedule_event(q->tx_bh);
>              }
> @@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
>      }
>  }
>  
> +static void virtio_net_tx_timer(void *opaque);
> +


I do not like forward declarations like this - please just order
the code sensibly.

If you need to move some code, it can be a separate preparatory patch.

> +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
> +                                  struct iovec *iov, unsigned int iov_cnt)
> +{
> +    struct virtio_net_ctrl_coal coal;
> +    VirtIONetQueue *q;
> +    size_t s;
> +    int i;
> +
> +    s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
> +    if (s != sizeof(coal)) {
> +        return VIRTIO_NET_ERR;
> +    }
> +
> +    if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
> +        n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
> +        n->rx_coal_packets = le32_to_cpu(coal.max_packets);
> +        if (n->rx_coal_usecs > 0) {
> +            for (i = 0; i < n->max_queue_pairs; i++) {
> +                q = &n->vqs[i];
> +                if (!q->rx_timer) {
> +                    q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                               virtio_net_rx_notify,
> +                                               q);
> +                }
> +            }
> +        }
> +    } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
> +        n->tx_coal_usecs = le32_to_cpu(coal.max_usecs);
> +        n->tx_coal_packets = le32_to_cpu(coal.max_packets);
> +        if (n->tx_coal_usecs > 0) {
> +            for (i = 0; i < n->max_queue_pairs; i++) {
> +                q = &n->vqs[i];
> +                if (!q->tx_timer && n->tx_coal_usecs > 0) {
> +                    q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                               virtio_net_tx_timer,
> +                                               q);
> +                }
> +            }
> +        }
> +    }

So, the value is never propagated to any of vhost/vhost-user/vdpa.

Thus, if they decide to implement and advertise it,
they will not get the value and will not coalesce appropriately.
This makes the feature of rather limited use - most deployments use some
kind of offload.


> +
> +    return VIRTIO_NET_OK;
> +}
> +
>  static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
>                                   struct iovec *iov, unsigned int iov_cnt)
>  {
> @@ -1581,6 +1637,8 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
>          status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
>      } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
>          status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
> +    } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
> +        status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
>      }
>  
>      s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
> @@ -2040,7 +2098,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
>      }
>  
>      virtqueue_flush(q->rx_vq, i);
> -    virtio_notify(vdev, q->rx_vq);
> +
> +    /* rx coalescing */
> +    n->rx_pkt_cnt += i;
> +    if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
> +        if (q->rx_timer) {
> +            timer_del(q->rx_timer);
> +        }
> +        virtio_net_rx_notify(q);
> +    } else {
> +        if (q->rx_timer) {
> +            if (!timer_pending(q->rx_timer)) {
> +                timer_mod(q->rx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> +            }
> +        }
> +    }
>  
>      return size;
>  
> @@ -2708,7 +2781,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
>              replay_bh_schedule_event(q->tx_bh);
>          } else {
>              timer_mod(q->tx_timer,
> -                      qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                      qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          }
>          q->tx_waiting = 1;
>      }
> @@ -2817,7 +2890,6 @@ detach:
>      return -EINVAL;
>  }
>  
> -static void virtio_net_tx_timer(void *opaque);
>  
>  static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>  {
> @@ -2842,7 +2914,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
>      } else {
>          /* re-arm timer to flush it (and more) on next tick */
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          q->tx_waiting = 1;
>          virtio_queue_set_notification(vq, 0);
>      }
> @@ -2899,6 +2971,12 @@ static void virtio_net_tx_timer(void *opaque)
>      if (ret == -EBUSY || ret == -EINVAL) {
>          return;
>      }
> +    if (n->tx_pkt_cnt < ret) {
> +        n->tx_pkt_cnt = 0;
> +    } else {
> +        n->tx_pkt_cnt -= ret;
> +    }
> +
>      /*
>       * If we flush a full burst of packets, assume there are
>       * more coming and immediately rearm
> @@ -2906,7 +2984,7 @@ static void virtio_net_tx_timer(void *opaque)
>      if (ret >= n->tx_burst) {
>          q->tx_waiting = 1;
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>          return;
>      }
>      /*
> @@ -2918,9 +2996,10 @@ static void virtio_net_tx_timer(void *opaque)
>      ret = virtio_net_flush_tx(q);
>      if (ret > 0) {
>          virtio_queue_set_notification(q->tx_vq, 0);
> +        n->tx_pkt_cnt -= ret;
>          q->tx_waiting = 1;
>          timer_mod(q->tx_timer,
> -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
>      }
>  }
>  
> @@ -2973,6 +3052,32 @@ static void virtio_net_tx_bh(void *opaque)
>      }
>  }
>  
> +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
> +{
> +    VirtIONet *n = VIRTIO_NET(vdev);
> +    VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
> +    bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
> +                     n->tx_coal_packets > 0;
> +    bool pkt_limit = (n->tx_coal_packets > 0);
> +
> +    if (use_timer) {
> +        n->tx_pkt_cnt++;
> +        if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {


It seems coalescing is exclusive with bh use?



> +            if (q->tx_timer) {
> +                virtio_net_handle_tx_timer(vdev, vq);
> +                return;
> +            }
> +        }
> +        n->tx_pkt_cnt = 0;
> +        if (q->tx_timer) {
> +            timer_del(q->tx_timer);
> +        }
> +        virtio_net_handle_tx_bh(vdev, vq);
> +    } else {
> +        virtio_net_handle_tx_bh(vdev, vq);
> +    }
> +}
> +

So, you unified tx handling in one place?
This is better done in a preparatory patch.

>  static void virtio_net_add_queue(VirtIONet *n, int index)
>  {
>      VirtIODevice *vdev = VIRTIO_DEVICE(n);
> @@ -2980,20 +3085,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
>      n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
>                                             virtio_net_handle_rx);
>  
> -    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
> -        n->vqs[index].tx_vq =
> -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> -                             virtio_net_handle_tx_timer);
> -        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> -                                              virtio_net_tx_timer,
> -                                              &n->vqs[index]);
> -    } else {
> -        n->vqs[index].tx_vq =
> -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> -                             virtio_net_handle_tx_bh);
> -        n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
> -                                                  &DEVICE(vdev)->mem_reentrancy_guard);
> -    }
> +    n->vqs[index].tx_vq =
> +        virtio_add_queue(vdev,
> +                         n->net_conf.tx_queue_size,
> +                         virtio_net_handle_tx_dispatch);
> +
> +    n->vqs[index].tx_bh =
> +        qemu_bh_new_guarded(virtio_net_tx_bh,
> +                            &n->vqs[index],
> +                            &DEVICE(vdev)->mem_reentrancy_guard);
>  
>      n->vqs[index].tx_waiting = 0;
>      n->vqs[index].n = n;
> @@ -3088,6 +3188,9 @@ static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features,
>      virtio_features_or(features, features, n->host_features_ex);
>  
>      virtio_add_feature_ex(features, VIRTIO_NET_F_MAC);
> +    if (n->tx_timer_activate) {
> +        virtio_clear_feature_ex(features, VIRTIO_NET_F_NOTF_COAL);
> +    }
>  
>      if (!peer_has_vnet_hdr(n)) {
>          virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM);
> @@ -3242,6 +3345,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
>      }
>  
>      virtio_net_commit_rss_config(n);
> +    if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
> +
> +        for (i = 0; i < n->max_queue_pairs; i++) {
> +            VirtIONetQueue *q = &n->vqs[i];
> +            if (!q->rx_timer && n->rx_coal_usecs > 0) {
> +                q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                           virtio_net_rx_notify,
> +                                           q);
> +            }
> +
> +            if (!q->tx_timer && n->tx_coal_usecs > 0) {
> +                q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> +                                           virtio_net_tx_timer,
> +                                           q);
> +            }
> +
> +            if (n->tx_coal_usecs > 0 && q->tx_timer) {
> +                n->tx_pkt_cnt = 0;
> +                timer_mod(q->tx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> +            }
> +
> +            if (n->rx_coal_usecs > 0 && q->rx_timer) {
> +                timer_mod(q->rx_timer,
> +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> +            }
> +        }
> +    }
> +
>      return 0;
>  }
>  
> @@ -3617,6 +3749,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
>                           vmstate_virtio_net_tx_waiting),
>          VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
>                              has_ctrl_guest_offloads),
> +        VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
> +        VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
> +        VMSTATE_UINT32(rx_coal_packets, VirtIONet),
> +        VMSTATE_UINT32(tx_coal_packets, VirtIONet),
>          VMSTATE_END_OF_LIST()
>      },
>      .subsections = (const VMStateDescription * const []) {
> @@ -3960,7 +4096,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>      }
>      n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
>      n->curr_queue_pairs = 1;
> -    n->tx_timeout = n->net_conf.txtimer;
>  
>      if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
>                         && strcmp(n->net_conf.tx, "bh")) {
> @@ -3970,6 +4105,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>          error_printf("Defaulting to \"bh\"");
>      }
>  
> +    if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
> +        n->tx_coal_usecs = n->net_conf.txtimer / 1000;

Add a code comment explaining what is going on. Why is losing
precision not a concern? Maybe we should do the reverse and multiply?


> +        n->tx_timer_activate = true;
> +    } else {
> +        n->tx_coal_usecs = 0;
> +    }
> +
>      n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
>                                      n->net_conf.tx_queue_size);
>  
> @@ -4046,6 +4188,11 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
>              n->rss_data.specified_hash_types.on_bits |
>              n->rss_data.specified_hash_types.auto_bits;
>      }
> +    n->rx_pkt_cnt = 0;
> +    n->tx_pkt_cnt = 0;
> +    n->rx_coal_usecs = 0;
> +    n->rx_coal_packets = 0;
> +    n->tx_coal_packets = 0;
>  }
>  
>  static void virtio_net_device_unrealize(DeviceState *dev)
> @@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
>                        VIRTIO_NET_F_GUEST_USO6, true),
>      DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
>                        VIRTIO_NET_F_HOST_USO, true),
> +    DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
> +                      VIRTIO_NET_F_NOTF_COAL, true),


We can't change host features like this without compat machinery.



>      DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
>                                    rss_data.specified_hash_types,
>                                    VIRTIO_NET_HASH_REPORT_IPv4 - 1,
> diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> index 371e376428..b3a7df5ad8 100644
> --- a/include/hw/virtio/virtio-net.h
> +++ b/include/hw/virtio/virtio-net.h
> @@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
>  typedef struct VirtIONetQueue {
>      VirtQueue *rx_vq;
>      VirtQueue *tx_vq;
> +    QEMUTimer *rx_timer;
>      QEMUTimer *tx_timer;
>      QEMUBH *tx_bh;
>      uint32_t tx_waiting;
> @@ -177,7 +178,6 @@ struct VirtIONet {
>      /* RSC Chains - temporary storage of coalesced data,
>         all these data are lost in case of migration */
>      QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
> -    uint32_t tx_timeout;
>      int32_t tx_burst;
>      uint32_t has_vnet_hdr;
>      size_t host_hdr_len;
> @@ -230,6 +230,13 @@ struct VirtIONet {
>      struct EBPFRSSContext ebpf_rss;
>      uint32_t nr_ebpf_rss_fds;
>      char **ebpf_rss_fds;
> +    uint32_t rx_coal_usecs;
> +    uint32_t rx_coal_packets;
> +    uint32_t rx_pkt_cnt;
> +    uint32_t tx_coal_usecs;
> +    uint32_t tx_coal_packets;
> +    uint32_t tx_pkt_cnt;
> +    bool tx_timer_activate;
>  };

Can we get some documentation on what each of these is?


>  
>  size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> diff --git a/net/passt.c b/net/passt.c
> index 4ff94ee509..0b0d9e222a 100644
> --- a/net/passt.c
> +++ b/net/passt.c
> @@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
>      VIRTIO_NET_F_GUEST_USO4,
>      VIRTIO_NET_F_GUEST_USO6,
>      VIRTIO_NET_F_HOST_USO,
> +    VIRTIO_NET_F_NOTF_COAL,
>  
>      /* This bit implies RARP isn't sent by QEMU out of band */
>      VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/tap.c b/net/tap.c
> index 8d7ab6ba6f..ea5987a3dc 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
>      VIRTIO_F_NOTIFICATION_DATA,
>      VIRTIO_NET_F_RSC_EXT,
>      VIRTIO_NET_F_HASH_REPORT,
> +    VIRTIO_NET_F_NOTF_COAL,
>      VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
>      VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
>      VHOST_INVALID_FEATURE_BIT
> diff --git a/net/vhost-user.c b/net/vhost-user.c
> index a4bb49bbcf..f0b3752d7c 100644
> --- a/net/vhost-user.c
> +++ b/net/vhost-user.c
> @@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
>      VIRTIO_NET_F_GUEST_USO4,
>      VIRTIO_NET_F_GUEST_USO6,
>      VIRTIO_NET_F_HOST_USO,
> +    VIRTIO_NET_F_NOTF_COAL,
>  
>      /* This bit implies RARP isn't sent by QEMU out of band */
>      VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 3df6091274..4ab8f26ceb 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
>      VIRTIO_NET_F_CTRL_RX,
>      VIRTIO_NET_F_CTRL_RX_EXTRA,
>      VIRTIO_NET_F_CTRL_VLAN,
> +    VIRTIO_NET_F_NOTF_COAL,
>      VIRTIO_NET_F_CTRL_VQ,
>      VIRTIO_NET_F_GSO,
>      VIRTIO_NET_F_GUEST_CSUM,
> -- 
> 2.53.0

Re: [PATCH v7] hw/net/virtio-net: add support for notification coalescing

Posted by Eugenio Perez Martin 2 days, 3 hours ago

On Sun, May 24, 2026 at 6:42 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 15, 2026 at 11:46:20PM +0530, Koushik Dutta wrote:
> > Implement VirtIO Network Notification Coalescing (Bit 53).
> > This allows the guest to manage interrupt frequency using ethtool
> > -C for both RX and TX paths.
> >
> > - Added VIRTIO_NET_F_NOTF_COAL to host features.
> > - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
> >   virtio_net_handle_ctrl_iov.
> > - Added logic to store and apply rx/tx usecs and max_packets.
> > - Added packet counters and threshold logic for both RX and TX data paths.
> > - Dynamic Dispatcher: Implemented a dispatcher mechanism that
> >   dynamically switches/activates the notification callback logic
> >   only after the guest enables TX coalescing via ethtool.
> > - After VM LM coalescing parameters persist in the destination VM.
> >
> > This reduces interrupt overhead by batching notifications based on
> > either a packet count or a time-based threshold.
> >
> > Signed-off-by: Koushik Dutta <kdutta@redhat.com>
> > ---
> > v7 changes:
> >  - Fixed time unit consistency: TX path now uses microseconds matching RX
> >  - Changed timer_new_ns to timer_new_us for TX timers
> >  - Changed qemu_clock_get_ns to qemu_clock_get_us in TX path
> >  - Convert txtimer from nanoseconds to microseconds (txtimer / 1000)
> >  - Removed unused rx_index_timer variable from struct and initialization
> >  - Fixed feature flag handling: removed conditional check in ctrl handler
>
> Thanks!
> Yet something to improve:
>
> > ---
> >  hw/net/virtio-net.c            | 193 +++++++++++++++++++++++++++++----
> >  include/hw/virtio/virtio-net.h |   9 +-
> >  net/passt.c                    |   1 +
> >  net/tap.c                      |   1 +
> >  net/vhost-user.c               |   1 +
> >  net/vhost-vdpa.c               |   1 +
> >  6 files changed, 183 insertions(+), 23 deletions(-)
> >
> > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> > index 2a5d642a64..5a98b52ded 100644
> > --- a/hw/net/virtio-net.c
> > +++ b/hw/net/virtio-net.c
> > @@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
> >   * - we could suppress RX interrupt if we were so inclined.
> >   */
> >
> > +static void virtio_net_rx_notify(void *opaque)
> > +{
> > +    VirtIONetQueue *q = opaque;
> > +    VirtIONet *n = q->n;
> > +    VirtIODevice *vdev = VIRTIO_DEVICE(n);
> > +
> > +    n->rx_pkt_cnt = 0;
> > +    virtio_notify(vdev, q->rx_vq);
> > +}
> > +
> >  static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
> >  {
> >      VirtIONet *n = VIRTIO_NET(vdev);
> > @@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
> >          if (queue_started) {
> >              if (q->tx_timer) {
> >                  timer_mod(q->tx_timer,
> > -                               qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> > +                               qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> >              } else {
> >                  replay_bh_schedule_event(q->tx_bh);
> >              }
> > @@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
> >      }
> >  }
> >
> > +static void virtio_net_tx_timer(void *opaque);
> > +
>
>
> I do not like forward declarations like this - pls just order
> the code sensibly.
>
> If you need to move some code, it can be a separate preparatory patch.
>
> > +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
> > +                                  struct iovec *iov, unsigned int iov_cnt)
> > +{
> > +    struct virtio_net_ctrl_coal coal;
> > +    VirtIONetQueue *q;
> > +    size_t s;
> > +    int i;
> > +
> > +    s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
> > +    if (s != sizeof(coal)) {
> > +        return VIRTIO_NET_ERR;
> > +    }
> > +
> > +    if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
> > +        n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
> > +        n->rx_coal_packets = le32_to_cpu(coal.max_packets);
> > +        if (n->rx_coal_usecs > 0) {
> > +            for (i = 0; i < n->max_queue_pairs; i++) {
> > +                q = &n->vqs[i];
> > +                if (!q->rx_timer) {
> > +                    q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> > +                                               virtio_net_rx_notify,
> > +                                               q);
> > +                }
> > +            }
> > +        }
> > +    } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
> > +        n->tx_coal_usecs = le32_to_cpu(coal.max_usecs);
> > +        n->tx_coal_packets = le32_to_cpu(coal.max_packets);
> > +        if (n->tx_coal_usecs > 0) {
> > +            for (i = 0; i < n->max_queue_pairs; i++) {
> > +                q = &n->vqs[i];
> > +                if (!q->tx_timer && n->tx_coal_usecs > 0) {
> > +                    q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> > +                                               virtio_net_tx_timer,
> > +                                               q);
> > +                }
> > +            }
> > +        }
> > +    }
>
> So, the value is never propagated to any of vhost/vhost-user/vdpa.
>
> Thus if they decide to implement and advertise it,
> They will not get the value and will not coalesce appropriately.
> Makes this of a rather limited use - most deployments use some kind
> of offload.
>

Vhost-vdpa devices should support this feature with this patch as long
as they advertise it. Migration with this feature using a vhost-vdpa
backend is not supported, though, because the shadow control virtqueue
doesn't support it yet. Support for SVQ is planned for a subsequent
patch. It's easy, along the lines of loading the rest of the features at
the destination QEMU. Koushik, please add it to the patch message for
the next version.

However, there is no vhost-kernel ioctl or vhost-user message, so they
should not offer the feature until it's supported by them. Or should
we also block them some other way?

Now that we are here, maybe the timers should not be allocated as long
as there is a vhost (nc->peer). But their lifecycle gets more
complicated. MST, what do you think?

>
> > +
> > +    return VIRTIO_NET_OK;
> > +}
> > +
> >  static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
> >                                   struct iovec *iov, unsigned int iov_cnt)
> >  {
> > @@ -1581,6 +1637,8 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> >          status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
> >      } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
> >          status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
> > +    } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
> > +        status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
> >      }
> >
> >      s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
> > @@ -2040,7 +2098,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
> >      }
> >
> >      virtqueue_flush(q->rx_vq, i);
> > -    virtio_notify(vdev, q->rx_vq);
> > +
> > +    /* rx coalescing */
> > +    n->rx_pkt_cnt += i;
> > +    if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
> > +        if (q->rx_timer) {
> > +            timer_del(q->rx_timer);
> > +        }
> > +        virtio_net_rx_notify(q);
> > +    } else {
> > +        if (q->rx_timer) {
> > +            if (!timer_pending(q->rx_timer)) {
> > +                timer_mod(q->rx_timer,
> > +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> > +            }
> > +        }
> > +    }
> >
> >      return size;
> >
> > @@ -2708,7 +2781,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
> >              replay_bh_schedule_event(q->tx_bh);
> >          } else {
> >              timer_mod(q->tx_timer,
> > -                      qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> > +                      qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> >          }
> >          q->tx_waiting = 1;
> >      }
> > @@ -2817,7 +2890,6 @@ detach:
> >      return -EINVAL;
> >  }
> >
> > -static void virtio_net_tx_timer(void *opaque);
> >
> >  static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
> >  {
> > @@ -2842,7 +2914,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
> >      } else {
> >          /* re-arm timer to flush it (and more) on next tick */
> >          timer_mod(q->tx_timer,
> > -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> > +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> >          q->tx_waiting = 1;
> >          virtio_queue_set_notification(vq, 0);
> >      }
> > @@ -2899,6 +2971,12 @@ static void virtio_net_tx_timer(void *opaque)
> >      if (ret == -EBUSY || ret == -EINVAL) {
> >          return;
> >      }
> > +    if (n->tx_pkt_cnt < ret) {
> > +        n->tx_pkt_cnt = 0;
> > +    } else {
> > +        n->tx_pkt_cnt -= ret;
> > +    }
> > +
> >      /*
> >       * If we flush a full burst of packets, assume there are
> >       * more coming and immediately rearm
> > @@ -2906,7 +2984,7 @@ static void virtio_net_tx_timer(void *opaque)
> >      if (ret >= n->tx_burst) {
> >          q->tx_waiting = 1;
> >          timer_mod(q->tx_timer,
> > -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> > +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> >          return;
> >      }
> >      /*
> > @@ -2918,9 +2996,10 @@ static void virtio_net_tx_timer(void *opaque)
> >      ret = virtio_net_flush_tx(q);
> >      if (ret > 0) {
> >          virtio_queue_set_notification(q->tx_vq, 0);
> > +        n->tx_pkt_cnt -= ret;
> >          q->tx_waiting = 1;
> >          timer_mod(q->tx_timer,
> > -                  qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> > +                  qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> >      }
> >  }
> >
> > @@ -2973,6 +3052,32 @@ static void virtio_net_tx_bh(void *opaque)
> >      }
> >  }
> >
> > +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
> > +{
> > +    VirtIONet *n = VIRTIO_NET(vdev);
> > +    VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
> > +    bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
> > +                     n->tx_coal_packets > 0;
> > +    bool pkt_limit = (n->tx_coal_packets > 0);
> > +
> > +    if (use_timer) {
> > +        n->tx_pkt_cnt++;
> > +        if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {
>
>
> It seems coalescing is exclusive with bh use?
>
>
>
> > +            if (q->tx_timer) {
> > +                virtio_net_handle_tx_timer(vdev, vq);
> > +                return;
> > +            }
> > +        }
> > +        n->tx_pkt_cnt = 0;
> > +        if (q->tx_timer) {
> > +            timer_del(q->tx_timer);
> > +        }
> > +        virtio_net_handle_tx_bh(vdev, vq);
> > +    } else {
> > +        virtio_net_handle_tx_bh(vdev, vq);
> > +    }
> > +}
> > +
>
> So, you unified tx handling in one place?
> This is better done in a preparatory patch.
>
> >  static void virtio_net_add_queue(VirtIONet *n, int index)
> >  {
> >      VirtIODevice *vdev = VIRTIO_DEVICE(n);
> > @@ -2980,20 +3085,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
> >      n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
> >                                             virtio_net_handle_rx);
> >
> > -    if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
> > -        n->vqs[index].tx_vq =
> > -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> > -                             virtio_net_handle_tx_timer);
> > -        n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> > -                                              virtio_net_tx_timer,
> > -                                              &n->vqs[index]);
> > -    } else {
> > -        n->vqs[index].tx_vq =
> > -            virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> > -                             virtio_net_handle_tx_bh);
> > -        n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
> > -                                                  &DEVICE(vdev)->mem_reentrancy_guard);
> > -    }
> > +    n->vqs[index].tx_vq =
> > +        virtio_add_queue(vdev,
> > +                         n->net_conf.tx_queue_size,
> > +                         virtio_net_handle_tx_dispatch);
> > +
> > +    n->vqs[index].tx_bh =
> > +        qemu_bh_new_guarded(virtio_net_tx_bh,
> > +                            &n->vqs[index],
> > +                            &DEVICE(vdev)->mem_reentrancy_guard);
> >
> >      n->vqs[index].tx_waiting = 0;
> >      n->vqs[index].n = n;
> > @@ -3088,6 +3188,9 @@ static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features,
> >      virtio_features_or(features, features, n->host_features_ex);
> >
> >      virtio_add_feature_ex(features, VIRTIO_NET_F_MAC);
> > +    if (n->tx_timer_activate) {
> > +        virtio_clear_feature_ex(features, VIRTIO_NET_F_NOTF_COAL);
> > +    }
> >
> >      if (!peer_has_vnet_hdr(n)) {
> >          virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM);
> > @@ -3242,6 +3345,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
> >      }
> >
> >      virtio_net_commit_rss_config(n);
> > +    if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
> > +
> > +        for (i = 0; i < n->max_queue_pairs; i++) {
> > +            VirtIONetQueue *q = &n->vqs[i];
> > +            if (!q->rx_timer && n->rx_coal_usecs > 0) {
> > +                q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> > +                                           virtio_net_rx_notify,
> > +                                           q);
> > +            }
> > +
> > +            if (!q->tx_timer && n->tx_coal_usecs > 0) {
> > +                q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> > +                                           virtio_net_tx_timer,
> > +                                           q);
> > +            }
> > +
> > +            if (n->tx_coal_usecs > 0 && q->tx_timer) {
> > +                n->tx_pkt_cnt = 0;
> > +                timer_mod(q->tx_timer,
> > +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> > +            }
> > +
> > +            if (n->rx_coal_usecs > 0 && q->rx_timer) {
> > +                timer_mod(q->rx_timer,
> > +                          qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> > +            }
> > +        }
> > +    }
> > +
> >      return 0;
> >  }
> >
> > @@ -3617,6 +3749,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
> >                           vmstate_virtio_net_tx_waiting),
> >          VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
> >                              has_ctrl_guest_offloads),
> > +        VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
> > +        VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
> > +        VMSTATE_UINT32(rx_coal_packets, VirtIONet),
> > +        VMSTATE_UINT32(tx_coal_packets, VirtIONet),
> >          VMSTATE_END_OF_LIST()
> >      },
> >      .subsections = (const VMStateDescription * const []) {
> > @@ -3960,7 +4096,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> >      }
> >      n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
> >      n->curr_queue_pairs = 1;
> > -    n->tx_timeout = n->net_conf.txtimer;
> >
> >      if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
> >                         && strcmp(n->net_conf.tx, "bh")) {
> > @@ -3970,6 +4105,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> >          error_printf("Defaulting to \"bh\"");
> >      }
> >
> > +    if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
> > +        n->tx_coal_usecs = n->net_conf.txtimer / 1000;
>
> Add a code comment explaining what is going on. Why is losing
> precision not a concern? Maybe we should do the reverse and multiply?
>
>
> > +        n->tx_timer_activate = true;
> > +    } else {
> > +        n->tx_coal_usecs = 0;
> > +    }
> > +
> >      n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
> >                                      n->net_conf.tx_queue_size);
> >
> > @@ -4046,6 +4188,11 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> >              n->rss_data.specified_hash_types.on_bits |
> >              n->rss_data.specified_hash_types.auto_bits;
> >      }
> > +    n->rx_pkt_cnt = 0;
> > +    n->tx_pkt_cnt = 0;
> > +    n->rx_coal_usecs = 0;
> > +    n->rx_coal_packets = 0;
> > +    n->tx_coal_packets = 0;
> >  }
> >
> >  static void virtio_net_device_unrealize(DeviceState *dev)
> > @@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
> >                        VIRTIO_NET_F_GUEST_USO6, true),
> >      DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> >                        VIRTIO_NET_F_HOST_USO, true),
> > +    DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
> > +                      VIRTIO_NET_F_NOTF_COAL, true),
>
>
> We can't change host features like this without compat machinery.
>

Koushik: To do this, please replicate the hw/core/machine.c changes
from commit 1c79ab6937ae938d3dfd4da1c01afc7eb599857e ("virtio-net:
Advertise UDP tunnel GSO support by default"). Afterwards, try
migrating from a stable QEMU release such as 11.0 to a QEMU build
with your changes.


>
>
> >      DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
> >                                    rss_data.specified_hash_types,
> >                                    VIRTIO_NET_HASH_REPORT_IPv4 - 1,
> > diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> > index 371e376428..b3a7df5ad8 100644
> > --- a/include/hw/virtio/virtio-net.h
> > +++ b/include/hw/virtio/virtio-net.h
> > @@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
> >  typedef struct VirtIONetQueue {
> >      VirtQueue *rx_vq;
> >      VirtQueue *tx_vq;
> > +    QEMUTimer *rx_timer;
> >      QEMUTimer *tx_timer;
> >      QEMUBH *tx_bh;
> >      uint32_t tx_waiting;
> > @@ -177,7 +178,6 @@ struct VirtIONet {
> >      /* RSC Chains - temporary storage of coalesced data,
> >         all these data are lost in case of migration */
> >      QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
> > -    uint32_t tx_timeout;
> >      int32_t tx_burst;
> >      uint32_t has_vnet_hdr;
> >      size_t host_hdr_len;
> > @@ -230,6 +230,13 @@ struct VirtIONet {
> >      struct EBPFRSSContext ebpf_rss;
> >      uint32_t nr_ebpf_rss_fds;
> >      char **ebpf_rss_fds;
> > +    uint32_t rx_coal_usecs;
> > +    uint32_t rx_coal_packets;
> > +    uint32_t rx_pkt_cnt;
> > +    uint32_t tx_coal_usecs;
> > +    uint32_t tx_coal_packets;
> > +    uint32_t tx_pkt_cnt;
> > +    bool tx_timer_activate;
> >  };
>
> Can we get some documentation on what each of these is?
>
>
> >
> >  size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> > diff --git a/net/passt.c b/net/passt.c
> > index 4ff94ee509..0b0d9e222a 100644
> > --- a/net/passt.c
> > +++ b/net/passt.c
> > @@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
> >      VIRTIO_NET_F_GUEST_USO4,
> >      VIRTIO_NET_F_GUEST_USO6,
> >      VIRTIO_NET_F_HOST_USO,
> > +    VIRTIO_NET_F_NOTF_COAL,
> >
> >      /* This bit implies RARP isn't sent by QEMU out of band */
> >      VIRTIO_NET_F_GUEST_ANNOUNCE,
> > diff --git a/net/tap.c b/net/tap.c
> > index 8d7ab6ba6f..ea5987a3dc 100644
> > --- a/net/tap.c
> > +++ b/net/tap.c
> > @@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
> >      VIRTIO_F_NOTIFICATION_DATA,
> >      VIRTIO_NET_F_RSC_EXT,
> >      VIRTIO_NET_F_HASH_REPORT,
> > +    VIRTIO_NET_F_NOTF_COAL,
> >      VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
> >      VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
> >      VHOST_INVALID_FEATURE_BIT
> > diff --git a/net/vhost-user.c b/net/vhost-user.c
> > index a4bb49bbcf..f0b3752d7c 100644
> > --- a/net/vhost-user.c
> > +++ b/net/vhost-user.c
> > @@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
> >      VIRTIO_NET_F_GUEST_USO4,
> >      VIRTIO_NET_F_GUEST_USO6,
> >      VIRTIO_NET_F_HOST_USO,
> > +    VIRTIO_NET_F_NOTF_COAL,
> >
> >      /* This bit implies RARP isn't sent by QEMU out of band */
> >      VIRTIO_NET_F_GUEST_ANNOUNCE,
> > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> > index 3df6091274..4ab8f26ceb 100644
> > --- a/net/vhost-vdpa.c
> > +++ b/net/vhost-vdpa.c
> > @@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
> >      VIRTIO_NET_F_CTRL_RX,
> >      VIRTIO_NET_F_CTRL_RX_EXTRA,
> >      VIRTIO_NET_F_CTRL_VLAN,
> > +    VIRTIO_NET_F_NOTF_COAL,
> >      VIRTIO_NET_F_CTRL_VQ,
> >      VIRTIO_NET_F_GSO,
> >      VIRTIO_NET_F_GUEST_CSUM,
> > --
> > 2.53.0
>