hw/net/virtio-net.c | 193 +++++++++++++++++++++++++++++---- include/hw/virtio/virtio-net.h | 10 +- net/passt.c | 1 + net/tap.c | 1 + net/vhost-user.c | 1 + net/vhost-vdpa.c | 1 + 6 files changed, 184 insertions(+), 23 deletions(-)
Implement VirtIO Network Notification Coalescing (Bit 53).
This allows the guest to manage interrupt frequency using ethtool
-C for both RX and TX paths.
- Added VIRTIO_NET_F_NOTF_COAL to host features.
- Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
virtio_net_handle_ctrl_iov.
- Added logic to store and apply rx/tx usecs and max_packets.
- Added packet counters and threshold logic for both RX and TX data paths.
- Dynamic Dispatcher: Implemented a dispatcher mechanism that
dynamically switches/activates the notification callback logic
only after the guest enables TX coalescing via ethtool.
- After VM LM coalescing parameters persist in the destination VM.
This reduces interrupt overhead by batching notifications based on
either a packet count or a time-based threshold.
Signed-off-by: Koushik Dutta <kdutta@redhat.com>
---
hw/net/virtio-net.c | 193 +++++++++++++++++++++++++++++----
include/hw/virtio/virtio-net.h | 10 +-
net/passt.c | 1 +
net/tap.c | 1 +
net/vhost-user.c | 1 +
net/vhost-vdpa.c | 1 +
6 files changed, 184 insertions(+), 23 deletions(-)
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 2a5d642a64..aabba6a4be 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
* - we could suppress RX interrupt if we were so inclined.
*/
+static void virtio_net_rx_notify(void *opaque)
+{
+ VirtIONetQueue *q = opaque;
+ VirtIONet *n = q->n;
+ VirtIODevice *vdev = VIRTIO_DEVICE(n);
+
+ n->rx_pkt_cnt = 0;
+ virtio_notify(vdev, q->rx_vq);
+}
+
static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
{
VirtIONet *n = VIRTIO_NET(vdev);
@@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
if (queue_started) {
if (q->tx_timer) {
timer_mod(q->tx_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
} else {
replay_bh_schedule_event(q->tx_bh);
}
@@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
}
}
+static void virtio_net_tx_timer(void *opaque);
+
+static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
+ struct iovec *iov, unsigned int iov_cnt)
+{
+ struct virtio_net_ctrl_coal coal;
+ VirtIONetQueue *q;
+ size_t s;
+ int i;
+
+ s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
+ if (s != sizeof(coal)) {
+ return VIRTIO_NET_ERR;
+ }
+
+ if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
+ n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
+ n->rx_coal_packets = le32_to_cpu(coal.max_packets);
+ if (n->rx_coal_usecs > 0) {
+ for (i = 0; i < n->max_queue_pairs; i++) {
+ q = &n->vqs[i];
+ if (!q->rx_timer) {
+ q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+ virtio_net_rx_notify,
+ q);
+ }
+ }
+ }
+ } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
+ n->tx_coal_usecs = le32_to_cpu(coal.max_usecs) * 1000;
+ n->tx_coal_packets = le32_to_cpu(coal.max_packets);
+ if (n->tx_coal_usecs > 0) {
+ for (i = 0; i < n->max_queue_pairs; i++) {
+ q = &n->vqs[i];
+ if (!q->tx_timer && n->tx_coal_usecs > 0) {
+ q->tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ virtio_net_tx_timer,
+ q);
+ }
+ }
+ }
+ }
+
+ return VIRTIO_NET_OK;
+}
+
static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
struct iovec *iov, unsigned int iov_cnt)
{
@@ -1581,6 +1637,10 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
} else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
+ } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
+ if (!n->tx_timer_activate) {
+ status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
+ }
}
s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
@@ -2040,7 +2100,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
}
virtqueue_flush(q->rx_vq, i);
- virtio_notify(vdev, q->rx_vq);
+
+ /* rx coalescing */
+ n->rx_pkt_cnt += i;
+ if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
+ if (q->rx_timer) {
+ timer_del(q->rx_timer);
+ }
+ virtio_net_rx_notify(q);
+ } else {
+ if (q->rx_timer) {
+ if (!timer_pending(q->rx_timer)) {
+ timer_mod(q->rx_timer,
+ qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
+ }
+ }
+ }
return size;
@@ -2708,7 +2783,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
replay_bh_schedule_event(q->tx_bh);
} else {
timer_mod(q->tx_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
}
q->tx_waiting = 1;
}
@@ -2817,7 +2892,6 @@ detach:
return -EINVAL;
}
-static void virtio_net_tx_timer(void *opaque);
static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
{
@@ -2842,7 +2916,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
} else {
/* re-arm timer to flush it (and more) on next tick */
timer_mod(q->tx_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
q->tx_waiting = 1;
virtio_queue_set_notification(vq, 0);
}
@@ -2899,6 +2973,12 @@ static void virtio_net_tx_timer(void *opaque)
if (ret == -EBUSY || ret == -EINVAL) {
return;
}
+ if (n->tx_pkt_cnt < ret) {
+ n->tx_pkt_cnt = 0;
+ } else {
+ n->tx_pkt_cnt -= ret;
+ }
+
/*
* If we flush a full burst of packets, assume there are
* more coming and immediately rearm
@@ -2906,7 +2986,7 @@ static void virtio_net_tx_timer(void *opaque)
if (ret >= n->tx_burst) {
q->tx_waiting = 1;
timer_mod(q->tx_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
return;
}
/*
@@ -2918,9 +2998,10 @@ static void virtio_net_tx_timer(void *opaque)
ret = virtio_net_flush_tx(q);
if (ret > 0) {
virtio_queue_set_notification(q->tx_vq, 0);
+ n->tx_pkt_cnt -= ret;
q->tx_waiting = 1;
timer_mod(q->tx_timer,
- qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
}
}
@@ -2973,6 +3054,32 @@ static void virtio_net_tx_bh(void *opaque)
}
}
+static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
+{
+ VirtIONet *n = VIRTIO_NET(vdev);
+ VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
+ bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
+ n->tx_coal_packets > 0;
+ bool pkt_limit = (n->tx_coal_packets > 0);
+
+ if (use_timer) {
+ n->tx_pkt_cnt++;
+ if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {
+ if (q->tx_timer) {
+ virtio_net_handle_tx_timer(vdev, vq);
+ return;
+ }
+ }
+ n->tx_pkt_cnt = 0;
+ if (q->tx_timer) {
+ timer_del(q->tx_timer);
+ }
+ virtio_net_handle_tx_bh(vdev, vq);
+ } else {
+ virtio_net_handle_tx_bh(vdev, vq);
+ }
+}
+
static void virtio_net_add_queue(VirtIONet *n, int index)
{
VirtIODevice *vdev = VIRTIO_DEVICE(n);
@@ -2980,20 +3087,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
virtio_net_handle_rx);
- if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
- n->vqs[index].tx_vq =
- virtio_add_queue(vdev, n->net_conf.tx_queue_size,
- virtio_net_handle_tx_timer);
- n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
- virtio_net_tx_timer,
- &n->vqs[index]);
- } else {
- n->vqs[index].tx_vq =
- virtio_add_queue(vdev, n->net_conf.tx_queue_size,
- virtio_net_handle_tx_bh);
- n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
- &DEVICE(vdev)->mem_reentrancy_guard);
- }
+ n->vqs[index].tx_vq =
+ virtio_add_queue(vdev,
+ n->net_conf.tx_queue_size,
+ virtio_net_handle_tx_dispatch);
+
+ n->vqs[index].tx_bh =
+ qemu_bh_new_guarded(virtio_net_tx_bh,
+ &n->vqs[index],
+ &DEVICE(vdev)->mem_reentrancy_guard);
n->vqs[index].tx_waiting = 0;
n->vqs[index].n = n;
@@ -3242,6 +3344,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
}
virtio_net_commit_rss_config(n);
+ if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
+
+ for (i = 0; i < n->max_queue_pairs; i++) {
+ VirtIONetQueue *q = &n->vqs[i];
+ if (!q->rx_timer && n->rx_coal_usecs > 0) {
+ q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
+ virtio_net_rx_notify,
+ q);
+ }
+
+ if (!q->tx_timer && n->tx_coal_usecs > 0) {
+ q->tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
+ virtio_net_tx_timer,
+ q);
+ }
+
+ if (n->tx_coal_usecs > 0 && q->tx_timer) {
+ n->tx_pkt_cnt = 0;
+ timer_mod(q->tx_timer,
+ qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
+ }
+
+ if (n->rx_coal_usecs > 0 && q->rx_timer) {
+ timer_mod(q->rx_timer,
+ qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
+ }
+ }
+ }
+
return 0;
}
@@ -3617,6 +3748,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
vmstate_virtio_net_tx_waiting),
VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
has_ctrl_guest_offloads),
+ VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
+ VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
+ VMSTATE_UINT32(rx_coal_packets, VirtIONet),
+ VMSTATE_UINT32(tx_coal_packets, VirtIONet),
VMSTATE_END_OF_LIST()
},
.subsections = (const VMStateDescription * const []) {
@@ -3960,7 +4095,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
}
n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
n->curr_queue_pairs = 1;
- n->tx_timeout = n->net_conf.txtimer;
if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
&& strcmp(n->net_conf.tx, "bh")) {
@@ -3970,6 +4104,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
error_printf("Defaulting to \"bh\"");
}
+ if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
+ n->tx_coal_usecs = n->net_conf.txtimer;
+ n->tx_timer_activate = true;
+ } else {
+ n->tx_coal_usecs = 0;
+ }
+
n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
n->net_conf.tx_queue_size);
@@ -4046,6 +4187,12 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
n->rss_data.specified_hash_types.on_bits |
n->rss_data.specified_hash_types.auto_bits;
}
+ n->rx_pkt_cnt = 0;
+ n->tx_pkt_cnt = 0;
+ n->rx_coal_usecs = 0;
+ n->rx_coal_packets = 0;
+ n->tx_coal_packets = 0;
+ n->rx_index_timer = NULL;
}
static void virtio_net_device_unrealize(DeviceState *dev)
@@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
VIRTIO_NET_F_GUEST_USO6, true),
DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
VIRTIO_NET_F_HOST_USO, true),
+ DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
+ VIRTIO_NET_F_NOTF_COAL, true),
DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
rss_data.specified_hash_types,
VIRTIO_NET_HASH_REPORT_IPv4 - 1,
diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
index 371e376428..a76d271096 100644
--- a/include/hw/virtio/virtio-net.h
+++ b/include/hw/virtio/virtio-net.h
@@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
typedef struct VirtIONetQueue {
VirtQueue *rx_vq;
VirtQueue *tx_vq;
+ QEMUTimer *rx_timer;
QEMUTimer *tx_timer;
QEMUBH *tx_bh;
uint32_t tx_waiting;
@@ -177,7 +178,6 @@ struct VirtIONet {
/* RSC Chains - temporary storage of coalesced data,
all these data are lost in case of migration */
QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
- uint32_t tx_timeout;
int32_t tx_burst;
uint32_t has_vnet_hdr;
size_t host_hdr_len;
@@ -230,6 +230,14 @@ struct VirtIONet {
struct EBPFRSSContext ebpf_rss;
uint32_t nr_ebpf_rss_fds;
char **ebpf_rss_fds;
+ QEMUTimer *rx_index_timer;
+ uint32_t rx_coal_usecs;
+ uint32_t rx_coal_packets;
+ uint32_t rx_pkt_cnt;
+ uint32_t tx_coal_usecs;
+ uint32_t tx_coal_packets;
+ uint32_t tx_pkt_cnt;
+ bool tx_timer_activate;
};
size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
diff --git a/net/passt.c b/net/passt.c
index 4ff94ee509..0b0d9e222a 100644
--- a/net/passt.c
+++ b/net/passt.c
@@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HOST_USO,
+ VIRTIO_NET_F_NOTF_COAL,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/tap.c b/net/tap.c
index 8d7ab6ba6f..ea5987a3dc 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
VIRTIO_F_NOTIFICATION_DATA,
VIRTIO_NET_F_RSC_EXT,
VIRTIO_NET_F_HASH_REPORT,
+ VIRTIO_NET_F_NOTF_COAL,
VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
VHOST_INVALID_FEATURE_BIT
diff --git a/net/vhost-user.c b/net/vhost-user.c
index a4bb49bbcf..f0b3752d7c 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
VIRTIO_NET_F_GUEST_USO4,
VIRTIO_NET_F_GUEST_USO6,
VIRTIO_NET_F_HOST_USO,
+ VIRTIO_NET_F_NOTF_COAL,
/* This bit implies RARP isn't sent by QEMU out of band */
VIRTIO_NET_F_GUEST_ANNOUNCE,
diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
index 3df6091274..4ab8f26ceb 100644
--- a/net/vhost-vdpa.c
+++ b/net/vhost-vdpa.c
@@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
VIRTIO_NET_F_CTRL_RX,
VIRTIO_NET_F_CTRL_RX_EXTRA,
VIRTIO_NET_F_CTRL_VLAN,
+ VIRTIO_NET_F_NOTF_COAL,
VIRTIO_NET_F_CTRL_VQ,
VIRTIO_NET_F_GSO,
VIRTIO_NET_F_GUEST_CSUM,
--
2.53.0
On Mon, May 4, 2026 at 11:18 AM Koushik Dutta <kdutta@redhat.com> wrote:
>
> Implement VirtIO Network Notification Coalescing (Bit 53).
> This allows the guest to manage interrupt frequency using ethtool
> -C for both RX and TX paths.
>
> - Added VIRTIO_NET_F_NOTF_COAL to host features.
> - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in
> virtio_net_handle_ctrl_iov.
> - Added logic to store and apply rx/tx usecs and max_packets.
> - Added packet counters and threshold logic for both RX and TX data paths.
> - Dynamic Dispatcher: Implemented a dispatcher mechanism that
> dynamically switches/activates the notification callback logic
> only after the guest enables TX coalescing via ethtool.
> - After VM LM coalescing parameters persist in the destination VM.
>
> This reduces interrupt overhead by batching notifications based on
> either a packet count or a time-based threshold.
>
> Signed-off-by: Koushik Dutta <kdutta@redhat.com>
Please include a changelog across the different versions of the series.
> ---
> hw/net/virtio-net.c | 193 +++++++++++++++++++++++++++++----
> include/hw/virtio/virtio-net.h | 10 +-
> net/passt.c | 1 +
> net/tap.c | 1 +
> net/vhost-user.c | 1 +
> net/vhost-vdpa.c | 1 +
> 6 files changed, 184 insertions(+), 23 deletions(-)
>
> diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
> index 2a5d642a64..aabba6a4be 100644
> --- a/hw/net/virtio-net.c
> +++ b/hw/net/virtio-net.c
> @@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc)
> * - we could suppress RX interrupt if we were so inclined.
> */
>
> +static void virtio_net_rx_notify(void *opaque)
> +{
> + VirtIONetQueue *q = opaque;
> + VirtIONet *n = q->n;
> + VirtIODevice *vdev = VIRTIO_DEVICE(n);
> +
> + n->rx_pkt_cnt = 0;
> + virtio_notify(vdev, q->rx_vq);
> +}
> +
> static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config)
> {
> VirtIONet *n = VIRTIO_NET(vdev);
> @@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status)
> if (queue_started) {
> if (q->tx_timer) {
> timer_mod(q->tx_timer,
> - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
This is adding up ns + us, different units. We need to stick with us,
as the command-line parameter was already in us.
> } else {
> replay_bh_schedule_event(q->tx_bh);
> }
> @@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd,
> }
> }
>
> +static void virtio_net_tx_timer(void *opaque);
> +
> +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd,
> + struct iovec *iov, unsigned int iov_cnt)
> +{
> + struct virtio_net_ctrl_coal coal;
> + VirtIONetQueue *q;
> + size_t s;
> + int i;
> +
> + s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal));
> + if (s != sizeof(coal)) {
> + return VIRTIO_NET_ERR;
> + }
> +
> + if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) {
> + n->rx_coal_usecs = le32_to_cpu(coal.max_usecs);
> + n->rx_coal_packets = le32_to_cpu(coal.max_packets);
> + if (n->rx_coal_usecs > 0) {
> + for (i = 0; i < n->max_queue_pairs; i++) {
> + q = &n->vqs[i];
> + if (!q->rx_timer) {
> + q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> + virtio_net_rx_notify,
> + q);
> + }
> + }
> + }
> + } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) {
> + n->tx_coal_usecs = le32_to_cpu(coal.max_usecs) * 1000;
> + n->tx_coal_packets = le32_to_cpu(coal.max_packets);
> + if (n->tx_coal_usecs > 0) {
> + for (i = 0; i < n->max_queue_pairs; i++) {
> + q = &n->vqs[i];
> + if (!q->tx_timer && n->tx_coal_usecs > 0) {
> + q->tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> + virtio_net_tx_timer,
> + q);
> + }
> + }
> + }
> + }
> +
> + return VIRTIO_NET_OK;
> +}
> +
> static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd,
> struct iovec *iov, unsigned int iov_cnt)
> {
> @@ -1581,6 +1637,10 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num);
> } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) {
> status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num);
> + } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) {
> + if (!n->tx_timer_activate) {
Even if the feature is best effort I think it is better not to forward
the feature flag either.
> + status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num);
> + }
> }
>
> s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status));
> @@ -2040,7 +2100,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf,
> }
>
> virtqueue_flush(q->rx_vq, i);
> - virtio_notify(vdev, q->rx_vq);
> +
> + /* rx coalescing */
> + n->rx_pkt_cnt += i;
> + if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) {
> + if (q->rx_timer) {
> + timer_del(q->rx_timer);
> + }
> + virtio_net_rx_notify(q);
> + } else {
> + if (q->rx_timer) {
> + if (!timer_pending(q->rx_timer)) {
> + timer_mod(q->rx_timer,
> + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> + }
> + }
> + }
>
> return size;
>
> @@ -2708,7 +2783,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len)
> replay_bh_schedule_event(q->tx_bh);
> } else {
> timer_mod(q->tx_timer,
> - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> }
> q->tx_waiting = 1;
> }
> @@ -2817,7 +2892,6 @@ detach:
> return -EINVAL;
> }
>
> -static void virtio_net_tx_timer(void *opaque);
>
> static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
> {
> @@ -2842,7 +2916,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq)
> } else {
> /* re-arm timer to flush it (and more) on next tick */
> timer_mod(q->tx_timer,
> - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> q->tx_waiting = 1;
> virtio_queue_set_notification(vq, 0);
> }
> @@ -2899,6 +2973,12 @@ static void virtio_net_tx_timer(void *opaque)
> if (ret == -EBUSY || ret == -EINVAL) {
> return;
> }
> + if (n->tx_pkt_cnt < ret) {
> + n->tx_pkt_cnt = 0;
> + } else {
> + n->tx_pkt_cnt -= ret;
> + }
> +
> /*
> * If we flush a full burst of packets, assume there are
> * more coming and immediately rearm
> @@ -2906,7 +2986,7 @@ static void virtio_net_tx_timer(void *opaque)
> if (ret >= n->tx_burst) {
> q->tx_waiting = 1;
> timer_mod(q->tx_timer,
> - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> return;
> }
> /*
> @@ -2918,9 +2998,10 @@ static void virtio_net_tx_timer(void *opaque)
> ret = virtio_net_flush_tx(q);
> if (ret > 0) {
> virtio_queue_set_notification(q->tx_vq, 0);
> + n->tx_pkt_cnt -= ret;
> q->tx_waiting = 1;
> timer_mod(q->tx_timer,
> - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout);
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> }
> }
>
> @@ -2973,6 +3054,32 @@ static void virtio_net_tx_bh(void *opaque)
> }
> }
>
> +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq)
> +{
> + VirtIONet *n = VIRTIO_NET(vdev);
> + VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))];
> + bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 ||
> + n->tx_coal_packets > 0;
> + bool pkt_limit = (n->tx_coal_packets > 0);
> +
> + if (use_timer) {
> + n->tx_pkt_cnt++;
> + if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) {
> + if (q->tx_timer) {
> + virtio_net_handle_tx_timer(vdev, vq);
> + return;
> + }
> + }
> + n->tx_pkt_cnt = 0;
> + if (q->tx_timer) {
> + timer_del(q->tx_timer);
> + }
> + virtio_net_handle_tx_bh(vdev, vq);
> + } else {
> + virtio_net_handle_tx_bh(vdev, vq);
> + }
> +}
> +
> static void virtio_net_add_queue(VirtIONet *n, int index)
> {
> VirtIODevice *vdev = VIRTIO_DEVICE(n);
> @@ -2980,20 +3087,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index)
> n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size,
> virtio_net_handle_rx);
>
> - if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) {
> - n->vqs[index].tx_vq =
> - virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> - virtio_net_handle_tx_timer);
> - n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> - virtio_net_tx_timer,
> - &n->vqs[index]);
> - } else {
> - n->vqs[index].tx_vq =
> - virtio_add_queue(vdev, n->net_conf.tx_queue_size,
> - virtio_net_handle_tx_bh);
> - n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index],
> - &DEVICE(vdev)->mem_reentrancy_guard);
> - }
> + n->vqs[index].tx_vq =
> + virtio_add_queue(vdev,
> + n->net_conf.tx_queue_size,
> + virtio_net_handle_tx_dispatch);
> +
> + n->vqs[index].tx_bh =
> + qemu_bh_new_guarded(virtio_net_tx_bh,
> + &n->vqs[index],
> + &DEVICE(vdev)->mem_reentrancy_guard);
>
> n->vqs[index].tx_waiting = 0;
> n->vqs[index].n = n;
> @@ -3242,6 +3344,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id)
> }
>
> virtio_net_commit_rss_config(n);
> + if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) {
> +
> + for (i = 0; i < n->max_queue_pairs; i++) {
> + VirtIONetQueue *q = &n->vqs[i];
> + if (!q->rx_timer && n->rx_coal_usecs > 0) {
> + q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL,
> + virtio_net_rx_notify,
> + q);
> + }
> +
> + if (!q->tx_timer && n->tx_coal_usecs > 0) {
> + q->tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL,
> + virtio_net_tx_timer,
> + q);
> + }
> +
> + if (n->tx_coal_usecs > 0 && q->tx_timer) {
> + n->tx_pkt_cnt = 0;
> + timer_mod(q->tx_timer,
> + qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs);
> + }
> +
> + if (n->rx_coal_usecs > 0 && q->rx_timer) {
> + timer_mod(q->rx_timer,
> + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs);
> + }
> + }
> + }
> +
> return 0;
> }
>
> @@ -3617,6 +3748,10 @@ static const VMStateDescription vmstate_virtio_net_device = {
> vmstate_virtio_net_tx_waiting),
> VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet,
> has_ctrl_guest_offloads),
> + VMSTATE_UINT32(rx_coal_usecs, VirtIONet),
> + VMSTATE_UINT32(tx_coal_usecs, VirtIONet),
> + VMSTATE_UINT32(rx_coal_packets, VirtIONet),
> + VMSTATE_UINT32(tx_coal_packets, VirtIONet),
> VMSTATE_END_OF_LIST()
> },
> .subsections = (const VMStateDescription * const []) {
> @@ -3960,7 +4095,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> }
> n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs);
> n->curr_queue_pairs = 1;
> - n->tx_timeout = n->net_conf.txtimer;
>
> if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")
> && strcmp(n->net_conf.tx, "bh")) {
> @@ -3970,6 +4104,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> error_printf("Defaulting to \"bh\"");
> }
>
> + if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) {
> + n->tx_coal_usecs = n->net_conf.txtimer;
> + n->tx_timer_activate = true;
> + } else {
> + n->tx_coal_usecs = 0;
> + }
> +
> n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
> n->net_conf.tx_queue_size);
>
> @@ -4046,6 +4187,12 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp)
> n->rss_data.specified_hash_types.on_bits |
> n->rss_data.specified_hash_types.auto_bits;
> }
> + n->rx_pkt_cnt = 0;
> + n->tx_pkt_cnt = 0;
> + n->rx_coal_usecs = 0;
> + n->rx_coal_packets = 0;
> + n->tx_coal_packets = 0;
> + n->rx_index_timer = NULL;
n->rx_index_timer is added to the struct but it is not used in the ret
of the patch, isn't it?
> }
>
> static void virtio_net_device_unrealize(DeviceState *dev)
> @@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = {
> VIRTIO_NET_F_GUEST_USO6, true),
> DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features,
> VIRTIO_NET_F_HOST_USO, true),
> + DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features,
> + VIRTIO_NET_F_NOTF_COAL, true),
> DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet,
> rss_data.specified_hash_types,
> VIRTIO_NET_HASH_REPORT_IPv4 - 1,
> diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h
> index 371e376428..a76d271096 100644
> --- a/include/hw/virtio/virtio-net.h
> +++ b/include/hw/virtio/virtio-net.h
> @@ -158,6 +158,7 @@ typedef struct VirtioNetRssData {
> typedef struct VirtIONetQueue {
> VirtQueue *rx_vq;
> VirtQueue *tx_vq;
> + QEMUTimer *rx_timer;
> QEMUTimer *tx_timer;
> QEMUBH *tx_bh;
> uint32_t tx_waiting;
> @@ -177,7 +178,6 @@ struct VirtIONet {
> /* RSC Chains - temporary storage of coalesced data,
> all these data are lost in case of migration */
> QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains;
> - uint32_t tx_timeout;
> int32_t tx_burst;
> uint32_t has_vnet_hdr;
> size_t host_hdr_len;
> @@ -230,6 +230,14 @@ struct VirtIONet {
> struct EBPFRSSContext ebpf_rss;
> uint32_t nr_ebpf_rss_fds;
> char **ebpf_rss_fds;
> + QEMUTimer *rx_index_timer;
> + uint32_t rx_coal_usecs;
> + uint32_t rx_coal_packets;
> + uint32_t rx_pkt_cnt;
> + uint32_t tx_coal_usecs;
> + uint32_t tx_coal_packets;
> + uint32_t tx_pkt_cnt;
> + bool tx_timer_activate;
> };
>
> size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev,
> diff --git a/net/passt.c b/net/passt.c
> index 4ff94ee509..0b0d9e222a 100644
> --- a/net/passt.c
> +++ b/net/passt.c
> @@ -52,6 +52,7 @@ static const int user_feature_bits[] = {
> VIRTIO_NET_F_GUEST_USO4,
> VIRTIO_NET_F_GUEST_USO6,
> VIRTIO_NET_F_HOST_USO,
> + VIRTIO_NET_F_NOTF_COAL,
>
> /* This bit implies RARP isn't sent by QEMU out of band */
> VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/tap.c b/net/tap.c
> index 8d7ab6ba6f..ea5987a3dc 100644
> --- a/net/tap.c
> +++ b/net/tap.c
> @@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = {
> VIRTIO_F_NOTIFICATION_DATA,
> VIRTIO_NET_F_RSC_EXT,
> VIRTIO_NET_F_HASH_REPORT,
> + VIRTIO_NET_F_NOTF_COAL,
> VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO,
> VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO,
> VHOST_INVALID_FEATURE_BIT
> diff --git a/net/vhost-user.c b/net/vhost-user.c
> index a4bb49bbcf..f0b3752d7c 100644
> --- a/net/vhost-user.c
> +++ b/net/vhost-user.c
> @@ -54,6 +54,7 @@ static const int user_feature_bits[] = {
> VIRTIO_NET_F_GUEST_USO4,
> VIRTIO_NET_F_GUEST_USO6,
> VIRTIO_NET_F_HOST_USO,
> + VIRTIO_NET_F_NOTF_COAL,
>
> /* This bit implies RARP isn't sent by QEMU out of band */
> VIRTIO_NET_F_GUEST_ANNOUNCE,
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 3df6091274..4ab8f26ceb 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = {
> VIRTIO_NET_F_CTRL_RX,
> VIRTIO_NET_F_CTRL_RX_EXTRA,
> VIRTIO_NET_F_CTRL_VLAN,
> + VIRTIO_NET_F_NOTF_COAL,
> VIRTIO_NET_F_CTRL_VQ,
> VIRTIO_NET_F_GSO,
> VIRTIO_NET_F_GUEST_CSUM,
> --
> 2.53.0
>
© 2016 - 2026 Red Hat, Inc.