Currently, the refill work is a global delayed work for all the receive
queues. This commit makes the refill work a per receive queue so that we
can manage them separately and avoid further mistakes. It also helps the
successfully refilled queue avoid the napi_disable in the global delayed
refill work like before.
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
drivers/net/virtio_net.c | 155 ++++++++++++++++++---------------------
1 file changed, 72 insertions(+), 83 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1bb3aeca66c6..63126e490bda 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -379,6 +379,15 @@ struct receive_queue {
struct xdp_rxq_info xsk_rxq_info;
struct xdp_buff **xsk_buffs;
+
+ /* Is delayed refill enabled? */
+ bool refill_enabled;
+
+ /* The lock to synchronize the access to refill_enabled */
+ spinlock_t refill_lock;
+
+ /* Work struct for delayed refilling if we run low on memory. */
+ struct delayed_work refill;
};
#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
@@ -441,9 +450,6 @@ struct virtnet_info {
/* Packet virtio header size */
u8 hdr_len;
- /* Work struct for delayed refilling if we run low on memory. */
- struct delayed_work refill;
-
/* UDP tunnel support */
bool tx_tnl;
@@ -451,12 +457,6 @@ struct virtnet_info {
bool rx_tnl_csum;
- /* Is delayed refill enabled? */
- bool refill_enabled;
-
- /* The lock to synchronize the access to refill_enabled */
- spinlock_t refill_lock;
-
/* Work struct for config space updates */
struct work_struct config_work;
@@ -720,18 +720,18 @@ static void virtnet_rq_free_buf(struct virtnet_info *vi,
put_page(virt_to_head_page(buf));
}
-static void enable_delayed_refill(struct virtnet_info *vi)
+static void enable_delayed_refill(struct receive_queue *rq)
{
- spin_lock_bh(&vi->refill_lock);
- vi->refill_enabled = true;
- spin_unlock_bh(&vi->refill_lock);
+ spin_lock_bh(&rq->refill_lock);
+ rq->refill_enabled = true;
+ spin_unlock_bh(&rq->refill_lock);
}
-static void disable_delayed_refill(struct virtnet_info *vi)
+static void disable_delayed_refill(struct receive_queue *rq)
{
- spin_lock_bh(&vi->refill_lock);
- vi->refill_enabled = false;
- spin_unlock_bh(&vi->refill_lock);
+ spin_lock_bh(&rq->refill_lock);
+ rq->refill_enabled = false;
+ spin_unlock_bh(&rq->refill_lock);
}
static void enable_rx_mode_work(struct virtnet_info *vi)
@@ -2950,38 +2950,19 @@ static void virtnet_napi_disable(struct receive_queue *rq)
static void refill_work(struct work_struct *work)
{
- struct virtnet_info *vi =
- container_of(work, struct virtnet_info, refill.work);
+ struct receive_queue *rq =
+ container_of(work, struct receive_queue, refill.work);
bool still_empty;
- int i;
-
- for (i = 0; i < vi->curr_queue_pairs; i++) {
- struct receive_queue *rq = &vi->rq[i];
- /*
- * When queue API support is added in the future and the call
- * below becomes napi_disable_locked, this driver will need to
- * be refactored.
- *
- * One possible solution would be to:
- * - cancel refill_work with cancel_delayed_work (note:
- * non-sync)
- * - cancel refill_work with cancel_delayed_work_sync in
- * virtnet_remove after the netdev is unregistered
- * - wrap all of the work in a lock (perhaps the netdev
- * instance lock)
- * - check netif_running() and return early to avoid a race
- */
- napi_disable(&rq->napi);
- still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
- virtnet_napi_do_enable(rq->vq, &rq->napi);
+ napi_disable(&rq->napi);
+ still_empty = !try_fill_recv(rq->vq->vdev->priv, rq, GFP_KERNEL);
+ virtnet_napi_do_enable(rq->vq, &rq->napi);
- /* In theory, this can happen: if we don't get any buffers in
- * we will *never* try to fill again.
- */
- if (still_empty)
- schedule_delayed_work(&vi->refill, HZ/2);
- }
+ /* In theory, this can happen: if we don't get any buffers in
+ * we will *never* try to fill again.
+ */
+ if (still_empty)
+ schedule_delayed_work(&rq->refill, HZ / 2);
}
static int virtnet_receive_xsk_bufs(struct virtnet_info *vi,
@@ -3048,10 +3029,10 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
- spin_lock(&vi->refill_lock);
- if (vi->refill_enabled)
- schedule_delayed_work(&vi->refill, 0);
- spin_unlock(&vi->refill_lock);
+ spin_lock(&rq->refill_lock);
+ if (rq->refill_enabled)
+ schedule_delayed_work(&rq->refill, 0);
+ spin_unlock(&rq->refill_lock);
}
}
@@ -3226,13 +3207,13 @@ static int virtnet_open(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i, err;
- enable_delayed_refill(vi);
-
for (i = 0; i < vi->max_queue_pairs; i++) {
- if (i < vi->curr_queue_pairs)
+ if (i < vi->curr_queue_pairs) {
+ enable_delayed_refill(&vi->rq[i]);
/* Make sure we have some buffers: if oom use wq. */
if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
- schedule_delayed_work(&vi->refill, 0);
+ schedule_delayed_work(&vi->rq[i].refill, 0);
+ }
err = virtnet_enable_queue_pair(vi, i);
if (err < 0)
@@ -3251,10 +3232,9 @@ static int virtnet_open(struct net_device *dev)
return 0;
err_enable_qp:
- disable_delayed_refill(vi);
- cancel_delayed_work_sync(&vi->refill);
-
for (i--; i >= 0; i--) {
+ disable_delayed_refill(&vi->rq[i]);
+ cancel_delayed_work_sync(&vi->rq[i].refill);
virtnet_disable_queue_pair(vi, i);
virtnet_cancel_dim(vi, &vi->rq[i].dim);
}
@@ -3447,14 +3427,15 @@ static void virtnet_rx_pause_all(struct virtnet_info *vi)
{
int i;
- /*
- * Make sure refill_work does not run concurrently to
- * avoid napi_disable race which leads to deadlock.
- */
- disable_delayed_refill(vi);
- cancel_delayed_work_sync(&vi->refill);
- for (i = 0; i < vi->max_queue_pairs; i++)
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ /*
+ * Make sure refill_work does not run concurrently to
+ * avoid napi_disable race which leads to deadlock.
+ */
+ disable_delayed_refill(&vi->rq[i]);
+ cancel_delayed_work_sync(&vi->rq[i].refill);
__virtnet_rx_pause(vi, &vi->rq[i]);
+ }
}
static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
@@ -3463,8 +3444,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
* Make sure refill_work does not run concurrently to
* avoid napi_disable race which leads to deadlock.
*/
- disable_delayed_refill(vi);
- cancel_delayed_work_sync(&vi->refill);
+ disable_delayed_refill(rq);
+ cancel_delayed_work_sync(&rq->refill);
__virtnet_rx_pause(vi, rq);
}
@@ -3481,25 +3462,26 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
virtnet_napi_enable(rq);
if (schedule_refill)
- schedule_delayed_work(&vi->refill, 0);
+ schedule_delayed_work(&rq->refill, 0);
}
static void virtnet_rx_resume_all(struct virtnet_info *vi)
{
int i;
- enable_delayed_refill(vi);
for (i = 0; i < vi->max_queue_pairs; i++) {
- if (i < vi->curr_queue_pairs)
+ if (i < vi->curr_queue_pairs) {
+ enable_delayed_refill(&vi->rq[i]);
__virtnet_rx_resume(vi, &vi->rq[i], true);
- else
+ } else {
__virtnet_rx_resume(vi, &vi->rq[i], false);
+ }
}
}
static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
{
- enable_delayed_refill(vi);
+ enable_delayed_refill(rq);
__virtnet_rx_resume(vi, rq, true);
}
@@ -3830,10 +3812,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
succ:
vi->curr_queue_pairs = queue_pairs;
/* virtnet_open() will refill when device is going to up. */
- spin_lock_bh(&vi->refill_lock);
- if (dev->flags & IFF_UP && vi->refill_enabled)
- schedule_delayed_work(&vi->refill, 0);
- spin_unlock_bh(&vi->refill_lock);
+ if (dev->flags & IFF_UP) {
+ int i;
+
+ for (i = 0; i < vi->curr_queue_pairs; i++) {
+ spin_lock_bh(&vi->rq[i].refill_lock);
+ if (vi->rq[i].refill_enabled)
+ schedule_delayed_work(&vi->rq[i].refill, 0);
+ spin_unlock_bh(&vi->rq[i].refill_lock);
+ }
+ }
return 0;
}
@@ -3843,10 +3831,6 @@ static int virtnet_close(struct net_device *dev)
struct virtnet_info *vi = netdev_priv(dev);
int i;
- /* Make sure NAPI doesn't schedule refill work */
- disable_delayed_refill(vi);
- /* Make sure refill_work doesn't re-enable napi! */
- cancel_delayed_work_sync(&vi->refill);
/* Prevent the config change callback from changing carrier
* after close
*/
@@ -3857,6 +3841,10 @@ static int virtnet_close(struct net_device *dev)
cancel_work_sync(&vi->config_work);
for (i = 0; i < vi->max_queue_pairs; i++) {
+ /* Make sure NAPI doesn't schedule refill work */
+ disable_delayed_refill(&vi->rq[i]);
+ /* Make sure refill_work doesn't re-enable napi! */
+ cancel_delayed_work_sync(&vi->rq[i].refill);
virtnet_disable_queue_pair(vi, i);
virtnet_cancel_dim(vi, &vi->rq[i].dim);
}
@@ -5802,7 +5790,6 @@ static int virtnet_restore_up(struct virtio_device *vdev)
virtio_device_ready(vdev);
- enable_delayed_refill(vi);
enable_rx_mode_work(vi);
if (netif_running(vi->dev)) {
@@ -6559,8 +6546,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
if (!vi->rq)
goto err_rq;
- INIT_DELAYED_WORK(&vi->refill, refill_work);
for (i = 0; i < vi->max_queue_pairs; i++) {
+ INIT_DELAYED_WORK(&vi->rq[i].refill, refill_work);
+ spin_lock_init(&vi->rq[i].refill_lock);
vi->rq[i].pages = NULL;
netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll,
i);
@@ -6901,7 +6889,6 @@ static int virtnet_probe(struct virtio_device *vdev)
INIT_WORK(&vi->config_work, virtnet_config_changed_work);
INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work);
- spin_lock_init(&vi->refill_lock);
if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
vi->mergeable_rx_bufs = true;
@@ -7165,7 +7152,9 @@ static int virtnet_probe(struct virtio_device *vdev)
net_failover_destroy(vi->failover);
free_vqs:
virtio_reset_device(vdev);
- cancel_delayed_work_sync(&vi->refill);
+ for (i = 0; i < vi->max_queue_pairs; i++)
+ cancel_delayed_work_sync(&vi->rq[i].refill);
+
free_receive_page_frags(vi);
virtnet_del_vqs(vi);
free:
--
2.43.0
On Tue, Dec 23, 2025 at 10:25:31PM +0700, Bui Quang Minh wrote:
> Currently, the refill work is a global delayed work for all the receive
> queues. This commit makes the refill work a per receive queue so that we
> can manage them separately and avoid further mistakes. It also helps the
> successfully refilled queue avoid the napi_disable in the global delayed
> refill work like before.
>
this commit log is unreadable. before what? what is the problem with
"refilled queue napi_disable" this refers to.
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
> drivers/net/virtio_net.c | 155 ++++++++++++++++++---------------------
> 1 file changed, 72 insertions(+), 83 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 1bb3aeca66c6..63126e490bda 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -379,6 +379,15 @@ struct receive_queue {
> struct xdp_rxq_info xsk_rxq_info;
>
> struct xdp_buff **xsk_buffs;
> +
> + /* Is delayed refill enabled? */
> + bool refill_enabled;
> +
> + /* The lock to synchronize the access to refill_enabled */
> + spinlock_t refill_lock;
> +
> + /* Work struct for delayed refilling if we run low on memory. */
> + struct delayed_work refill;
> };
>
> #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> @@ -441,9 +450,6 @@ struct virtnet_info {
> /* Packet virtio header size */
> u8 hdr_len;
>
> - /* Work struct for delayed refilling if we run low on memory. */
> - struct delayed_work refill;
> -
> /* UDP tunnel support */
> bool tx_tnl;
>
> @@ -451,12 +457,6 @@ struct virtnet_info {
>
> bool rx_tnl_csum;
>
> - /* Is delayed refill enabled? */
> - bool refill_enabled;
> -
> - /* The lock to synchronize the access to refill_enabled */
> - spinlock_t refill_lock;
> -
> /* Work struct for config space updates */
> struct work_struct config_work;
>
> @@ -720,18 +720,18 @@ static void virtnet_rq_free_buf(struct virtnet_info *vi,
> put_page(virt_to_head_page(buf));
> }
>
> -static void enable_delayed_refill(struct virtnet_info *vi)
> +static void enable_delayed_refill(struct receive_queue *rq)
> {
> - spin_lock_bh(&vi->refill_lock);
> - vi->refill_enabled = true;
> - spin_unlock_bh(&vi->refill_lock);
> + spin_lock_bh(&rq->refill_lock);
> + rq->refill_enabled = true;
> + spin_unlock_bh(&rq->refill_lock);
> }
>
> -static void disable_delayed_refill(struct virtnet_info *vi)
> +static void disable_delayed_refill(struct receive_queue *rq)
> {
> - spin_lock_bh(&vi->refill_lock);
> - vi->refill_enabled = false;
> - spin_unlock_bh(&vi->refill_lock);
> + spin_lock_bh(&rq->refill_lock);
> + rq->refill_enabled = false;
> + spin_unlock_bh(&rq->refill_lock);
> }
>
> static void enable_rx_mode_work(struct virtnet_info *vi)
> @@ -2950,38 +2950,19 @@ static void virtnet_napi_disable(struct receive_queue *rq)
>
> static void refill_work(struct work_struct *work)
> {
> - struct virtnet_info *vi =
> - container_of(work, struct virtnet_info, refill.work);
> + struct receive_queue *rq =
> + container_of(work, struct receive_queue, refill.work);
> bool still_empty;
> - int i;
> -
> - for (i = 0; i < vi->curr_queue_pairs; i++) {
> - struct receive_queue *rq = &vi->rq[i];
>
> - /*
> - * When queue API support is added in the future and the call
> - * below becomes napi_disable_locked, this driver will need to
> - * be refactored.
> - *
> - * One possible solution would be to:
> - * - cancel refill_work with cancel_delayed_work (note:
> - * non-sync)
> - * - cancel refill_work with cancel_delayed_work_sync in
> - * virtnet_remove after the netdev is unregistered
> - * - wrap all of the work in a lock (perhaps the netdev
> - * instance lock)
> - * - check netif_running() and return early to avoid a race
> - */
> - napi_disable(&rq->napi);
> - still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
> - virtnet_napi_do_enable(rq->vq, &rq->napi);
> + napi_disable(&rq->napi);
> + still_empty = !try_fill_recv(rq->vq->vdev->priv, rq, GFP_KERNEL);
> + virtnet_napi_do_enable(rq->vq, &rq->napi);
>
> - /* In theory, this can happen: if we don't get any buffers in
> - * we will *never* try to fill again.
> - */
> - if (still_empty)
> - schedule_delayed_work(&vi->refill, HZ/2);
> - }
> + /* In theory, this can happen: if we don't get any buffers in
> + * we will *never* try to fill again.
> + */
> + if (still_empty)
> + schedule_delayed_work(&rq->refill, HZ / 2);
> }
>
> static int virtnet_receive_xsk_bufs(struct virtnet_info *vi,
> @@ -3048,10 +3029,10 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>
> if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
> if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
> - spin_lock(&vi->refill_lock);
> - if (vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock(&vi->refill_lock);
> + spin_lock(&rq->refill_lock);
> + if (rq->refill_enabled)
> + schedule_delayed_work(&rq->refill, 0);
> + spin_unlock(&rq->refill_lock);
> }
> }
>
> @@ -3226,13 +3207,13 @@ static int virtnet_open(struct net_device *dev)
> struct virtnet_info *vi = netdev_priv(dev);
> int i, err;
>
> - enable_delayed_refill(vi);
> -
> for (i = 0; i < vi->max_queue_pairs; i++) {
> - if (i < vi->curr_queue_pairs)
> + if (i < vi->curr_queue_pairs) {
> + enable_delayed_refill(&vi->rq[i]);
> /* Make sure we have some buffers: if oom use wq. */
> if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> - schedule_delayed_work(&vi->refill, 0);
> + schedule_delayed_work(&vi->rq[i].refill, 0);
> + }
>
> err = virtnet_enable_queue_pair(vi, i);
> if (err < 0)
> @@ -3251,10 +3232,9 @@ static int virtnet_open(struct net_device *dev)
> return 0;
>
> err_enable_qp:
> - disable_delayed_refill(vi);
> - cancel_delayed_work_sync(&vi->refill);
> -
> for (i--; i >= 0; i--) {
> + disable_delayed_refill(&vi->rq[i]);
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> virtnet_disable_queue_pair(vi, i);
> virtnet_cancel_dim(vi, &vi->rq[i].dim);
> }
> @@ -3447,14 +3427,15 @@ static void virtnet_rx_pause_all(struct virtnet_info *vi)
> {
> int i;
>
> - /*
> - * Make sure refill_work does not run concurrently to
> - * avoid napi_disable race which leads to deadlock.
> - */
> - disable_delayed_refill(vi);
> - cancel_delayed_work_sync(&vi->refill);
> - for (i = 0; i < vi->max_queue_pairs; i++)
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + /*
> + * Make sure refill_work does not run concurrently to
> + * avoid napi_disable race which leads to deadlock.
> + */
> + disable_delayed_refill(&vi->rq[i]);
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> __virtnet_rx_pause(vi, &vi->rq[i]);
> + }
> }
>
> static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
> @@ -3463,8 +3444,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
> * Make sure refill_work does not run concurrently to
> * avoid napi_disable race which leads to deadlock.
> */
> - disable_delayed_refill(vi);
> - cancel_delayed_work_sync(&vi->refill);
> + disable_delayed_refill(rq);
> + cancel_delayed_work_sync(&rq->refill);
> __virtnet_rx_pause(vi, rq);
> }
>
> @@ -3481,25 +3462,26 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
> virtnet_napi_enable(rq);
>
> if (schedule_refill)
> - schedule_delayed_work(&vi->refill, 0);
> + schedule_delayed_work(&rq->refill, 0);
> }
>
> static void virtnet_rx_resume_all(struct virtnet_info *vi)
> {
> int i;
>
> - enable_delayed_refill(vi);
> for (i = 0; i < vi->max_queue_pairs; i++) {
> - if (i < vi->curr_queue_pairs)
> + if (i < vi->curr_queue_pairs) {
> + enable_delayed_refill(&vi->rq[i]);
> __virtnet_rx_resume(vi, &vi->rq[i], true);
> - else
> + } else {
> __virtnet_rx_resume(vi, &vi->rq[i], false);
> + }
> }
> }
>
> static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
> {
> - enable_delayed_refill(vi);
> + enable_delayed_refill(rq);
> __virtnet_rx_resume(vi, rq, true);
> }
>
> @@ -3830,10 +3812,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
> succ:
> vi->curr_queue_pairs = queue_pairs;
> /* virtnet_open() will refill when device is going to up. */
> - spin_lock_bh(&vi->refill_lock);
> - if (dev->flags & IFF_UP && vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock_bh(&vi->refill_lock);
> + if (dev->flags & IFF_UP) {
> + int i;
> +
> + for (i = 0; i < vi->curr_queue_pairs; i++) {
> + spin_lock_bh(&vi->rq[i].refill_lock);
> + if (vi->rq[i].refill_enabled)
> + schedule_delayed_work(&vi->rq[i].refill, 0);
> + spin_unlock_bh(&vi->rq[i].refill_lock);
> + }
> + }
>
> return 0;
> }
> @@ -3843,10 +3831,6 @@ static int virtnet_close(struct net_device *dev)
> struct virtnet_info *vi = netdev_priv(dev);
> int i;
>
> - /* Make sure NAPI doesn't schedule refill work */
> - disable_delayed_refill(vi);
> - /* Make sure refill_work doesn't re-enable napi! */
> - cancel_delayed_work_sync(&vi->refill);
> /* Prevent the config change callback from changing carrier
> * after close
> */
> @@ -3857,6 +3841,10 @@ static int virtnet_close(struct net_device *dev)
> cancel_work_sync(&vi->config_work);
>
> for (i = 0; i < vi->max_queue_pairs; i++) {
> + /* Make sure NAPI doesn't schedule refill work */
> + disable_delayed_refill(&vi->rq[i]);
> + /* Make sure refill_work doesn't re-enable napi! */
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> virtnet_disable_queue_pair(vi, i);
> virtnet_cancel_dim(vi, &vi->rq[i].dim);
> }
> @@ -5802,7 +5790,6 @@ static int virtnet_restore_up(struct virtio_device *vdev)
>
> virtio_device_ready(vdev);
>
> - enable_delayed_refill(vi);
> enable_rx_mode_work(vi);
>
> if (netif_running(vi->dev)) {
> @@ -6559,8 +6546,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
> if (!vi->rq)
> goto err_rq;
>
> - INIT_DELAYED_WORK(&vi->refill, refill_work);
> for (i = 0; i < vi->max_queue_pairs; i++) {
> + INIT_DELAYED_WORK(&vi->rq[i].refill, refill_work);
> + spin_lock_init(&vi->rq[i].refill_lock);
> vi->rq[i].pages = NULL;
> netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll,
> i);
> @@ -6901,7 +6889,6 @@ static int virtnet_probe(struct virtio_device *vdev)
>
> INIT_WORK(&vi->config_work, virtnet_config_changed_work);
> INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work);
> - spin_lock_init(&vi->refill_lock);
>
> if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
> vi->mergeable_rx_bufs = true;
> @@ -7165,7 +7152,9 @@ static int virtnet_probe(struct virtio_device *vdev)
> net_failover_destroy(vi->failover);
> free_vqs:
> virtio_reset_device(vdev);
> - cancel_delayed_work_sync(&vi->refill);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> + cancel_delayed_work_sync(&vi->rq[i].refill);
> +
> free_receive_page_frags(vi);
> virtnet_del_vqs(vi);
> free:
> --
> 2.43.0
On 12/24/25 17:19, Michael S. Tsirkin wrote:
> On Tue, Dec 23, 2025 at 10:25:31PM +0700, Bui Quang Minh wrote:
>> Currently, the refill work is a global delayed work for all the receive
>> queues. This commit makes the refill work a per receive queue so that we
>> can manage them separately and avoid further mistakes. It also helps the
>> successfully refilled queue avoid the napi_disable in the global delayed
>> refill work like before.
>>
> this commit log is unreadable. before what? what is the problem with
> "refilled queue napi_disable" this refers to.
I mean that currently, even if one RX queue is refilled successfully but
another is not, the successful one still gets napi_disable() in the global
refill work. I think that disrupts the queue unnecessarily.
>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>> drivers/net/virtio_net.c | 155 ++++++++++++++++++---------------------
>> 1 file changed, 72 insertions(+), 83 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 1bb3aeca66c6..63126e490bda 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -379,6 +379,15 @@ struct receive_queue {
>> struct xdp_rxq_info xsk_rxq_info;
>>
>> struct xdp_buff **xsk_buffs;
>> +
>> + /* Is delayed refill enabled? */
>> + bool refill_enabled;
>> +
>> + /* The lock to synchronize the access to refill_enabled */
>> + spinlock_t refill_lock;
>> +
>> + /* Work struct for delayed refilling if we run low on memory. */
>> + struct delayed_work refill;
>> };
>>
>> #define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
>> @@ -441,9 +450,6 @@ struct virtnet_info {
>> /* Packet virtio header size */
>> u8 hdr_len;
>>
>> - /* Work struct for delayed refilling if we run low on memory. */
>> - struct delayed_work refill;
>> -
>> /* UDP tunnel support */
>> bool tx_tnl;
>>
>> @@ -451,12 +457,6 @@ struct virtnet_info {
>>
>> bool rx_tnl_csum;
>>
>> - /* Is delayed refill enabled? */
>> - bool refill_enabled;
>> -
>> - /* The lock to synchronize the access to refill_enabled */
>> - spinlock_t refill_lock;
>> -
>> /* Work struct for config space updates */
>> struct work_struct config_work;
>>
>> @@ -720,18 +720,18 @@ static void virtnet_rq_free_buf(struct virtnet_info *vi,
>> put_page(virt_to_head_page(buf));
>> }
>>
>> -static void enable_delayed_refill(struct virtnet_info *vi)
>> +static void enable_delayed_refill(struct receive_queue *rq)
>> {
>> - spin_lock_bh(&vi->refill_lock);
>> - vi->refill_enabled = true;
>> - spin_unlock_bh(&vi->refill_lock);
>> + spin_lock_bh(&rq->refill_lock);
>> + rq->refill_enabled = true;
>> + spin_unlock_bh(&rq->refill_lock);
>> }
>>
>> -static void disable_delayed_refill(struct virtnet_info *vi)
>> +static void disable_delayed_refill(struct receive_queue *rq)
>> {
>> - spin_lock_bh(&vi->refill_lock);
>> - vi->refill_enabled = false;
>> - spin_unlock_bh(&vi->refill_lock);
>> + spin_lock_bh(&rq->refill_lock);
>> + rq->refill_enabled = false;
>> + spin_unlock_bh(&rq->refill_lock);
>> }
>>
>> static void enable_rx_mode_work(struct virtnet_info *vi)
>> @@ -2950,38 +2950,19 @@ static void virtnet_napi_disable(struct receive_queue *rq)
>>
>> static void refill_work(struct work_struct *work)
>> {
>> - struct virtnet_info *vi =
>> - container_of(work, struct virtnet_info, refill.work);
>> + struct receive_queue *rq =
>> + container_of(work, struct receive_queue, refill.work);
>> bool still_empty;
>> - int i;
>> -
>> - for (i = 0; i < vi->curr_queue_pairs; i++) {
>> - struct receive_queue *rq = &vi->rq[i];
>>
>> - /*
>> - * When queue API support is added in the future and the call
>> - * below becomes napi_disable_locked, this driver will need to
>> - * be refactored.
>> - *
>> - * One possible solution would be to:
>> - * - cancel refill_work with cancel_delayed_work (note:
>> - * non-sync)
>> - * - cancel refill_work with cancel_delayed_work_sync in
>> - * virtnet_remove after the netdev is unregistered
>> - * - wrap all of the work in a lock (perhaps the netdev
>> - * instance lock)
>> - * - check netif_running() and return early to avoid a race
>> - */
>> - napi_disable(&rq->napi);
>> - still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
>> - virtnet_napi_do_enable(rq->vq, &rq->napi);
>> + napi_disable(&rq->napi);
>> + still_empty = !try_fill_recv(rq->vq->vdev->priv, rq, GFP_KERNEL);
>> + virtnet_napi_do_enable(rq->vq, &rq->napi);
>>
>> - /* In theory, this can happen: if we don't get any buffers in
>> - * we will *never* try to fill again.
>> - */
>> - if (still_empty)
>> - schedule_delayed_work(&vi->refill, HZ/2);
>> - }
>> + /* In theory, this can happen: if we don't get any buffers in
>> + * we will *never* try to fill again.
>> + */
>> + if (still_empty)
>> + schedule_delayed_work(&rq->refill, HZ / 2);
>> }
>>
>> static int virtnet_receive_xsk_bufs(struct virtnet_info *vi,
>> @@ -3048,10 +3029,10 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>>
>> if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
>> if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
>> - spin_lock(&vi->refill_lock);
>> - if (vi->refill_enabled)
>> - schedule_delayed_work(&vi->refill, 0);
>> - spin_unlock(&vi->refill_lock);
>> + spin_lock(&rq->refill_lock);
>> + if (rq->refill_enabled)
>> + schedule_delayed_work(&rq->refill, 0);
>> + spin_unlock(&rq->refill_lock);
>> }
>> }
>>
>> @@ -3226,13 +3207,13 @@ static int virtnet_open(struct net_device *dev)
>> struct virtnet_info *vi = netdev_priv(dev);
>> int i, err;
>>
>> - enable_delayed_refill(vi);
>> -
>> for (i = 0; i < vi->max_queue_pairs; i++) {
>> - if (i < vi->curr_queue_pairs)
>> + if (i < vi->curr_queue_pairs) {
>> + enable_delayed_refill(&vi->rq[i]);
>> /* Make sure we have some buffers: if oom use wq. */
>> if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
>> - schedule_delayed_work(&vi->refill, 0);
>> + schedule_delayed_work(&vi->rq[i].refill, 0);
>> + }
>>
>> err = virtnet_enable_queue_pair(vi, i);
>> if (err < 0)
>> @@ -3251,10 +3232,9 @@ static int virtnet_open(struct net_device *dev)
>> return 0;
>>
>> err_enable_qp:
>> - disable_delayed_refill(vi);
>> - cancel_delayed_work_sync(&vi->refill);
>> -
>> for (i--; i >= 0; i--) {
>> + disable_delayed_refill(&vi->rq[i]);
>> + cancel_delayed_work_sync(&vi->rq[i].refill);
>> virtnet_disable_queue_pair(vi, i);
>> virtnet_cancel_dim(vi, &vi->rq[i].dim);
>> }
>> @@ -3447,14 +3427,15 @@ static void virtnet_rx_pause_all(struct virtnet_info *vi)
>> {
>> int i;
>>
>> - /*
>> - * Make sure refill_work does not run concurrently to
>> - * avoid napi_disable race which leads to deadlock.
>> - */
>> - disable_delayed_refill(vi);
>> - cancel_delayed_work_sync(&vi->refill);
>> - for (i = 0; i < vi->max_queue_pairs; i++)
>> + for (i = 0; i < vi->max_queue_pairs; i++) {
>> + /*
>> + * Make sure refill_work does not run concurrently to
>> + * avoid napi_disable race which leads to deadlock.
>> + */
>> + disable_delayed_refill(&vi->rq[i]);
>> + cancel_delayed_work_sync(&vi->rq[i].refill);
>> __virtnet_rx_pause(vi, &vi->rq[i]);
>> + }
>> }
>>
>> static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
>> @@ -3463,8 +3444,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
>> * Make sure refill_work does not run concurrently to
>> * avoid napi_disable race which leads to deadlock.
>> */
>> - disable_delayed_refill(vi);
>> - cancel_delayed_work_sync(&vi->refill);
>> + disable_delayed_refill(rq);
>> + cancel_delayed_work_sync(&rq->refill);
>> __virtnet_rx_pause(vi, rq);
>> }
>>
>> @@ -3481,25 +3462,26 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
>> virtnet_napi_enable(rq);
>>
>> if (schedule_refill)
>> - schedule_delayed_work(&vi->refill, 0);
>> + schedule_delayed_work(&rq->refill, 0);
>> }
>>
>> static void virtnet_rx_resume_all(struct virtnet_info *vi)
>> {
>> int i;
>>
>> - enable_delayed_refill(vi);
>> for (i = 0; i < vi->max_queue_pairs; i++) {
>> - if (i < vi->curr_queue_pairs)
>> + if (i < vi->curr_queue_pairs) {
>> + enable_delayed_refill(&vi->rq[i]);
>> __virtnet_rx_resume(vi, &vi->rq[i], true);
>> - else
>> + } else {
>> __virtnet_rx_resume(vi, &vi->rq[i], false);
>> + }
>> }
>> }
>>
>> static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
>> {
>> - enable_delayed_refill(vi);
>> + enable_delayed_refill(rq);
>> __virtnet_rx_resume(vi, rq, true);
>> }
>>
>> @@ -3830,10 +3812,16 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
>> succ:
>> vi->curr_queue_pairs = queue_pairs;
>> /* virtnet_open() will refill when device is going to up. */
>> - spin_lock_bh(&vi->refill_lock);
>> - if (dev->flags & IFF_UP && vi->refill_enabled)
>> - schedule_delayed_work(&vi->refill, 0);
>> - spin_unlock_bh(&vi->refill_lock);
>> + if (dev->flags & IFF_UP) {
>> + int i;
>> +
>> + for (i = 0; i < vi->curr_queue_pairs; i++) {
>> + spin_lock_bh(&vi->rq[i].refill_lock);
>> + if (vi->rq[i].refill_enabled)
>> + schedule_delayed_work(&vi->rq[i].refill, 0);
>> + spin_unlock_bh(&vi->rq[i].refill_lock);
>> + }
>> + }
>>
>> return 0;
>> }
>> @@ -3843,10 +3831,6 @@ static int virtnet_close(struct net_device *dev)
>> struct virtnet_info *vi = netdev_priv(dev);
>> int i;
>>
>> - /* Make sure NAPI doesn't schedule refill work */
>> - disable_delayed_refill(vi);
>> - /* Make sure refill_work doesn't re-enable napi! */
>> - cancel_delayed_work_sync(&vi->refill);
>> /* Prevent the config change callback from changing carrier
>> * after close
>> */
>> @@ -3857,6 +3841,10 @@ static int virtnet_close(struct net_device *dev)
>> cancel_work_sync(&vi->config_work);
>>
>> for (i = 0; i < vi->max_queue_pairs; i++) {
>> + /* Make sure NAPI doesn't schedule refill work */
>> + disable_delayed_refill(&vi->rq[i]);
>> + /* Make sure refill_work doesn't re-enable napi! */
>> + cancel_delayed_work_sync(&vi->rq[i].refill);
>> virtnet_disable_queue_pair(vi, i);
>> virtnet_cancel_dim(vi, &vi->rq[i].dim);
>> }
>> @@ -5802,7 +5790,6 @@ static int virtnet_restore_up(struct virtio_device *vdev)
>>
>> virtio_device_ready(vdev);
>>
>> - enable_delayed_refill(vi);
>> enable_rx_mode_work(vi);
>>
>> if (netif_running(vi->dev)) {
>> @@ -6559,8 +6546,9 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
>> if (!vi->rq)
>> goto err_rq;
>>
>> - INIT_DELAYED_WORK(&vi->refill, refill_work);
>> for (i = 0; i < vi->max_queue_pairs; i++) {
>> + INIT_DELAYED_WORK(&vi->rq[i].refill, refill_work);
>> + spin_lock_init(&vi->rq[i].refill_lock);
>> vi->rq[i].pages = NULL;
>> netif_napi_add_config(vi->dev, &vi->rq[i].napi, virtnet_poll,
>> i);
>> @@ -6901,7 +6889,6 @@ static int virtnet_probe(struct virtio_device *vdev)
>>
>> INIT_WORK(&vi->config_work, virtnet_config_changed_work);
>> INIT_WORK(&vi->rx_mode_work, virtnet_rx_mode_work);
>> - spin_lock_init(&vi->refill_lock);
>>
>> if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF)) {
>> vi->mergeable_rx_bufs = true;
>> @@ -7165,7 +7152,9 @@ static int virtnet_probe(struct virtio_device *vdev)
>> net_failover_destroy(vi->failover);
>> free_vqs:
>> virtio_reset_device(vdev);
>> - cancel_delayed_work_sync(&vi->refill);
>> + for (i = 0; i < vi->max_queue_pairs; i++)
>> + cancel_delayed_work_sync(&vi->rq[i].refill);
>> +
>> free_receive_page_frags(vi);
>> virtnet_del_vqs(vi);
>> free:
>> --
>> 2.43.0
> static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
> @@ -3463,8 +3444,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
>  	 * Make sure refill_work does not run concurrently to
>  	 * avoid napi_disable race which leads to deadlock.
>  	 */
> -	disable_delayed_refill(vi);
> -	cancel_delayed_work_sync(&vi->refill);
> +	disable_delayed_refill(rq);
> +	cancel_delayed_work_sync(&rq->refill);
>  	__virtnet_rx_pause(vi, rq);
> }

disable_delayed_refill is always followed by cancel_delayed_work_sync.
Just put cancel into disable, and reduce code duplication.

--
MST
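A minimal sketch of the folded helper, assuming the per-rq refill fields
introduced in this patch (untested, only to illustrate the suggestion):

static void disable_delayed_refill(struct receive_queue *rq)
{
	spin_lock_bh(&rq->refill_lock);
	rq->refill_enabled = false;
	spin_unlock_bh(&rq->refill_lock);

	/* NAPI only schedules the work while refill_enabled is set
	 * (checked under refill_lock), and cancel_delayed_work_sync()
	 * also handles a self-requeueing work, so after this returns
	 * no refill work is pending or running for this queue.
	 */
	cancel_delayed_work_sync(&rq->refill);
}

Callers that currently do the disable + cancel pair in this patch
(virtnet_close, virtnet_rx_pause[_all], the virtnet_open error path) would
then only call disable_delayed_refill().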
On Tue, Dec 23, 2025 at 11:27 PM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>
> Currently, the refill work is a global delayed work for all the receive
> queues. This commit makes the refill work a per receive queue so that we
> can manage them separately and avoid further mistakes. It also helps the
> successfully refilled queue avoid the napi_disable in the global delayed
> refill work like before.
>
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---

I may miss something but I think this patch is sufficient to fix the problem?

Thanks
On 12/24/25 07:52, Jason Wang wrote:
> On Tue, Dec 23, 2025 at 11:27 PM Bui Quang Minh
> <minhquangbui99@gmail.com> wrote:
>> Currently, the refill work is a global delayed work for all the receive
>> queues. This commit makes the refill work a per receive queue so that we
>> can manage them separately and avoid further mistakes. It also helps the
>> successfully refilled queue avoid the napi_disable in the global delayed
>> refill work like before.
>>
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
> I may miss something but I think this patch is sufficient to fix the problem?
>
> Thanks
>

Yes, this fixes the reproducer in virtnet_rx_resume[_all], but the second
patch also fixes a bug variant in virtnet_open. After the first patch,
enable_delayed_refill is still called before napi_enable. However, the only
possible delayed refill schedule is in virtnet_set_queues, and it cannot
happen inside that window because we still hold the rtnl_lock during
virtnet_rx_resume[_all]. So leaving enable_delayed_refill before napi_enable
does not cause an issue, but it does not feel correct to me. Moving
enable_delayed_refill after napi_enable, however, requires the new pending
bool in the third patch.

Thanks,
Quang Minh.
Hi Jason,

I'm wondering why we even need this refill work. Why not simply let NAPI retry
the refill on its next run if the refill fails? That would seem much simpler.
This refill work complicates maintenance and often introduces a lot of
concurrency issues and races.

Thanks.

On Wed, 24 Dec 2025 08:52:36 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Tue, Dec 23, 2025 at 11:27 PM Bui Quang Minh
> <minhquangbui99@gmail.com> wrote:
> >
> > Currently, the refill work is a global delayed work for all the receive
> > queues. This commit makes the refill work a per receive queue so that we
> > can manage them separately and avoid further mistakes. It also helps the
> > successfully refilled queue avoid the napi_disable in the global delayed
> > refill work like before.
> >
> > Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> > ---
>
> I may miss something but I think this patch is sufficient to fix the problem?
>
> Thanks
>
On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
>
> Hi Jason,
>
> I'm wondering why we even need this refill work. Why not simply let NAPI retry
> the refill on its next run if the refill fails? That would seem much simpler.
> This refill work complicates maintenance and often introduces a lot of
> concurrency issues and races.
>
> Thanks.

refill work can refill from GFP_KERNEL, napi only from ATOMIC.

And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.

Not saying refill work is a great hack, but that is the reason for it.

--
MST
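(For reference, this is roughly how the two contexts look in the driver code
quoted earlier in the thread; nothing new, just the relevant calls side by
side.)

	/* refill_work(): process context, may sleep, can use GFP_KERNEL */
	still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);

	/* virtnet_receive(): NAPI/softirq context, must not sleep, so only
	 * GFP_ATOMIC is possible; on failure it defers to the work above.
	 */
	if (!try_fill_recv(vi, rq, GFP_ATOMIC))
		schedule_delayed_work(&vi->refill, 0);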
On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> >
> > Hi Jason,
> >
> > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > the refill on its next run if the refill fails? That would seem much simpler.
> > This refill work complicates maintenance and often introduces a lot of
> > concurrency issues and races.
> >
> > Thanks.
>
> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
>
> And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
Btw, I see some drivers are doing things as Xuan said. E.g
mlx5e_napi_poll() did:
busy |= INDIRECT_CALL_2(rq->post_wqes,
mlx5e_post_rx_mpwqes,
mlx5e_post_rx_wqes,
...
if (busy) {
if (likely(mlx5e_channel_no_affinity_change(c))) {
work_done = budget;
goto out;
...
>
> Not saying refill work is a great hack, but that is the reason for it.
> --
> MST
>
Thanks
On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > >
> > > Hi Jason,
> > >
> > > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > > the refill on its next run if the refill fails? That would seem much simpler.
> > > This refill work complicates maintenance and often introduces a lot of
> > > concurrency issues and races.
> > >
> > > Thanks.
> >
> > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> >
> > And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
>
> Btw, I see some drivers are doing things as Xuan said. E.g
> mlx5e_napi_poll() did:
>
> busy |= INDIRECT_CALL_2(rq->post_wqes,
> mlx5e_post_rx_mpwqes,
> mlx5e_post_rx_wqes,
>
> ...
>
> if (busy) {
> if (likely(mlx5e_channel_no_affinity_change(c))) {
> work_done = budget;
> goto out;
> ...
is busy a GFP_ATOMIC allocation failure?
> >
> > Not saying refill work is a great hack, but that is the reason for it.
> > --
> > MST
> >
>
> Thanks
On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> > On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > > >
> > > > Hi Jason,
> > > >
> > > > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > > > the refill on its next run if the refill fails? That would seem much simpler.
> > > > This refill work complicates maintenance and often introduces a lot of
> > > > concurrency issues and races.
> > > >
> > > > Thanks.
> > >
> > > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> > >
> > > And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
> >
> > Btw, I see some drivers are doing things as Xuan said. E.g
> > mlx5e_napi_poll() did:
> >
> > busy |= INDIRECT_CALL_2(rq->post_wqes,
> > mlx5e_post_rx_mpwqes,
> > mlx5e_post_rx_wqes,
> >
> > ...
> >
> > if (busy) {
> > if (likely(mlx5e_channel_no_affinity_change(c))) {
> > work_done = budget;
> > goto out;
> > ...
>
>
> is busy a GFP_ATOMIC allocation failure?
Yes, and I think the logic here is to fallback to ksoftirqd if the
allocation fails too much.
Thanks
>
> > >
> > > Not saying refill work is a great hack, but that is the reason for it.
> > > --
> > > MST
> > >
> >
> > Thanks
>
On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
> On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> > > On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > > > >
> > > > > Hi Jason,
> > > > >
> > > > > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > > > > the refill on its next run if the refill fails? That would seem much simpler.
> > > > > This refill work complicates maintenance and often introduces a lot of
> > > > > concurrency issues and races.
> > > > >
> > > > > Thanks.
> > > >
> > > > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> > > >
> > > > And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
> > >
> > > Btw, I see some drivers are doing things as Xuan said. E.g
> > > mlx5e_napi_poll() did:
> > >
> > > busy |= INDIRECT_CALL_2(rq->post_wqes,
> > > mlx5e_post_rx_mpwqes,
> > > mlx5e_post_rx_wqes,
> > >
> > > ...
> > >
> > > if (busy) {
> > > if (likely(mlx5e_channel_no_affinity_change(c))) {
> > > work_done = budget;
> > > goto out;
> > > ...
> >
> >
> > is busy a GFP_ATOMIC allocation failure?
>
> Yes, and I think the logic here is to fallback to ksoftirqd if the
> allocation fails too much.
>
> Thanks
True. I just don't know if this works better or worse than the
current design, but it is certainly simpler and we never actually
worried about the performance of the current one.
So you know, let's roll with this approach.
I do however ask that some testing is done on the patch forcing these OOM
situations just to see if we are missing something obvious.
the beauty is the patch can be very small:
1. patch 1 do not schedule refill ever, just retrigger napi
2. remove all the now dead code
this way patch 1 will be small and backportable to stable.
> >
> > > >
> > > > Not saying refill work is a great hack, but that is the reason for it.
> > > > --
> > > > MST
> > > >
> > >
> > > Thanks
> >
On 12/26/25 14:37, Michael S. Tsirkin wrote:
> On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
>> On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>>> On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
>>>> On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
>>>>>> Hi Jason,
>>>>>>
>>>>>> I'm wondering why we even need this refill work. Why not simply let NAPI retry
>>>>>> the refill on its next run if the refill fails? That would seem much simpler.
>>>>>> This refill work complicates maintenance and often introduces a lot of
>>>>>> concurrency issues and races.
>>>>>>
>>>>>> Thanks.
>>>>> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
>>>>>
>>>>> And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
>>>> Btw, I see some drivers are doing things as Xuan said. E.g
>>>> mlx5e_napi_poll() did:
>>>>
>>>> busy |= INDIRECT_CALL_2(rq->post_wqes,
>>>> mlx5e_post_rx_mpwqes,
>>>> mlx5e_post_rx_wqes,
>>>>
>>>> ...
>>>>
>>>> if (busy) {
>>>> if (likely(mlx5e_channel_no_affinity_change(c))) {
>>>> work_done = budget;
>>>> goto out;
>>>> ...
>>>
>>> is busy a GFP_ATOMIC allocation failure?
>> Yes, and I think the logic here is to fallback to ksoftirqd if the
>> allocation fails too much.
>>
>> Thanks
>
> True. I just don't know if this works better or worse than the
> current design, but it is certainly simpler and we never actually
> worried about the performance of the current one.
>
>
> So you know, let's roll with this approach.
>
> I do however ask that some testing is done on the patch forcing these OOM
> situations just to see if we are missing something obvious.
>
>
> the beauty is the patch can be very small:
> 1. patch 1 do not schedule refill ever, just retrigger napi
> 2. remove all the now dead code
>
> this way patch 1 will be small and backportable to stable.
I've tried 1. with this patch
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 1bb3aeca66c6..9e890aff2d95 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3035,7 +3035,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi,
}
static int virtnet_receive(struct receive_queue *rq, int budget,
- unsigned int *xdp_xmit)
+ unsigned int *xdp_xmit, bool *retry_refill)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct virtnet_rq_stats stats = {};
@@ -3047,12 +3047,8 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);
if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
- if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
- spin_lock(&vi->refill_lock);
- if (vi->refill_enabled)
- schedule_delayed_work(&vi->refill, 0);
- spin_unlock(&vi->refill_lock);
- }
+ if (!try_fill_recv(vi, rq, GFP_ATOMIC))
+ *retry_refill = true;
}
u64_stats_set(&stats.packets, packets);
@@ -3129,18 +3125,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
struct send_queue *sq;
unsigned int received;
unsigned int xdp_xmit = 0;
- bool napi_complete;
+ bool napi_complete, retry_refill = false;
virtnet_poll_cleantx(rq, budget);
- received = virtnet_receive(rq, budget, &xdp_xmit);
+ received = virtnet_receive(rq, budget, &xdp_xmit, &retry_refill);
rq->packets_in_napi += received;
if (xdp_xmit & VIRTIO_XDP_REDIR)
xdp_do_flush();
/* Out of packets? */
- if (received < budget) {
+ if (received < budget && !retry_refill) {
napi_complete = virtqueue_napi_complete(napi, rq->vq, received);
/* Intentionally not taking dim_lock here. This may result in a
* spurious net_dim call. But if that happens virtnet_rx_dim_work
@@ -3230,9 +3226,11 @@ static int virtnet_open(struct net_device *dev)
for (i = 0; i < vi->max_queue_pairs; i++) {
if (i < vi->curr_queue_pairs)
- /* Make sure we have some buffers: if oom use wq. */
- if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
- schedule_delayed_work(&vi->refill, 0);
+ /* If this fails, we will retry later in
+ * NAPI poll, which is scheduled in the below
+ * virtnet_enable_queue_pair
+ */
+ try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
err = virtnet_enable_queue_pair(vi, i);
if (err < 0)
@@ -3473,15 +3471,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
bool refill)
{
bool running = netif_running(vi->dev);
- bool schedule_refill = false;
- if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
- schedule_refill = true;
+ if (refill)
+ /* If this fails, we will retry later in NAPI poll, which is
+ * scheduled in the below virtnet_napi_enable
+ */
+ try_fill_recv(vi, rq, GFP_KERNEL);
+
if (running)
virtnet_napi_enable(rq);
-
- if (schedule_refill)
- schedule_delayed_work(&vi->refill, 0);
}
static void virtnet_rx_resume_all(struct virtnet_info *vi)
@@ -3777,6 +3775,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
struct virtio_net_rss_config_trailer old_rss_trailer;
struct net_device *dev = vi->dev;
struct scatterlist sg;
+ int i;
if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
return 0;
@@ -3829,11 +3828,8 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
}
succ:
vi->curr_queue_pairs = queue_pairs;
- /* virtnet_open() will refill when device is going to up. */
- spin_lock_bh(&vi->refill_lock);
- if (dev->flags & IFF_UP && vi->refill_enabled)
- schedule_delayed_work(&vi->refill, 0);
- spin_unlock_bh(&vi->refill_lock);
+ for (i = 0; i < vi->curr_queue_pairs; i++)
+ try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
return 0;
}
But I got an issue with selftests/drivers/net/hw/xsk_reconfig.py. This
test sets up XDP zerocopy (Xsk) but does not provide any descriptors to
the fill ring. So xsk_pool does not have any descriptors and
try_fill_recv will always fail. The RX NAPI keeps polling. Later, when
we want to disable the xsk_pool, in virtnet_xsk_pool_disable path,
virtnet_xsk_pool_disable
-> virtnet_rq_bind_xsk_pool
-> virtnet_rx_pause
-> __virtnet_rx_pause
-> virtnet_napi_disable
-> napi_disable
We get stuck in napi_disable because the RX NAPI is still polling.
In drivers/net/ethernet/mellanox/mlx5, AFAICS, it uses a state bit for
synchronization between xsk setup (mlx5e_xsk_setup_pool) and RX NAPI
(mlx5e_napi_poll) without using napi_disable/enable. However, in
drivers/net/ethernet/intel/ice,
ice_xsk_pool_setup
-> ice_qp_dis
-> ice_qvec_toggle_napi
-> napi_disable
it still uses napi_disable. Did I miss something in the above patch?
I'll also look into using another synchronization mechanism instead of
napi_disable/enable in the xsk_pool setup path.
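(For context, the state-bit approach looks roughly like the sketch below;
the names are made up for illustration, this is not mlx5's actual code.)

#define MYRQ_STATE_ENABLED 0

struct myrq {
	unsigned long state;
	struct napi_struct napi;
};

/* reconfig path: pause the queue without napi_disable() */
static void myrq_pause(struct myrq *rq)
{
	clear_bit(MYRQ_STATE_ENABLED, &rq->state);
	synchronize_net();	/* wait out NAPI polls that saw the bit set */
}

/* NAPI poll: bail out early while the queue is paused */
static int myrq_poll(struct napi_struct *napi, int budget)
{
	struct myrq *rq = container_of(napi, struct myrq, napi);

	if (!test_bit(MYRQ_STATE_ENABLED, &rq->state))
		return 0;

	/* normal receive + refill path would go here */
	return 0;
}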
Thanks,
Quang Minh.
On Wed, Dec 31, 2025 at 12:29 AM Bui Quang Minh
<minhquangbui99@gmail.com> wrote:
>
> On 12/26/25 14:37, Michael S. Tsirkin wrote:
> > On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
> >> On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>> On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> >>>> On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >>>>> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> >>>>>> Hi Jason,
> >>>>>>
> >>>>>> I'm wondering why we even need this refill work. Why not simply let NAPI retry
> >>>>>> the refill on its next run if the refill fails? That would seem much simpler.
> >>>>>> This refill work complicates maintenance and often introduces a lot of
> >>>>>> concurrency issues and races.
> >>>>>>
> >>>>>> Thanks.
> >>>>> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> >>>>>
> >>>>> And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
> >>>> Btw, I see some drivers are doing things as Xuan said. E.g
> >>>> mlx5e_napi_poll() did:
> >>>>
> >>>> busy |= INDIRECT_CALL_2(rq->post_wqes,
> >>>> mlx5e_post_rx_mpwqes,
> >>>> mlx5e_post_rx_wqes,
> >>>>
> >>>> ...
> >>>>
> >>>> if (busy) {
> >>>> if (likely(mlx5e_channel_no_affinity_change(c))) {
> >>>> work_done = budget;
> >>>> goto out;
> >>>> ...
> >>>
> >>> is busy a GFP_ATOMIC allocation failure?
> >> Yes, and I think the logic here is to fallback to ksoftirqd if the
> >> allocation fails too much.
> >>
> >> Thanks
> >
> > True. I just don't know if this works better or worse than the
> > current design, but it is certainly simpler and we never actually
> > worried about the performance of the current one.
> >
> >
> > So you know, let's roll with this approach.
> >
> > I do however ask that some testing is done on the patch forcing these OOM
> > situations just to see if we are missing something obvious.
> >
> >
> > the beauty is the patch can be very small:
> > 1. patch 1 do not schedule refill ever, just retrigger napi
> > 2. remove all the now dead code
> >
> > this way patch 1 will be small and backportable to stable.
>
> I've tried 1. with this patch
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 1bb3aeca66c6..9e890aff2d95 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -3035,7 +3035,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi,
> }
>
> static int virtnet_receive(struct receive_queue *rq, int budget,
> - unsigned int *xdp_xmit)
> + unsigned int *xdp_xmit, bool *retry_refill)
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> struct virtnet_rq_stats stats = {};
> @@ -3047,12 +3047,8 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);
>
> if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
> - if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
> - spin_lock(&vi->refill_lock);
> - if (vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock(&vi->refill_lock);
> - }
> + if (!try_fill_recv(vi, rq, GFP_ATOMIC))
> + *retry_refill = true;
> }
>
> u64_stats_set(&stats.packets, packets);
> @@ -3129,18 +3125,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
> struct send_queue *sq;
> unsigned int received;
> unsigned int xdp_xmit = 0;
> - bool napi_complete;
> + bool napi_complete, retry_refill = false;
>
> virtnet_poll_cleantx(rq, budget);
>
> - received = virtnet_receive(rq, budget, &xdp_xmit);
> + received = virtnet_receive(rq, budget, &xdp_xmit, &retry_refill);
> rq->packets_in_napi += received;
>
> if (xdp_xmit & VIRTIO_XDP_REDIR)
> xdp_do_flush();
>
> /* Out of packets? */
> - if (received < budget) {
> + if (received < budget && !retry_refill) {
But you didn't return the budget when we need to retry here?
> napi_complete = virtqueue_napi_complete(napi, rq->vq, received);
> /* Intentionally not taking dim_lock here. This may result in a
> * spurious net_dim call. But if that happens virtnet_rx_dim_work
> @@ -3230,9 +3226,11 @@ static int virtnet_open(struct net_device *dev)
>
> for (i = 0; i < vi->max_queue_pairs; i++) {
> if (i < vi->curr_queue_pairs)
> - /* Make sure we have some buffers: if oom use wq. */
> - if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> - schedule_delayed_work(&vi->refill, 0);
> + /* If this fails, we will retry later in
> + * NAPI poll, which is scheduled in the below
> + * virtnet_enable_queue_pair
> + */
> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>
> err = virtnet_enable_queue_pair(vi, i);
> if (err < 0)
> @@ -3473,15 +3471,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
> bool refill)
> {
> bool running = netif_running(vi->dev);
> - bool schedule_refill = false;
>
> - if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
> - schedule_refill = true;
> + if (refill)
> + /* If this fails, we will retry later in NAPI poll, which is
> + * scheduled in the below virtnet_napi_enable
> + */
> + try_fill_recv(vi, rq, GFP_KERNEL);
> +
> if (running)
> virtnet_napi_enable(rq);
> -
> - if (schedule_refill)
> - schedule_delayed_work(&vi->refill, 0);
> }
>
> static void virtnet_rx_resume_all(struct virtnet_info *vi)
> @@ -3777,6 +3775,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
> struct virtio_net_rss_config_trailer old_rss_trailer;
> struct net_device *dev = vi->dev;
> struct scatterlist sg;
> + int i;
>
> if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
> return 0;
> @@ -3829,11 +3828,8 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
> }
> succ:
> vi->curr_queue_pairs = queue_pairs;
> - /* virtnet_open() will refill when device is going to up. */
> - spin_lock_bh(&vi->refill_lock);
> - if (dev->flags & IFF_UP && vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock_bh(&vi->refill_lock);
> + for (i = 0; i < vi->curr_queue_pairs; i++)
> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>
> return 0;
> }
>
>
> But I got an issue with selftests/drivers/net/hw/xsk_reconfig.py. This
> test sets up XDP zerocopy (Xsk) but does not provide any descriptors to
> the fill ring. So xsk_pool does not have any descriptors and
> try_fill_recv will always fail. The RX NAPI keeps polling. Later, when
> we want to disable the xsk_pool, in virtnet_xsk_pool_disable path,
>
> virtnet_xsk_pool_disable
> -> virtnet_rq_bind_xsk_pool
> -> virtnet_rx_pause
> -> __virtnet_rx_pause
> -> virtnet_napi_disable
> -> napi_disable
>
> We get stuck in napi_disable because the RX NAPI is still polling.
napi_disable will set NAPI_DISABLE bit, no?
>
> In drivers/net/ethernet/mellanox/mlx5, AFAICS, it uses state bit for
> synchronization between xsk setup (mlx5e_xsk_setup_pool) with RX NAPI
> (mlx5e_napi_poll) without using napi_disable/enable. However, in
> drivers/net/ethernet/intel/ice,
>
> ice_xsk_pool_setup
> -> ice_qp_dis
> -> ice_qvec_toggle_napi
> -> napi_disable
>
> it still uses napi_disable. Did I miss something in the above patch?
> I'll try to look into using another synchronization instead of
> napi_disable/enable in xsk_pool setup path too.
>
> Thanks,
> Quang Minh.
>
Thanks
On 12/31/25 14:30, Jason Wang wrote:
> On Wed, Dec 31, 2025 at 12:29 AM Bui Quang Minh
> <minhquangbui99@gmail.com> wrote:
>> On 12/26/25 14:37, Michael S. Tsirkin wrote:
>>> On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
>>>> On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>> On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
>>>>>> On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>>>>>>> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
>>>>>>>> Hi Jason,
>>>>>>>>
>>>>>>>> I'm wondering why we even need this refill work. Why not simply let NAPI retry
>>>>>>>> the refill on its next run if the refill fails? That would seem much simpler.
>>>>>>>> This refill work complicates maintenance and often introduces a lot of
>>>>>>>> concurrency issues and races.
>>>>>>>>
>>>>>>>> Thanks.
>>>>>>> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
>>>>>>>
>>>>>>> And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
>>>>>> Btw, I see some drivers are doing things as Xuan said. E.g
>>>>>> mlx5e_napi_poll() did:
>>>>>>
>>>>>> busy |= INDIRECT_CALL_2(rq->post_wqes,
>>>>>> mlx5e_post_rx_mpwqes,
>>>>>> mlx5e_post_rx_wqes,
>>>>>>
>>>>>> ...
>>>>>>
>>>>>> if (busy) {
>>>>>> if (likely(mlx5e_channel_no_affinity_change(c))) {
>>>>>> work_done = budget;
>>>>>> goto out;
>>>>>> ...
>>>>> is busy a GFP_ATOMIC allocation failure?
>>>> Yes, and I think the logic here is to fallback to ksoftirqd if the
>>>> allocation fails too much.
>>>>
>>>> Thanks
>>> True. I just don't know if this works better or worse than the
>>> current design, but it is certainly simpler and we never actually
>>> worried about the performance of the current one.
>>>
>>>
>>> So you know, let's roll with this approach.
>>>
>>> I do however ask that some testing is done on the patch forcing these OOM
>>> situations just to see if we are missing something obvious.
>>>
>>>
>>> the beauty is the patch can be very small:
>>> 1. patch 1 do not schedule refill ever, just retrigger napi
>>> 2. remove all the now dead code
>>>
>>> this way patch 1 will be small and backportable to stable.
>> I've tried 1. with this patch
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 1bb3aeca66c6..9e890aff2d95 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -3035,7 +3035,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi,
>> }
>>
>> static int virtnet_receive(struct receive_queue *rq, int budget,
>> - unsigned int *xdp_xmit)
>> + unsigned int *xdp_xmit, bool *retry_refill)
>> {
>> struct virtnet_info *vi = rq->vq->vdev->priv;
>> struct virtnet_rq_stats stats = {};
>> @@ -3047,12 +3047,8 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
>> packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);
>>
>> if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
>> - if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
>> - spin_lock(&vi->refill_lock);
>> - if (vi->refill_enabled)
>> - schedule_delayed_work(&vi->refill, 0);
>> - spin_unlock(&vi->refill_lock);
>> - }
>> + if (!try_fill_recv(vi, rq, GFP_ATOMIC))
>> + *retry_refill = true;
>> }
>>
>> u64_stats_set(&stats.packets, packets);
>> @@ -3129,18 +3125,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
>> struct send_queue *sq;
>> unsigned int received;
>> unsigned int xdp_xmit = 0;
>> - bool napi_complete;
>> + bool napi_complete, retry_refill = false;
>>
>> virtnet_poll_cleantx(rq, budget);
>>
>> - received = virtnet_receive(rq, budget, &xdp_xmit);
>> + received = virtnet_receive(rq, budget, &xdp_xmit, &retry_refill);
>> rq->packets_in_napi += received;
>>
>> if (xdp_xmit & VIRTIO_XDP_REDIR)
>> xdp_do_flush();
>>
>> /* Out of packets? */
>> - if (received < budget) {
>> + if (received < budget && !retry_refill) {
> But you didn't return the budget when we need to retry here?
You are right. Returning the full budget when we need to retry solves
the issue. In __napi_poll, if we return the budget, it checks for a
pending disable via napi_disable_pending(). If one is pending, the NAPI
is descheduled and napi_disable() can proceed.
Thanks,
Quang Minh.
>
>> napi_complete = virtqueue_napi_complete(napi, rq->vq, received);
>> /* Intentionally not taking dim_lock here. This may result in a
>> * spurious net_dim call. But if that happens virtnet_rx_dim_work
>> @@ -3230,9 +3226,11 @@ static int virtnet_open(struct net_device *dev)
>>
>> for (i = 0; i < vi->max_queue_pairs; i++) {
>> if (i < vi->curr_queue_pairs)
>> - /* Make sure we have some buffers: if oom use wq. */
>> - if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
>> - schedule_delayed_work(&vi->refill, 0);
>> + /* If this fails, we will retry later in
>> + * NAPI poll, which is scheduled in the below
>> + * virtnet_enable_queue_pair
>> + */
>> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>>
>> err = virtnet_enable_queue_pair(vi, i);
>> if (err < 0)
>> @@ -3473,15 +3471,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
>> bool refill)
>> {
>> bool running = netif_running(vi->dev);
>> - bool schedule_refill = false;
>>
>> - if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
>> - schedule_refill = true;
>> + if (refill)
>> + /* If this fails, we will retry later in NAPI poll, which is
>> + * scheduled in the below virtnet_napi_enable
>> + */
>> + try_fill_recv(vi, rq, GFP_KERNEL);
>> +
>> if (running)
>> virtnet_napi_enable(rq);
>> -
>> - if (schedule_refill)
>> - schedule_delayed_work(&vi->refill, 0);
>> }
>>
>> static void virtnet_rx_resume_all(struct virtnet_info *vi)
>> @@ -3777,6 +3775,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
>> struct virtio_net_rss_config_trailer old_rss_trailer;
>> struct net_device *dev = vi->dev;
>> struct scatterlist sg;
>> + int i;
>>
>> if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
>> return 0;
>> @@ -3829,11 +3828,8 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
>> }
>> succ:
>> vi->curr_queue_pairs = queue_pairs;
>> - /* virtnet_open() will refill when device is going to up. */
>> - spin_lock_bh(&vi->refill_lock);
>> - if (dev->flags & IFF_UP && vi->refill_enabled)
>> - schedule_delayed_work(&vi->refill, 0);
>> - spin_unlock_bh(&vi->refill_lock);
>> + for (i = 0; i < vi->curr_queue_pairs; i++)
>> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>>
>> return 0;
>> }
>>
>>
>> But I got an issue with selftests/drivers/net/hw/xsk_reconfig.py. This
>> test sets up XDP zerocopy (Xsk) but does not provide any descriptors to
>> the fill ring. So xsk_pool does not have any descriptors and
>> try_fill_recv will always fail. The RX NAPI keeps polling. Later, when
>> we want to disable the xsk_pool, in virtnet_xsk_pool_disable path,
>>
>> virtnet_xsk_pool_disable
>> -> virtnet_rq_bind_xsk_pool
>> -> virtnet_rx_pause
>> -> __virtnet_rx_pause
>> -> virtnet_napi_disable
>> -> napi_disable
>>
>> We get stuck in napi_disable because the RX NAPI is still polling.
> napi_disable will set NAPI_DISABLE bit, no?
>
>> In drivers/net/ethernet/mellanox/mlx5, AFAICS, it uses state bit for
>> synchronization between xsk setup (mlx5e_xsk_setup_pool) with RX NAPI
>> (mlx5e_napi_poll) without using napi_disable/enable. However, in
>> drivers/net/ethernet/intel/ice,
>>
>> ice_xsk_pool_setup
>> -> ice_qp_dis
>> -> ice_qvec_toggle_napi
>> -> napi_disable
>>
>> it still uses napi_disable. Did I miss something in the above patch?
>> I'll try to look into using another synchronization instead of
>> napi_disable/enable in xsk_pool setup path too.
>>
>> Thanks,
>> Quang Minh.
>>
> Thanks
>
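
To make the budget-return behaviour discussed in the inline reply above concrete, here is a condensed sketch of the virtnet_poll() idea under the diff's assumptions (it reuses the retry_refill flag from the patch; the TX-clean, XDP-flush and stats handling of the real function are omitted, and the function is renamed to mark it as a sketch):

/*
 * Condensed sketch of the idea from the diff above: when the GFP_ATOMIC
 * refill failed, report the full budget so the NAPI stays on the poll
 * list. __napi_poll() then either repolls us or, if a napi_disable() is
 * pending (napi_disable_pending()), completes the NAPI so the disable
 * path can make progress.
 */
static int sketch_virtnet_poll(struct napi_struct *napi, int budget)
{
	struct receive_queue *rq = container_of(napi, struct receive_queue, napi);
	unsigned int xdp_xmit = 0;
	bool retry_refill = false;
	int received;

	received = virtnet_receive(rq, budget, &xdp_xmit, &retry_refill);

	if (received < budget && !retry_refill) {
		/* Ring drained and refill OK: complete as usual. */
		virtqueue_napi_complete(napi, rq->vq, received);
		return received;
	}

	/* Still busy, or the refill must be retried on the next poll. */
	return budget;
}

The important part for the hang described earlier is the return of the full budget: it keeps the NAPI scheduled for the retry, while still letting __napi_poll() observe a pending napi_disable() and deschedule it.
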
On Tue, Dec 30, 2025 at 11:28:50PM +0700, Bui Quang Minh wrote:
> On 12/26/25 14:37, Michael S. Tsirkin wrote:
> > On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
> > > On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> > > > > On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > > > > > > Hi Jason,
> > > > > > >
> > > > > > > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > > > > > > the refill on its next run if the refill fails? That would seem much simpler.
> > > > > > > This refill work complicates maintenance and often introduces a lot of
> > > > > > > concurrency issues and races.
> > > > > > >
> > > > > > > Thanks.
> > > > > > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> > > > > >
> > > > > > And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
> > > > > Btw, I see some drivers are doing things as Xuan said. E.g
> > > > > mlx5e_napi_poll() did:
> > > > >
> > > > > busy |= INDIRECT_CALL_2(rq->post_wqes,
> > > > > mlx5e_post_rx_mpwqes,
> > > > > mlx5e_post_rx_wqes,
> > > > >
> > > > > ...
> > > > >
> > > > > if (busy) {
> > > > > if (likely(mlx5e_channel_no_affinity_change(c))) {
> > > > > work_done = budget;
> > > > > goto out;
> > > > > ...
> > > >
> > > > is busy a GFP_ATOMIC allocation failure?
> > > Yes, and I think the logic here is to fallback to ksoftirqd if the
> > > allocation fails too much.
> > >
> > > Thanks
> >
> > True. I just don't know if this works better or worse than the
> > current design, but it is certainly simpler and we never actually
> > worried about the performance of the current one.
> >
> >
> > So you know, let's roll with this approach.
> >
> > I do however ask that some testing is done on the patch forcing these OOM
> > situations just to see if we are missing something obvious.
> >
> >
> > the beauty is the patch can be very small:
> > 1. patch 1 do not schedule refill ever, just retrigger napi
> > 2. remove all the now dead code
> >
> > this way patch 1 will be small and backportable to stable.
>
> I've tried 1. with this patch
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 1bb3aeca66c6..9e890aff2d95 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -3035,7 +3035,7 @@ static int virtnet_receive_packets(struct virtnet_info *vi,
> }
>
> static int virtnet_receive(struct receive_queue *rq, int budget,
> - unsigned int *xdp_xmit)
> + unsigned int *xdp_xmit, bool *retry_refill)
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> struct virtnet_rq_stats stats = {};
> @@ -3047,12 +3047,8 @@ static int virtnet_receive(struct receive_queue *rq, int budget,
> packets = virtnet_receive_packets(vi, rq, budget, xdp_xmit, &stats);
>
> if (rq->vq->num_free > min((unsigned int)budget, virtqueue_get_vring_size(rq->vq)) / 2) {
> - if (!try_fill_recv(vi, rq, GFP_ATOMIC)) {
> - spin_lock(&vi->refill_lock);
> - if (vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock(&vi->refill_lock);
> - }
> + if (!try_fill_recv(vi, rq, GFP_ATOMIC))
> + *retry_refill = true;
> }
>
> u64_stats_set(&stats.packets, packets);
> @@ -3129,18 +3125,18 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
> struct send_queue *sq;
> unsigned int received;
> unsigned int xdp_xmit = 0;
> - bool napi_complete;
> + bool napi_complete, retry_refill = false;
>
> virtnet_poll_cleantx(rq, budget);
>
> - received = virtnet_receive(rq, budget, &xdp_xmit);
> + received = virtnet_receive(rq, budget, &xdp_xmit, &retry_refill);
> rq->packets_in_napi += received;
>
> if (xdp_xmit & VIRTIO_XDP_REDIR)
> xdp_do_flush();
>
> /* Out of packets? */
> - if (received < budget) {
> + if (received < budget && !retry_refill) {
> napi_complete = virtqueue_napi_complete(napi, rq->vq, received);
> /* Intentionally not taking dim_lock here. This may result in a
> * spurious net_dim call. But if that happens virtnet_rx_dim_work
> @@ -3230,9 +3226,11 @@ static int virtnet_open(struct net_device *dev)
>
> for (i = 0; i < vi->max_queue_pairs; i++) {
> if (i < vi->curr_queue_pairs)
> - /* Make sure we have some buffers: if oom use wq. */
> - if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> - schedule_delayed_work(&vi->refill, 0);
> + /* If this fails, we will retry later in
> + * NAPI poll, which is scheduled in the below
> + * virtnet_enable_queue_pair
> + */
> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>
> err = virtnet_enable_queue_pair(vi, i);
> if (err < 0)
> @@ -3473,15 +3471,15 @@ static void __virtnet_rx_resume(struct virtnet_info *vi,
> bool refill)
> {
> bool running = netif_running(vi->dev);
> - bool schedule_refill = false;
>
> - if (refill && !try_fill_recv(vi, rq, GFP_KERNEL))
> - schedule_refill = true;
> + if (refill)
> + /* If this fails, we will retry later in NAPI poll, which is
> + * scheduled in the below virtnet_napi_enable
> + */
> + try_fill_recv(vi, rq, GFP_KERNEL);
> +
> if (running)
> virtnet_napi_enable(rq);
> -
> - if (schedule_refill)
> - schedule_delayed_work(&vi->refill, 0);
> }
>
> static void virtnet_rx_resume_all(struct virtnet_info *vi)
> @@ -3777,6 +3775,7 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
> struct virtio_net_rss_config_trailer old_rss_trailer;
> struct net_device *dev = vi->dev;
> struct scatterlist sg;
> + int i;
>
> if (!vi->has_cvq || !virtio_has_feature(vi->vdev, VIRTIO_NET_F_MQ))
> return 0;
> @@ -3829,11 +3828,8 @@ static int virtnet_set_queues(struct virtnet_info *vi, u16 queue_pairs)
> }
> succ:
> vi->curr_queue_pairs = queue_pairs;
> - /* virtnet_open() will refill when device is going to up. */
> - spin_lock_bh(&vi->refill_lock);
> - if (dev->flags & IFF_UP && vi->refill_enabled)
> - schedule_delayed_work(&vi->refill, 0);
> - spin_unlock_bh(&vi->refill_lock);
> + for (i = 0; i < vi->curr_queue_pairs; i++)
> + try_fill_recv(vi, &vi->rq[i], GFP_KERNEL);
>
> return 0;
> }
>
>
> But I got an issue with selftests/drivers/net/hw/xsk_reconfig.py. This
> test sets up XDP zerocopy (Xsk) but does not provide any descriptors to
> the fill ring. So xsk_pool does not have any descriptors and
> try_fill_recv will always fail. The RX NAPI keeps polling. Later, when
> we want to disable the xsk_pool, in virtnet_xsk_pool_disable path,
>
> virtnet_xsk_pool_disable
> -> virtnet_rq_bind_xsk_pool
> -> virtnet_rx_pause
> -> __virtnet_rx_pause
> -> virtnet_napi_disable
> -> napi_disable
>
> We get stuck in napi_disable because the RX NAPI is still polling.
>
> In drivers/net/ethernet/mellanox/mlx5, AFAICS, it uses state bit for
> synchronization between xsk setup (mlx5e_xsk_setup_pool) with RX NAPI
> (mlx5e_napi_poll) without using napi_disable/enable. However, in
> drivers/net/ethernet/intel/ice,
>
> ice_xsk_pool_setup
> -> ice_qp_dis
> -> ice_qvec_toggle_napi
> -> napi_disable
>
> it still uses napi_disable. Did I miss something in the above patch?
> I'll try to look into using another synchronization instead of
> napi_disable/enable in xsk_pool setup path too.
>
> Thanks,
> Quang Minh.
... and the simplicity is out of the window. Up to you but maybe
it is easier to keep plugging the holes in the current approach.
It has been in the field for a very long time now, at least.
--
MST
On Fri, Dec 26, 2025 at 3:37 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, Dec 26, 2025 at 09:31:26AM +0800, Jason Wang wrote:
> > On Fri, Dec 26, 2025 at 12:27 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Thu, Dec 25, 2025 at 03:33:29PM +0800, Jason Wang wrote:
> > > > On Wed, Dec 24, 2025 at 9:48 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > > > > >
> > > > > > Hi Jason,
> > > > > >
> > > > > > I'm wondering why we even need this refill work. Why not simply let NAPI retry
> > > > > > the refill on its next run if the refill fails? That would seem much simpler.
> > > > > > This refill work complicates maintenance and often introduces a lot of
> > > > > > concurrency issues and races.
> > > > > >
> > > > > > Thanks.
> > > > >
> > > > > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> > > > >
> > > > > And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
> > > >
> > > > Btw, I see some drivers are doing things as Xuan said. E.g
> > > > mlx5e_napi_poll() did:
> > > >
> > > > busy |= INDIRECT_CALL_2(rq->post_wqes,
> > > > mlx5e_post_rx_mpwqes,
> > > > mlx5e_post_rx_wqes,
> > > >
> > > > ...
> > > >
> > > > if (busy) {
> > > > if (likely(mlx5e_channel_no_affinity_change(c))) {
> > > > work_done = budget;
> > > > goto out;
> > > > ...
> > >
> > >
> > > is busy a GFP_ATOMIC allocation failure?
> >
> > Yes, and I think the logic here is to fallback to ksoftirqd if the
> > allocation fails too much.
> >
> > Thanks
>
>
> True. I just don't know if this works better or worse than the
> current design, but it is certainly simpler and we never actually
> worried about the performance of the current one.
>
>
> So you know, let's roll with this approach.
>
> I do however ask that some testing is done on the patch forcing these OOM
> situations just to see if we are missing something obvious.
>
>
> the beauty is the patch can be very small:
> 1. patch 1 do not schedule refill ever, just retrigger napi
> 2. remove all the now dead code
>
> this way patch 1 will be small and backportable to stable.
I fully agree here.
Thanks
On 12/24/25 08:47, Michael S. Tsirkin wrote:
> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
>> Hi Jason,
>>
>> I'm wondering why we even need this refill work. Why not simply let NAPI retry
>> the refill on its next run if the refill fails? That would seem much simpler.
>> This refill work complicates maintenance and often introduces a lot of
>> concurrency issues and races.
>>
>> Thanks.
> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
>
> And if GFP_ATOMIC failed, aggressively retrying might not be a great idea.
>
> Not saying refill work is a great hack, but that is the reason for it.

If there are no allocated receive buffers and the NAPI refill fails, the
host will not send any packets. If there is no busy polling loop either,
the RX will be stuck. That's also the reason why we need the refill
work. Is that correct?

Thanks,
Quang Minh.
On 12/24/25 23:49, Bui Quang Minh wrote:
> On 12/24/25 08:47, Michael S. Tsirkin wrote:
>> On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
>>> Hi Jason,
>>>
>>> I'm wondering why we even need this refill work. Why not simply let
>>> NAPI retry the refill on its next run if the refill fails? That would
>>> seem much simpler. This refill work complicates maintenance and often
>>> introduces a lot of concurrency issues and races.
>>>
>>> Thanks.
>> refill work can refill from GFP_KERNEL, napi only from ATOMIC.
>>
>> And if GFP_ATOMIC failed, aggressively retrying might not be a great
>> idea.
>>
>> Not saying refill work is a great hack, but that is the reason for it.
>
> In case no allocated received buffer and NAPI refill fails, the host
> will not send any packets. If there is no busy polling loop either,
> the RX will be stuck. That's also the reason why we need refill work.
> Is it correct?

I've just looked at mlx5e_napi_poll which is mentioned by Jason. So if
we want to retry refilling in the next NAPI, we can set a bool (e.g.
retry_refill) in virtnet_receive, then in virtnet_poll, we don't call
virtqueue_napi_complete. As a result, our napi poll is still in the
softirq's poll list, so we don't need a new host packet to trigger
virtqueue's callback which calls napi_schedule again.

Thanks,
Quang Minh.
On Thu, Dec 25, 2025 at 10:55:36PM +0700, Bui Quang Minh wrote:
> On 12/24/25 23:49, Bui Quang Minh wrote:
> > On 12/24/25 08:47, Michael S. Tsirkin wrote:
> > > On Wed, Dec 24, 2025 at 09:37:14AM +0800, Xuan Zhuo wrote:
> > > > Hi Jason,
> > > >
> > > > I'm wondering why we even need this refill work. Why not simply
> > > > let NAPI retry the refill on its next run if the refill fails?
> > > > That would seem much simpler. This refill work complicates
> > > > maintenance and often introduces a lot of concurrency issues and
> > > > races.
> > > >
> > > > Thanks.
> > > refill work can refill from GFP_KERNEL, napi only from ATOMIC.
> > >
> > > And if GFP_ATOMIC failed, aggressively retrying might not be a great
> > > idea.
> > >
> > > Not saying refill work is a great hack, but that is the reason for it.
> >
> > In case no allocated received buffer and NAPI refill fails, the host
> > will not send any packets. If there is no busy polling loop either, the
> > RX will be stuck. That's also the reason why we need refill work. Is it
> > correct?
>
> I've just looked at mlx5e_napi_poll which is mentioned by Jason. So if we
> want to retry refilling in the next NAPI, we can set a bool (e.g.
> retry_refill) in virtnet_receive, then in virtnet_poll, we don't call
> virtqueue_napi_complete. As a result, our napi poll is still in the
> softirq's poll list, so we don't need a new host packet to trigger
> virtqueue's callback which calls napi_schedule again.
>
> Thanks,
> Quang Minh.

yes yes. but aggressively retrying GFP_ATOMIC until it works is not the
thing to do.

--
MST