When pausing rx (e.g. setting up xdp, an xsk pool, or an rx resize), we
call napi_disable() on the receive queue's napi. The delayed
refill_work also calls napi_disable() on the receive queue's napi. This
can lead to a deadlock when napi_disable() is called on an already
disabled napi. This scenario can be reproduced by binding an XDP socket
to a virtio-net interface without setting up the fill ring. As a
result, try_fill_recv will fail until the fill ring is set up and
refill_work is scheduled.
This commit adds virtnet_rx_(pause/resume)_all helpers and fixes up
virtnet_rx_pause/resume to disable any future delayed refill_work and
cancel all in-flight instances before calling napi_disable() to pause
the rx.
Fixes: 6a4763e26803 ("virtio_net: support rx queue resize")
Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set")
Fixes: 09d2b3182c8e ("virtio_net: xsk: bind/unbind xsk for rx")
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
drivers/net/virtio_net.c | 60 ++++++++++++++++++++++++++++++++++------
1 file changed, 51 insertions(+), 9 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7e4617216a4b..4361b91ccc64 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3342,10 +3342,53 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
return NETDEV_TX_OK;
}
+static void virtnet_rx_pause_all(struct virtnet_info *vi)
+{
+ bool running = netif_running(vi->dev);
+
+ /*
+ * Make sure refill_work does not run concurrently to
+ * avoid napi_disable race which leads to deadlock.
+ */
+ disable_delayed_refill(vi);
+ cancel_delayed_work_sync(&vi->refill);
+ if (running) {
+ int i;
+
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ virtnet_napi_disable(&vi->rq[i]);
+ virtnet_cancel_dim(vi, &vi->rq[i].dim);
+ }
+ }
+}
+
+static void virtnet_rx_resume_all(struct virtnet_info *vi)
+{
+ bool running = netif_running(vi->dev);
+ int i;
+
+ enable_delayed_refill(vi);
+ for (i = 0; i < vi->max_queue_pairs; i++) {
+ if (i < vi->curr_queue_pairs) {
+ if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
+ schedule_delayed_work(&vi->refill, 0);
+ }
+
+ if (running)
+ virtnet_napi_enable(&vi->rq[i]);
+ }
+}
+
static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
{
bool running = netif_running(vi->dev);
+ /*
+ * Make sure refill_work does not run concurrently to
+ * avoid napi_disable race which leads to deadlock.
+ */
+ disable_delayed_refill(vi);
+ cancel_delayed_work_sync(&vi->refill);
if (running) {
virtnet_napi_disable(rq);
virtnet_cancel_dim(vi, &rq->dim);
@@ -3356,6 +3399,7 @@ static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
{
bool running = netif_running(vi->dev);
+ enable_delayed_refill(vi);
if (!try_fill_recv(vi, rq, GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
@@ -5959,12 +6003,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
if (prog)
bpf_prog_add(prog, vi->max_queue_pairs - 1);
+ virtnet_rx_pause_all(vi);
+
/* Make sure NAPI is not using any XDP TX queues for RX. */
if (netif_running(dev)) {
- for (i = 0; i < vi->max_queue_pairs; i++) {
- virtnet_napi_disable(&vi->rq[i]);
+ for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_tx_disable(&vi->sq[i]);
- }
}
if (!prog) {
@@ -5996,13 +6040,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
vi->xdp_enabled = false;
}
+ virtnet_rx_resume_all(vi);
for (i = 0; i < vi->max_queue_pairs; i++) {
if (old_prog)
bpf_prog_put(old_prog);
- if (netif_running(dev)) {
- virtnet_napi_enable(&vi->rq[i]);
+ if (netif_running(dev))
virtnet_napi_tx_enable(&vi->sq[i]);
- }
}
return 0;
@@ -6014,11 +6057,10 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
}
+ virtnet_rx_resume_all(vi);
if (netif_running(dev)) {
- for (i = 0; i < vi->max_queue_pairs; i++) {
- virtnet_napi_enable(&vi->rq[i]);
+ for (i = 0; i < vi->max_queue_pairs; i++)
virtnet_napi_tx_enable(&vi->sq[i]);
- }
}
if (prog)
bpf_prog_sub(prog, vi->max_queue_pairs - 1);
--
2.43.0
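For reference, the disable_delayed_refill()/enable_delayed_refill() helpers
the patch builds on already exist in drivers/net/virtio_net.c; a paraphrased
sketch of what they do (not verbatim source):

/* Paraphrased sketch of the existing virtio_net.c helpers, not verbatim
 * source. refill_enabled is checked under refill_lock before refill_work
 * is (re)scheduled, so clearing it plus cancel_delayed_work_sync() ensures
 * no refill_work can run concurrently with the rx pause.
 */
static void disable_delayed_refill(struct virtnet_info *vi)
{
	spin_lock_bh(&vi->refill_lock);
	vi->refill_enabled = false;
	spin_unlock_bh(&vi->refill_lock);
}

static void enable_delayed_refill(struct virtnet_info *vi)
{
	spin_lock_bh(&vi->refill_lock);
	vi->refill_enabled = true;
	spin_unlock_bh(&vi->refill_lock);
}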
On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
> napi_disable() on the receive queue's napi. In delayed refill_work, it
> also calls napi_disable() on the receive queue's napi. This can leads to
> deadlock when napi_disable() is called on an already disabled napi. This
> scenario can be reproducible by binding a XDP socket to virtio-net
> interface without setting up the fill ring. As a result, try_fill_recv
> will fail until the fill ring is set up and refill_work is scheduled.
So, what is the problem? That refill_work is waiting? As far as I know,
that thread will sleep for some time, so the CPU can do other work.
Thanks.
>
> This commit adds virtnet_rx_(pause/resume)_all helpers and fixes up the
> virtnet_rx_resume to disable future and cancel all inflights delayed
> refill_work before calling napi_disable() to pause the rx.
>
> Fixes: 6a4763e26803 ("virtio_net: support rx queue resize")
> Fixes: 4941d472bf95 ("virtio-net: do not reset during XDP set")
> Fixes: 09d2b3182c8e ("virtio_net: xsk: bind/unbind xsk for rx")
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
> drivers/net/virtio_net.c | 60 ++++++++++++++++++++++++++++++++++------
> 1 file changed, 51 insertions(+), 9 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7e4617216a4b..4361b91ccc64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -3342,10 +3342,53 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
> return NETDEV_TX_OK;
> }
>
> +static void virtnet_rx_pause_all(struct virtnet_info *vi)
> +{
> + bool running = netif_running(vi->dev);
> +
> + /*
> + * Make sure refill_work does not run concurrently to
> + * avoid napi_disable race which leads to deadlock.
> + */
> + disable_delayed_refill(vi);
> + cancel_delayed_work_sync(&vi->refill);
> + if (running) {
> + int i;
> +
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + virtnet_napi_disable(&vi->rq[i]);
> + virtnet_cancel_dim(vi, &vi->rq[i].dim);
> + }
> + }
> +}
> +
> +static void virtnet_rx_resume_all(struct virtnet_info *vi)
> +{
> + bool running = netif_running(vi->dev);
> + int i;
> +
> + enable_delayed_refill(vi);
> + for (i = 0; i < vi->max_queue_pairs; i++) {
> + if (i < vi->curr_queue_pairs) {
> + if (!try_fill_recv(vi, &vi->rq[i], GFP_KERNEL))
> + schedule_delayed_work(&vi->refill, 0);
> + }
> +
> + if (running)
> + virtnet_napi_enable(&vi->rq[i]);
> + }
> +}
> +
> static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
> {
> bool running = netif_running(vi->dev);
>
> + /*
> + * Make sure refill_work does not run concurrently to
> + * avoid napi_disable race which leads to deadlock.
> + */
> + disable_delayed_refill(vi);
> + cancel_delayed_work_sync(&vi->refill);
> if (running) {
> virtnet_napi_disable(rq);
> virtnet_cancel_dim(vi, &rq->dim);
> @@ -3356,6 +3399,7 @@ static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
> {
> bool running = netif_running(vi->dev);
>
> + enable_delayed_refill(vi);
> if (!try_fill_recv(vi, rq, GFP_KERNEL))
> schedule_delayed_work(&vi->refill, 0);
>
> @@ -5959,12 +6003,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> if (prog)
> bpf_prog_add(prog, vi->max_queue_pairs - 1);
>
> + virtnet_rx_pause_all(vi);
> +
> /* Make sure NAPI is not using any XDP TX queues for RX. */
> if (netif_running(dev)) {
> - for (i = 0; i < vi->max_queue_pairs; i++) {
> - virtnet_napi_disable(&vi->rq[i]);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> virtnet_napi_tx_disable(&vi->sq[i]);
> - }
> }
>
> if (!prog) {
> @@ -5996,13 +6040,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> vi->xdp_enabled = false;
> }
>
> + virtnet_rx_resume_all(vi);
> for (i = 0; i < vi->max_queue_pairs; i++) {
> if (old_prog)
> bpf_prog_put(old_prog);
> - if (netif_running(dev)) {
> - virtnet_napi_enable(&vi->rq[i]);
> + if (netif_running(dev))
> virtnet_napi_tx_enable(&vi->sq[i]);
> - }
> }
>
> return 0;
> @@ -6014,11 +6057,10 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
> rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog);
> }
>
> + virtnet_rx_resume_all(vi);
> if (netif_running(dev)) {
> - for (i = 0; i < vi->max_queue_pairs; i++) {
> - virtnet_napi_enable(&vi->rq[i]);
> + for (i = 0; i < vi->max_queue_pairs; i++)
> virtnet_napi_tx_enable(&vi->sq[i]);
> - }
> }
> if (prog)
> bpf_prog_sub(prog, vi->max_queue_pairs - 1);
> --
> 2.43.0
>
On 4/7/25 08:03, Xuan Zhuo wrote:
> On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
>> napi_disable() on the receive queue's napi. In delayed refill_work, it
>> also calls napi_disable() on the receive queue's napi. This can leads to
>> deadlock when napi_disable() is called on an already disabled napi. This
>> scenario can be reproducible by binding a XDP socket to virtio-net
>> interface without setting up the fill ring. As a result, try_fill_recv
>> will fail until the fill ring is set up and refill_work is scheduled.
>
> So, what is the problem? The refill_work is waiting? As I know, that thread
> will sleep some time, so the cpu can do other work.

When napi_disable is called on an already disabled napi, it will sleep
in napi_disable_locked while still holding the netdev_lock. As a result,
the later napi_enable gets stuck too as it cannot acquire the netdev_lock.
This leads to refill_work and the pause-then-resume tx being stuck
altogether.

Thanks,
Quang Minh.
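For context, the hang can be seen from the core NAPI helpers themselves.
A simplified paraphrase (not verbatim net/core/dev.c, details trimmed) of
the behaviour after commit 413f0271f396:

/* Simplified paraphrase of the core helpers, not verbatim source.
 * napi_disable() leaves NAPI_STATE_SCHED set, so disabling an already
 * disabled napi never gets past the loop below -- and it sleeps there
 * while holding the per-device instance lock, so a concurrent
 * napi_enable() (which also takes netdev_lock) blocks forever.
 */
void napi_disable_locked(struct napi_struct *n)
{
	set_bit(NAPI_STATE_DISABLE, &n->state);

	/* Take ownership by setting SCHED; a disabled napi already has
	 * SCHED set, so a second disable spins here indefinitely.
	 */
	while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
		usleep_range(20, 200);

	clear_bit(NAPI_STATE_DISABLE, &n->state);
}

void napi_disable(struct napi_struct *n)
{
	netdev_lock(n->dev);	/* held for the whole wait above */
	napi_disable_locked(n);
	netdev_unlock(n->dev);
}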
On Mon, Apr 7, 2025 at 10:27 AM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>
> On 4/7/25 08:03, Xuan Zhuo wrote:
> > On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
> >> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
> >> napi_disable() on the receive queue's napi. In delayed refill_work, it
> >> also calls napi_disable() on the receive queue's napi. This can leads to
> >> deadlock when napi_disable() is called on an already disabled napi. This
> >> scenario can be reproducible by binding a XDP socket to virtio-net
> >> interface without setting up the fill ring. As a result, try_fill_recv
> >> will fail until the fill ring is set up and refill_work is scheduled.
> >
> > So, what is the problem? The refill_work is waiting? As I know, that thread
> > will sleep some time, so the cpu can do other work.
>
> When napi_disable is called on an already disabled napi, it will sleep
> in napi_disable_locked while still holding the netdev_lock. As a result,
> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
> This leads to refill_work and the pause-then-resume tx are stuck altogether.
This needs to be added to the changelog. And it looks like this is a fix for
commit 413f0271f3966e0c73d4937963f19335af19e628
Author: Jakub Kicinski <kuba@kernel.org>
Date: Tue Jan 14 19:53:14 2025 -0800
net: protect NAPI enablement with netdev_lock()
?
I wonder if it's simpler to just hold the netdev lock in resize or xsk
binding instead of this.
Thanks
>
> Thanks,
> Quang Minh.
>
On 4/8/25 14:34, Jason Wang wrote:
> On Mon, Apr 7, 2025 at 10:27 AM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>> On 4/7/25 08:03, Xuan Zhuo wrote:
>>> On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
>>>> napi_disable() on the receive queue's napi. In delayed refill_work, it
>>>> also calls napi_disable() on the receive queue's napi. This can leads to
>>>> deadlock when napi_disable() is called on an already disabled napi. This
>>>> scenario can be reproducible by binding a XDP socket to virtio-net
>>>> interface without setting up the fill ring. As a result, try_fill_recv
>>>> will fail until the fill ring is set up and refill_work is scheduled.
>>>
>>> So, what is the problem? The refill_work is waiting? As I know, that thread
>>> will sleep some time, so the cpu can do other work.
>>
>> When napi_disable is called on an already disabled napi, it will sleep
>> in napi_disable_locked while still holding the netdev_lock. As a result,
>> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
>> This leads to refill_work and the pause-then-resume tx are stuck altogether.
>
> This needs to be added to the chagelog. And it looks like this is a fix for
>
> commit 413f0271f3966e0c73d4937963f19335af19e628
> Author: Jakub Kicinski <kuba@kernel.org>
> Date: Tue Jan 14 19:53:14 2025 -0800
>
> net: protect NAPI enablement with netdev_lock()
>
> ?

I'm not aware of this, will update the fix tags in the next patch.

> I wonder if it's simpler to just hold the netdev lock in resize or xsk
> binding instead of this.

That looks cleaner, let me try that approach.

Thanks,
Quang Minh
On 4/9/25 13:44, Bui Quang Minh wrote:
> On 4/8/25 14:34, Jason Wang wrote:
>> On Mon, Apr 7, 2025 at 10:27 AM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>> On 4/7/25 08:03, Xuan Zhuo wrote:
>>>> On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>>> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
>>>>> napi_disable() on the receive queue's napi. In delayed refill_work, it
>>>>> also calls napi_disable() on the receive queue's napi. This can leads to
>>>>> deadlock when napi_disable() is called on an already disabled napi. This
>>>>> scenario can be reproducible by binding a XDP socket to virtio-net
>>>>> interface without setting up the fill ring. As a result, try_fill_recv
>>>>> will fail until the fill ring is set up and refill_work is scheduled.
>>>> So, what is the problem? The refill_work is waiting? As I know, that thread
>>>> will sleep some time, so the cpu can do other work.
>>> When napi_disable is called on an already disabled napi, it will sleep
>>> in napi_disable_locked while still holding the netdev_lock. As a result,
>>> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
>>> This leads to refill_work and the pause-then-resume tx are stuck altogether.
>> This needs to be added to the chagelog. And it looks like this is a fix for
>>
>> commit 413f0271f3966e0c73d4937963f19335af19e628
>> Author: Jakub Kicinski <kuba@kernel.org>
>> Date: Tue Jan 14 19:53:14 2025 -0800
>>
>> net: protect NAPI enablement with netdev_lock()
>>
>> ?
>
> I'm not aware of this, will update the fix tags in the next patch.
>
>> I wonder if it's simpler to just hold the netdev lock in resize or xsk
>> binding instead of this.
>
> That looks cleaner, let me try that approach.

We need to hold the netdev_lock in the refill work too. Otherwise, we
get this race:

refill_work                        xdp bind
                                   napi_disable()
netdev_lock(dev)
napi_disable_locked() <- hang
                                   napi_enable() <- cannot acquire the lock

I don't like the idea of holding netdev_lock in refill_work. I will
reply with the patch for you to review.

Thanks,
Quang Minh.
When pausing rx (e.g. setting up xdp, an xsk pool, or an rx resize), we
call napi_disable() on the receive queue's napi. The delayed
refill_work also calls napi_disable() on the receive queue's napi. When
napi_disable() is called on an already disabled napi, it will sleep in
napi_disable_locked while still holding the netdev_lock. As a result,
the later napi_enable gets stuck too as it cannot acquire the
netdev_lock. This leads to refill_work and the pause-then-resume tx
being stuck altogether.
This scenario can be reproduced by binding an XDP socket to a
virtio-net interface without setting up the fill ring. As a result,
try_fill_recv will fail until the fill ring is set up and refill_work
is scheduled.
This commit makes the rx pause path hold the netdev_lock until resume,
preventing any napi_disable() from being called on a temporarily
disabled napi.
Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()")
Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
---
drivers/net/virtio_net.c | 74 +++++++++++++++++++++++++---------------
1 file changed, 47 insertions(+), 27 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 7e4617216a4b..74bd1065c586 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2786,9 +2786,13 @@ static void skb_recv_done(struct virtqueue *rvq)
}
static void virtnet_napi_do_enable(struct virtqueue *vq,
- struct napi_struct *napi)
+ struct napi_struct *napi,
+ bool netdev_locked)
{
- napi_enable(napi);
+ if (netdev_locked)
+ napi_enable_locked(napi);
+ else
+ napi_enable(napi);
/* If all buffers were filled by other side before we napi_enabled, we
* won't get another interrupt, so process any outstanding packets now.
@@ -2799,16 +2803,16 @@ static void virtnet_napi_do_enable(struct virtqueue *vq,
local_bh_enable();
}
-static void virtnet_napi_enable(struct receive_queue *rq)
+static void virtnet_napi_enable(struct receive_queue *rq, bool netdev_locked)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
int qidx = vq2rxq(rq->vq);
- virtnet_napi_do_enable(rq->vq, &rq->napi);
+ virtnet_napi_do_enable(rq->vq, &rq->napi, netdev_locked);
netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi);
}
-static void virtnet_napi_tx_enable(struct send_queue *sq)
+static void virtnet_napi_tx_enable(struct send_queue *sq, bool netdev_locked)
{
struct virtnet_info *vi = sq->vq->vdev->priv;
struct napi_struct *napi = &sq->napi;
@@ -2825,11 +2829,11 @@ static void virtnet_napi_tx_enable(struct send_queue *sq)
return;
}
- virtnet_napi_do_enable(sq->vq, napi);
+ virtnet_napi_do_enable(sq->vq, napi, netdev_locked);
netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi);
}
-static void virtnet_napi_tx_disable(struct send_queue *sq)
+static void virtnet_napi_tx_disable(struct send_queue *sq, bool netdev_locked)
{
struct virtnet_info *vi = sq->vq->vdev->priv;
struct napi_struct *napi = &sq->napi;
@@ -2837,18 +2841,24 @@ static void virtnet_napi_tx_disable(struct send_queue *sq)
if (napi->weight) {
netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL);
- napi_disable(napi);
+ if (netdev_locked)
+ napi_disable_locked(napi);
+ else
+ napi_disable(napi);
}
}
-static void virtnet_napi_disable(struct receive_queue *rq)
+static void virtnet_napi_disable(struct receive_queue *rq, bool netdev_locked)
{
struct virtnet_info *vi = rq->vq->vdev->priv;
struct napi_struct *napi = &rq->napi;
int qidx = vq2rxq(rq->vq);
netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL);
- napi_disable(napi);
+ if (netdev_locked)
+ napi_disable_locked(napi);
+ else
+ napi_disable(napi);
}
static void refill_work(struct work_struct *work)
@@ -2875,9 +2885,11 @@ static void refill_work(struct work_struct *work)
* instance lock)
* - check netif_running() and return early to avoid a race
*/
- napi_disable(&rq->napi);
+ netdev_lock(vi->dev);
+ napi_disable_locked(&rq->napi);
still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
- virtnet_napi_do_enable(rq->vq, &rq->napi);
+ virtnet_napi_do_enable(rq->vq, &rq->napi, true);
+ netdev_unlock(vi->dev);
/* In theory, this can happen: if we don't get any buffers in
* we will *never* try to fill again.
@@ -3074,8 +3086,8 @@ static int virtnet_poll(struct napi_struct *napi, int budget)
static void virtnet_disable_queue_pair(struct virtnet_info *vi, int qp_index)
{
- virtnet_napi_tx_disable(&vi->sq[qp_index]);
- virtnet_napi_disable(&vi->rq[qp_index]);
+ virtnet_napi_tx_disable(&vi->sq[qp_index], false);
+ virtnet_napi_disable(&vi->rq[qp_index], false);
xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
}
@@ -3094,8 +3106,8 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
if (err < 0)
goto err_xdp_reg_mem_model;
- virtnet_napi_enable(&vi->rq[qp_index]);
- virtnet_napi_tx_enable(&vi->sq[qp_index]);
+ virtnet_napi_enable(&vi->rq[qp_index], false);
+ virtnet_napi_tx_enable(&vi->sq[qp_index], false);
return 0;
@@ -3347,7 +3359,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq)
bool running = netif_running(vi->dev);
if (running) {
- virtnet_napi_disable(rq);
+ netdev_lock(vi->dev);
+ virtnet_napi_disable(rq, true);
virtnet_cancel_dim(vi, &rq->dim);
}
}
@@ -3359,8 +3372,10 @@ static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq)
if (!try_fill_recv(vi, rq, GFP_KERNEL))
schedule_delayed_work(&vi->refill, 0);
- if (running)
- virtnet_napi_enable(rq);
+ if (running) {
+ virtnet_napi_enable(rq, true);
+ netdev_unlock(vi->dev);
+ }
}
static int virtnet_rx_resize(struct virtnet_info *vi,
@@ -3389,7 +3404,7 @@ static void virtnet_tx_pause(struct virtnet_info *vi, struct send_queue *sq)
qindex = sq - vi->sq;
if (running)
- virtnet_napi_tx_disable(sq);
+ virtnet_napi_tx_disable(sq, false);
txq = netdev_get_tx_queue(vi->dev, qindex);
@@ -3423,7 +3438,7 @@ static void virtnet_tx_resume(struct virtnet_info *vi, struct send_queue *sq)
__netif_tx_unlock_bh(txq);
if (running)
- virtnet_napi_tx_enable(sq);
+ virtnet_napi_tx_enable(sq, false);
}
static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue *sq,
@@ -5961,9 +5976,10 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
/* Make sure NAPI is not using any XDP TX queues for RX. */
if (netif_running(dev)) {
+ netdev_lock(dev);
for (i = 0; i < vi->max_queue_pairs; i++) {
- virtnet_napi_disable(&vi->rq[i]);
- virtnet_napi_tx_disable(&vi->sq[i]);
+ virtnet_napi_disable(&vi->rq[i], true);
+ virtnet_napi_tx_disable(&vi->sq[i], true);
}
}
@@ -6000,11 +6016,14 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
if (old_prog)
bpf_prog_put(old_prog);
if (netif_running(dev)) {
- virtnet_napi_enable(&vi->rq[i]);
- virtnet_napi_tx_enable(&vi->sq[i]);
+ virtnet_napi_enable(&vi->rq[i], true);
+ virtnet_napi_tx_enable(&vi->sq[i], true);
}
}
+ if (netif_running(dev))
+ netdev_unlock(dev);
+
return 0;
err:
@@ -6016,9 +6035,10 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog,
if (netif_running(dev)) {
for (i = 0; i < vi->max_queue_pairs; i++) {
- virtnet_napi_enable(&vi->rq[i]);
- virtnet_napi_tx_enable(&vi->sq[i]);
+ virtnet_napi_enable(&vi->rq[i], true);
+ virtnet_napi_tx_enable(&vi->sq[i], true);
}
+ netdev_unlock(dev);
}
if (prog)
bpf_prog_sub(prog, vi->max_queue_pairs - 1);
--
2.43.0
On Thu, Apr 10, 2025 at 02:05:57PM +0700, Bui Quang Minh wrote:
> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
> napi_disable() on the receive queue's napi. In delayed refill_work, it
> also calls napi_disable() on the receive queue's napi. When
> napi_disable() is called on an already disabled napi, it will sleep in
> napi_disable_locked while still holding the netdev_lock. As a result,
> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
> This leads to refill_work and the pause-then-resume tx are stuck
> altogether.
>
> This scenario can be reproducible by binding a XDP socket to virtio-net
> interface without setting up the fill ring. As a result, try_fill_recv
> will fail until the fill ring is set up and refill_work is scheduled.
>
> This commit makes the pausing rx path hold the netdev_lock until
> resuming, prevent any napi_disable() to be called on a temporarily
> disabled napi.
>
> Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()")
> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
> ---
> drivers/net/virtio_net.c | 74 +++++++++++++++++++++++++---------------
> 1 file changed, 47 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 7e4617216a4b..74bd1065c586 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2786,9 +2786,13 @@ static void skb_recv_done(struct virtqueue *rvq)
> }
>
> static void virtnet_napi_do_enable(struct virtqueue *vq,
> - struct napi_struct *napi)
> + struct napi_struct *napi,
> + bool netdev_locked)
> {
> - napi_enable(napi);
> + if (netdev_locked)
> + napi_enable_locked(napi);
> + else
> + napi_enable(napi);
>
> /* If all buffers were filled by other side before we napi_enabled, we
> * won't get another interrupt, so process any outstanding packets now.
> @@ -2799,16 +2803,16 @@ static void virtnet_napi_do_enable(struct virtqueue
> *vq,
Your patch is line-wrapped, unfortunately. Here and elsewhere.
> local_bh_enable();
> }
>
> -static void virtnet_napi_enable(struct receive_queue *rq)
> +static void virtnet_napi_enable(struct receive_queue *rq, bool
> netdev_locked)
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> int qidx = vq2rxq(rq->vq);
>
> - virtnet_napi_do_enable(rq->vq, &rq->napi);
> + virtnet_napi_do_enable(rq->vq, &rq->napi, netdev_locked);
> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi);
> }
>
> -static void virtnet_napi_tx_enable(struct send_queue *sq)
> +static void virtnet_napi_tx_enable(struct send_queue *sq, bool
> netdev_locked)
> {
> struct virtnet_info *vi = sq->vq->vdev->priv;
> struct napi_struct *napi = &sq->napi;
> @@ -2825,11 +2829,11 @@ static void virtnet_napi_tx_enable(struct send_queue
> *sq)
> return;
> }
>
> - virtnet_napi_do_enable(sq->vq, napi);
> + virtnet_napi_do_enable(sq->vq, napi, netdev_locked);
> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi);
> }
>
> -static void virtnet_napi_tx_disable(struct send_queue *sq)
> +static void virtnet_napi_tx_disable(struct send_queue *sq, bool
> netdev_locked)
> {
> struct virtnet_info *vi = sq->vq->vdev->priv;
> struct napi_struct *napi = &sq->napi;
> @@ -2837,18 +2841,24 @@ static void virtnet_napi_tx_disable(struct
> send_queue *sq)
>
> if (napi->weight) {
> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL);
> - napi_disable(napi);
> + if (netdev_locked)
> + napi_disable_locked(napi);
> + else
> + napi_disable(napi);
> }
> }
>
> -static void virtnet_napi_disable(struct receive_queue *rq)
> +static void virtnet_napi_disable(struct receive_queue *rq, bool
> netdev_locked)
> {
> struct virtnet_info *vi = rq->vq->vdev->priv;
> struct napi_struct *napi = &rq->napi;
> int qidx = vq2rxq(rq->vq);
>
> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL);
> - napi_disable(napi);
> + if (netdev_locked)
> + napi_disable_locked(napi);
> + else
> + napi_disable(napi);
> }
>
> static void refill_work(struct work_struct *work)
> @@ -2875,9 +2885,11 @@ static void refill_work(struct work_struct *work)
> * instance lock)
> * - check netif_running() and return early to avoid a race
> */
> - napi_disable(&rq->napi);
> + netdev_lock(vi->dev);
> + napi_disable_locked(&rq->napi);
> still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
This does mean netdev_lock is held potentially for a long while,
while try_fill_recv and processing inside virtnet_napi_do_enable
finish. Better ideas?
> - virtnet_napi_do_enable(rq->vq, &rq->napi);
> + virtnet_napi_do_enable(rq->vq, &rq->napi, true);
> + netdev_unlock(vi->dev);
>
> /* In theory, this can happen: if we don't get any buffers in
> * we will *never* try to fill again.
> @@ -3074,8 +3086,8 @@ static int virtnet_poll(struct napi_struct *napi, int
> budget)
>
> static void virtnet_disable_queue_pair(struct virtnet_info *vi, int
> qp_index)
> {
> - virtnet_napi_tx_disable(&vi->sq[qp_index]);
> - virtnet_napi_disable(&vi->rq[qp_index]);
> + virtnet_napi_tx_disable(&vi->sq[qp_index], false);
> + virtnet_napi_disable(&vi->rq[qp_index], false);
> xdp_rxq_info_unreg(&vi->rq[qp_index].xdp_rxq);
> }
>
> @@ -3094,8 +3106,8 @@ static int virtnet_enable_queue_pair(struct
> virtnet_info *vi, int qp_index)
> if (err < 0)
> goto err_xdp_reg_mem_model;
>
> - virtnet_napi_enable(&vi->rq[qp_index]);
> - virtnet_napi_tx_enable(&vi->sq[qp_index]);
> + virtnet_napi_enable(&vi->rq[qp_index], false);
> + virtnet_napi_tx_enable(&vi->sq[qp_index], false);
>
> return 0;
>
> @@ -3347,7 +3359,8 @@ static void virtnet_rx_pause(struct virtnet_info *vi,
> struct receive_queue *rq)
> bool running = netif_running(vi->dev);
>
> if (running) {
> - virtnet_napi_disable(rq);
> + netdev_lock(vi->dev);
> + virtnet_napi_disable(rq, true);
> virtnet_cancel_dim(vi, &rq->dim);
> }
> }
> @@ -3359,8 +3372,10 @@ static void virtnet_rx_resume(struct virtnet_info
> *vi, struct receive_queue *rq)
> if (!try_fill_recv(vi, rq, GFP_KERNEL))
> schedule_delayed_work(&vi->refill, 0);
>
> - if (running)
> - virtnet_napi_enable(rq);
> + if (running) {
> + virtnet_napi_enable(rq, true);
> + netdev_unlock(vi->dev);
> + }
> }
>
> static int virtnet_rx_resize(struct virtnet_info *vi,
> @@ -3389,7 +3404,7 @@ static void virtnet_tx_pause(struct virtnet_info *vi,
> struct send_queue *sq)
> qindex = sq - vi->sq;
>
> if (running)
> - virtnet_napi_tx_disable(sq);
> + virtnet_napi_tx_disable(sq, false);
>
> txq = netdev_get_tx_queue(vi->dev, qindex);
>
> @@ -3423,7 +3438,7 @@ static void virtnet_tx_resume(struct virtnet_info *vi,
> struct send_queue *sq)
> __netif_tx_unlock_bh(txq);
>
> if (running)
> - virtnet_napi_tx_enable(sq);
> + virtnet_napi_tx_enable(sq, false);
> }
>
> static int virtnet_tx_resize(struct virtnet_info *vi, struct send_queue
> *sq,
> @@ -5961,9 +5976,10 @@ static int virtnet_xdp_set(struct net_device *dev,
> struct bpf_prog *prog,
>
> /* Make sure NAPI is not using any XDP TX queues for RX. */
> if (netif_running(dev)) {
> + netdev_lock(dev);
> for (i = 0; i < vi->max_queue_pairs; i++) {
> - virtnet_napi_disable(&vi->rq[i]);
> - virtnet_napi_tx_disable(&vi->sq[i]);
> + virtnet_napi_disable(&vi->rq[i], true);
> + virtnet_napi_tx_disable(&vi->sq[i], true);
> }
> }
>
> @@ -6000,11 +6016,14 @@ static int virtnet_xdp_set(struct net_device *dev,
> struct bpf_prog *prog,
> if (old_prog)
> bpf_prog_put(old_prog);
> if (netif_running(dev)) {
> - virtnet_napi_enable(&vi->rq[i]);
> - virtnet_napi_tx_enable(&vi->sq[i]);
> + virtnet_napi_enable(&vi->rq[i], true);
> + virtnet_napi_tx_enable(&vi->sq[i], true);
> }
> }
>
> + if (netif_running(dev))
> + netdev_unlock(dev);
> +
> return 0;
>
> err:
> @@ -6016,9 +6035,10 @@ static int virtnet_xdp_set(struct net_device *dev,
> struct bpf_prog *prog,
>
> if (netif_running(dev)) {
> for (i = 0; i < vi->max_queue_pairs; i++) {
> - virtnet_napi_enable(&vi->rq[i]);
> - virtnet_napi_tx_enable(&vi->sq[i]);
> + virtnet_napi_enable(&vi->rq[i], true);
> + virtnet_napi_tx_enable(&vi->sq[i], true);
> }
> + netdev_unlock(dev);
> }
> if (prog)
> bpf_prog_sub(prog, vi->max_queue_pairs - 1);
> --
> 2.43.0
>
On 4/10/25 14:58, Michael S. Tsirkin wrote:
> On Thu, Apr 10, 2025 at 02:05:57PM +0700, Bui Quang Minh wrote:
>> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
>> napi_disable() on the receive queue's napi. In delayed refill_work, it
>> also calls napi_disable() on the receive queue's napi. When
>> napi_disable() is called on an already disabled napi, it will sleep in
>> napi_disable_locked while still holding the netdev_lock. As a result,
>> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
>> This leads to refill_work and the pause-then-resume tx are stuck
>> altogether.
>>
>> This scenario can be reproducible by binding a XDP socket to virtio-net
>> interface without setting up the fill ring. As a result, try_fill_recv
>> will fail until the fill ring is set up and refill_work is scheduled.
>>
>> This commit makes the pausing rx path hold the netdev_lock until
>> resuming, prevent any napi_disable() to be called on a temporarily
>> disabled napi.
>>
>> Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()")
>> Signed-off-by: Bui Quang Minh <minhquangbui99@gmail.com>
>> ---
>> drivers/net/virtio_net.c | 74 +++++++++++++++++++++++++---------------
>> 1 file changed, 47 insertions(+), 27 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 7e4617216a4b..74bd1065c586 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -2786,9 +2786,13 @@ static void skb_recv_done(struct virtqueue *rvq)
>> }
>>
>> static void virtnet_napi_do_enable(struct virtqueue *vq,
>> - struct napi_struct *napi)
>> + struct napi_struct *napi,
>> + bool netdev_locked)
>> {
>> - napi_enable(napi);
>> + if (netdev_locked)
>> + napi_enable_locked(napi);
>> + else
>> + napi_enable(napi);
>>
>> /* If all buffers were filled by other side before we napi_enabled, we
>> * won't get another interrupt, so process any outstanding packets now.
>> @@ -2799,16 +2803,16 @@ static void virtnet_napi_do_enable(struct virtqueue
>> *vq,
>
>
>
> Your patch is line-wrapped, unfortunately. Here and elsewhere.
>
>
>
>
>> local_bh_enable();
>> }
>>
>> -static void virtnet_napi_enable(struct receive_queue *rq)
>> +static void virtnet_napi_enable(struct receive_queue *rq, bool
>> netdev_locked)
>> {
>> struct virtnet_info *vi = rq->vq->vdev->priv;
>> int qidx = vq2rxq(rq->vq);
>>
>> - virtnet_napi_do_enable(rq->vq, &rq->napi);
>> + virtnet_napi_do_enable(rq->vq, &rq->napi, netdev_locked);
>> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, &rq->napi);
>> }
>>
>> -static void virtnet_napi_tx_enable(struct send_queue *sq)
>> +static void virtnet_napi_tx_enable(struct send_queue *sq, bool
>> netdev_locked)
>> {
>> struct virtnet_info *vi = sq->vq->vdev->priv;
>> struct napi_struct *napi = &sq->napi;
>> @@ -2825,11 +2829,11 @@ static void virtnet_napi_tx_enable(struct send_queue
>> *sq)
>> return;
>> }
>>
>> - virtnet_napi_do_enable(sq->vq, napi);
>> + virtnet_napi_do_enable(sq->vq, napi, netdev_locked);
>> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, napi);
>> }
>>
>> -static void virtnet_napi_tx_disable(struct send_queue *sq)
>> +static void virtnet_napi_tx_disable(struct send_queue *sq, bool
>> netdev_locked)
>> {
>> struct virtnet_info *vi = sq->vq->vdev->priv;
>> struct napi_struct *napi = &sq->napi;
>> @@ -2837,18 +2841,24 @@ static void virtnet_napi_tx_disable(struct
>> send_queue *sq)
>>
>> if (napi->weight) {
>> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_TX, NULL);
>> - napi_disable(napi);
>> + if (netdev_locked)
>> + napi_disable_locked(napi);
>> + else
>> + napi_disable(napi);
>> }
>> }
>>
>> -static void virtnet_napi_disable(struct receive_queue *rq)
>> +static void virtnet_napi_disable(struct receive_queue *rq, bool
>> netdev_locked)
>> {
>> struct virtnet_info *vi = rq->vq->vdev->priv;
>> struct napi_struct *napi = &rq->napi;
>> int qidx = vq2rxq(rq->vq);
>>
>> netif_queue_set_napi(vi->dev, qidx, NETDEV_QUEUE_TYPE_RX, NULL);
>> - napi_disable(napi);
>> + if (netdev_locked)
>> + napi_disable_locked(napi);
>> + else
>> + napi_disable(napi);
>> }
>>
>> static void refill_work(struct work_struct *work)
>> @@ -2875,9 +2885,11 @@ static void refill_work(struct work_struct *work)
>> * instance lock)
>> * - check netif_running() and return early to avoid a race
>> */
>> - napi_disable(&rq->napi);
>> + netdev_lock(vi->dev);
>> + napi_disable_locked(&rq->napi);
>> still_empty = !try_fill_recv(vi, rq, GFP_KERNEL);
>
> This does mean netdev_lock is held potentially for a long while,
> while try_fill_recv and processing inside virtnet_napi_do_enable
> finish. Better ideas?
I prefer the first patch in this thread where we disable delayed refill
and cancel all inflight refill_work before pausing rx.
Thanks,
Quang Minh.
On 4/8/25 9:34 AM, Jason Wang wrote:
> On Mon, Apr 7, 2025 at 10:27 AM Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>> On 4/7/25 08:03, Xuan Zhuo wrote:
>>> On Fri, 4 Apr 2025 16:39:03 +0700, Bui Quang Minh <minhquangbui99@gmail.com> wrote:
>>>> When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call
>>>> napi_disable() on the receive queue's napi. In delayed refill_work, it
>>>> also calls napi_disable() on the receive queue's napi. This can leads to
>>>> deadlock when napi_disable() is called on an already disabled napi. This
>>>> scenario can be reproducible by binding a XDP socket to virtio-net
>>>> interface without setting up the fill ring. As a result, try_fill_recv
>>>> will fail until the fill ring is set up and refill_work is scheduled.
>>>
>>> So, what is the problem? The refill_work is waiting? As I know, that thread
>>> will sleep some time, so the cpu can do other work.
>>
>> When napi_disable is called on an already disabled napi, it will sleep
>> in napi_disable_locked while still holding the netdev_lock. As a result,
>> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
>> This leads to refill_work and the pause-then-resume tx are stuck altogether.
>
> This needs to be added to the chagelog. And it looks like this is a fix for
>
> commit 413f0271f3966e0c73d4937963f19335af19e628
> Author: Jakub Kicinski <kuba@kernel.org>
> Date: Tue Jan 14 19:53:14 2025 -0800
>
> net: protect NAPI enablement with netdev_lock()
>
> ?
>
> I wonder if it's simpler to just hold the netdev lock in resize or xsk
> binding instead of this.

Setting:

	dev->request_ops_lock = true;

in virtnet_probe() before calling register_netdevice() should achieve
the above. Could you please have a try?

Thanks,

Paolo
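For illustration only, an untested sketch of what that suggestion would look
like in the driver (placement within virtnet_probe() is approximate):

/* Untested sketch of Paolo's suggestion, not a submitted patch.
 * Opting into the ops-locked path asks the core to take netdev_lock
 * around ndo/queue/NAPI operations for this device, so the driver no
 * longer has to choose between napi_disable() and napi_disable_locked().
 */
static int virtnet_probe(struct virtio_device *vdev)
{
	struct net_device *dev;
	int err;

	/* ... existing allocation and feature negotiation ... */

	dev->request_ops_lock = true;	/* must be set before registration */

	rtnl_lock();
	err = register_netdevice(dev);
	rtnl_unlock();

	/* ... */
	return err;
}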
On Tue, 8 Apr 2025 11:28:54 +0200 Paolo Abeni wrote:
> >> When napi_disable is called on an already disabled napi, it will sleep
> >> in napi_disable_locked while still holding the netdev_lock. As a result,
> >> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
> >> This leads to refill_work and the pause-then-resume tx are stuck altogether.
> >
> > This needs to be added to the chagelog. And it looks like this is a fix for
> >
> > commit 413f0271f3966e0c73d4937963f19335af19e628
> > Author: Jakub Kicinski <kuba@kernel.org>
> > Date: Tue Jan 14 19:53:14 2025 -0800
> >
> > net: protect NAPI enablement with netdev_lock()
> >
> > ?
> >
> > I wonder if it's simpler to just hold the netdev lock in resize or xsk
> > binding instead of this.
>
> Setting:
>
> dev->request_ops_lock = true;
>
> in virtnet_probe() before calling register_netdevice() should achieve
> the above. Could you please have a try?

Can we do that or do we need a more tailored fix? request_ops_lock only
appeared in 6.15 and the bug AFAIU dates back to 6.14. We don't
normally worry about that given the stream of fixes - request_ops_lock
is a bit risky. Jason's suggestion AFAIU is just to wrap the
disable/enable pairs in the lock. We can try request_ops_lock in -next?

Bui Quang Minh, could you add a selftest for this problem?
See tools/testing/selftests/drivers/net/virtio_net/
You can re-use / extend the XSK helper from
tools/testing/selftests/drivers/net/xdp_helper.c ?
(move it to tools/testing/selftests/net/lib for easier access)
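As a starting point for such a selftest, the trigger is an AF_XDP socket
bound in zero-copy mode to an rx queue whose fill ring never receives any
buffers. A rough, untested userspace sketch of that core follows (error
handling omitted, sizes arbitrary; the in-tree xdp_helper.c mentioned above
already does most of this setup):

/* Rough, untested reproducer sketch: bind an AF_XDP socket to rx queue 0
 * of the given interface in zero-copy mode but never post buffers to the
 * fill ring. try_fill_recv() then keeps failing, refill_work stays
 * pending, and a later unbind (socket close) can race with it.
 */
#include <linux/if_xdp.h>
#include <net/if.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	int fd, ring_entries = 64;
	size_t umem_len = 64 * 4096;
	void *umem;
	struct xdp_umem_reg mr = { .len = umem_len, .chunk_size = 4096 };
	struct sockaddr_xdp sxdp = {
		.sxdp_family = AF_XDP,
		.sxdp_flags = XDP_ZEROCOPY,	/* exercise the driver's pause/resume path */
		.sxdp_queue_id = 0,
	};

	if (argc < 2)
		return 1;
	sxdp.sxdp_ifindex = if_nametoindex(argv[1]);

	fd = socket(AF_XDP, SOCK_RAW, 0);
	umem = mmap(NULL, umem_len, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	mr.addr = (unsigned long long)umem;

	setsockopt(fd, SOL_XDP, XDP_UMEM_REG, &mr, sizeof(mr));
	setsockopt(fd, SOL_XDP, XDP_UMEM_FILL_RING, &ring_entries, sizeof(ring_entries));
	setsockopt(fd, SOL_XDP, XDP_UMEM_COMPLETION_RING, &ring_entries, sizeof(ring_entries));
	setsockopt(fd, SOL_XDP, XDP_RX_RING, &ring_entries, sizeof(ring_entries));

	/* The fill ring exists but is deliberately left empty. */
	bind(fd, (struct sockaddr *)&sxdp, sizeof(sxdp));

	pause();	/* keep the socket bound; killing the process unbinds it */
	close(fd);
	return 0;
}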
On 4/8/25 21:37, Jakub Kicinski wrote:
> On Tue, 8 Apr 2025 11:28:54 +0200 Paolo Abeni wrote:
>>>> When napi_disable is called on an already disabled napi, it will sleep
>>>> in napi_disable_locked while still holding the netdev_lock. As a result,
>>>> later napi_enable gets stuck too as it cannot acquire the netdev_lock.
>>>> This leads to refill_work and the pause-then-resume tx are stuck altogether.
>>> This needs to be added to the chagelog. And it looks like this is a fix for
>>>
>>> commit 413f0271f3966e0c73d4937963f19335af19e628
>>> Author: Jakub Kicinski <kuba@kernel.org>
>>> Date: Tue Jan 14 19:53:14 2025 -0800
>>>
>>> net: protect NAPI enablement with netdev_lock()
>>>
>>> ?
>>>
>>> I wonder if it's simpler to just hold the netdev lock in resize or xsk
>>> binding instead of this.
>> Setting:
>>
>> dev->request_ops_lock = true;
>>
>> in virtnet_probe() before calling register_netdevice() should achieve
>> the above. Could you please have a try?
> Can we do that or do we need a more tailored fix? request_ops_lock only
> appeared in 6.15 and the bug AFAIU dates back to 6.14. We don't normally
> worry about given the stream of fixes - request_ops_lock is a bit risky.
> Jason's suggestion AFAIU is just to wrap the disable/enable pairs in
> the lock. We can try request_ops_lock in -next ?
>
> Bui Quang Minh, could you add a selftest for this problem?
> See tools/testing/selftests/drivers/net/virtio_net/
> You can re-use / extend the XSK helper from
> tools/testing/selftests/drivers/net/xdp_helper.c ?
> (move it to tools/testing/selftests/net/lib for easier access)
I've just tried the current selftests and found that queues.py can
trigger the deadlock.
Running this a few times (the probability is quite high) on a guest
with a virtio-net interface:
~ NETIF=enp0s2 ./ksft-net-drv/run_kselftest.sh -t drivers/net:queues.py
The test hangs.
I've got the following hung task warning:
[ 363.841549] #0: ffff88800015c758 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x1a9/0x3c9
[ 363.841560] #1: ffffc90000043e28 ((work_completion)(&pool->work)){+.+.}-{0:0}, at: process_scheduled_works+0x1c2/0x3c9
[ 363.841570] #2: ffffffff81f5a150 (rtnl_mutex){+.+.}-{4:4}, at: rtnl_lock+0x1b/0x1d
[ 363.841579] #3: ffff8880035e49f0 (&dev->lock){+.+.}-{4:4}, at: netdev_lock+0x12/0x14
[ 363.841587] 3 locks held by kworker/2:0/26:
[ 363.841589] #0: ffff88800015c758 ((wq_completion)events){+.+.}-{0:0}, at: process_scheduled_works+0x1a9/0x3c9
[ 363.841598] #1: ffffc900000f7e28 ((work_completion)(&(&vi->refill)->work)){+.+.}-{0:0}, at: process_scheduled_works+0x1c2/0x3c9
[ 363.841608] #2: ffff8880035e49f0 (&dev->lock){+.+.}-{4:4}, at: netdev_lock+0x12/0x14
~ cat /proc/7/stack
[<0>] netdev_lock+0x12/0x14
[<0>] napi_enable+0x1a/0x35
[<0>] virtnet_napi_do_enable+0x1a/0x40
[<0>] virtnet_napi_enable+0x32/0x4a
[<0>] virtnet_rx_resume+0x4f/0x56
[<0>] virtnet_rq_bind_xsk_pool+0xd8/0xfd
[<0>] virtnet_xdp+0x6d3/0x72d
[<0>] xp_disable_drv_zc+0x5a/0x63
[<0>] xp_clear_dev+0x1d/0x4a
[<0>] xp_release_deferred+0x24/0x75
[<0>] process_scheduled_works+0x23e/0x3c9
[<0>] worker_thread+0x13f/0x1ca
[<0>] kthread+0x1b2/0x1ba
[<0>] ret_from_fork+0x2b/0x40
[<0>] ret_from_fork_asm+0x11/0x20
~ cat /proc/26/stack
[<0>] napi_disable_locked+0x41/0xaf
[<0>] napi_disable+0x22/0x35
[<0>] refill_work+0x4b/0x8f
[<0>] process_scheduled_works+0x23e/0x3c9
[<0>] worker_thread+0x13f/0x1ca
[<0>] kthread+0x1b2/0x1ba
[<0>] ret_from_fork+0x2b/0x40
[<0>] ret_from_fork_asm+0x11/0x20
The deadlock was caused by a race between xsk unbind and refill_work.
Thanks,
Quang Minh.
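Putting the two stacks above side by side, the timeline looks roughly like
this (reconstructed for clarity, not taken from the original report):

xsk unbind (pid 7)                          refill_work (kworker/2:0, pid 26)
virtnet_rq_bind_xsk_pool()
  virtnet_rx_pause()
    napi_disable()      /* rq napi now disabled */
                                            napi_disable()
                                              netdev_lock(dev)  /* takes dev->lock */
                                              napi_disable_locked() <- hangs, napi already disabled
  virtnet_rx_resume()
    napi_enable()        <- blocks on dev->lock held by refill_work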