drivers/net/netdevsim/bus.c | 3 ++ drivers/net/netdevsim/netdev.c | 63 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-)
Add a flow control mechanism between paired netdevsim devices to stop the
TX queue during high traffic scenarios. When a receive queue becomes
congested (approaching the NSIM_RING_SIZE limit), the corresponding transmit
queue on the peer device is stopped using netif_subqueue_try_stop().
Once the receive queue has sufficient capacity again, the peer's
transmit queue is resumed with netif_tx_wake_queue().
Key changes:
* Add nsim_stop_peer_tx_queue() to pause peer TX when RX queue is full
* Add nsim_start_peer_tx_queue() to resume peer TX when RX queue drains
* Implement queue mapping validation to ensure the TX/RX queue counts match
* Wake all queues during device unlinking to prevent stuck queues
* Use RCU protection when accessing peer device references
The flow control only activates when devices have matching TX/RX queue
counts to ensure proper queue mapping.
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes in v2:
- Move the RCU locks inside the function (David)
- Use better helpers for waking up all the queues (Jakub)
- Link to v1: https://lore.kernel.org/r/20250701-netdev_flow_control-v1-1-240329fc91b1@debian.org
---
drivers/net/netdevsim/bus.c | 3 ++
drivers/net/netdevsim/netdev.c | 63 ++++++++++++++++++++++++++++++++++++++++--
2 files changed, 64 insertions(+), 2 deletions(-)
diff --git a/drivers/net/netdevsim/bus.c b/drivers/net/netdevsim/bus.c
index 64c0cdd31bf85..1ba52471f3fbc 100644
--- a/drivers/net/netdevsim/bus.c
+++ b/drivers/net/netdevsim/bus.c
@@ -366,6 +366,9 @@ static ssize_t unlink_device_store(const struct bus_type *bus, const char *buf,
err = 0;
RCU_INIT_POINTER(nsim->peer, NULL);
RCU_INIT_POINTER(peer->peer, NULL);
+ synchronize_net();
+ netif_tx_wake_all_queues(dev);
+ netif_tx_wake_all_queues(peer->netdev);
out_put_netns:
put_net(ns);
diff --git a/drivers/net/netdevsim/netdev.c b/drivers/net/netdevsim/netdev.c
index e36d3e846c2dc..b5b13fba6450d 100644
--- a/drivers/net/netdevsim/netdev.c
+++ b/drivers/net/netdevsim/netdev.c
@@ -37,9 +37,67 @@ MODULE_IMPORT_NS("NETDEV_INTERNAL");
#define NSIM_RING_SIZE 256
-static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
+static void nsim_start_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq)
+{
+ struct netdevsim *ns = netdev_priv(dev);
+ struct net_device *peer_dev;
+ struct netdevsim *peer_ns;
+ struct netdev_queue *txq;
+ u16 idx;
+
+ idx = rq->napi.index;
+ rcu_read_lock();
+ peer_ns = rcu_dereference(ns->peer);
+ if (!peer_ns)
+ goto out;
+
+ /* TX device */
+ peer_dev = peer_ns->netdev;
+ if (dev->real_num_tx_queues != peer_dev->num_rx_queues)
+ goto out;
+
+ txq = netdev_get_tx_queue(peer_dev, idx);
+ if (!(netif_tx_queue_stopped(txq)))
+ goto out;
+
+ netif_tx_wake_queue(txq);
+out:
+ rcu_read_unlock();
+}
+
+static void nsim_stop_peer_tx_queue(struct net_device *dev, struct nsim_rq *rq,
+ u16 idx)
+{
+ struct netdevsim *ns = netdev_priv(dev);
+ struct net_device *peer_dev;
+ struct netdevsim *peer_ns;
+
+ rcu_read_lock();
+ peer_ns = rcu_dereference(ns->peer);
+ if (!peer_ns)
+ goto out;
+
+ /* TX device */
+ peer_dev = peer_ns->netdev;
+
+ /* If different queues size, do not stop, since it is not
+ * easy to find which TX queue is mapped here
+ */
+ if (dev->real_num_tx_queues != peer_dev->num_rx_queues)
+ goto out;
+
+ netif_subqueue_try_stop(peer_dev, idx,
+ NSIM_RING_SIZE - skb_queue_len(&rq->skb_queue),
+ NSIM_RING_SIZE / 2);
+out:
+ rcu_read_unlock();
+}
+
+static int nsim_napi_rx(struct net_device *dev, struct nsim_rq *rq,
+ struct sk_buff *skb)
{
if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) {
+ nsim_stop_peer_tx_queue(dev, rq, skb_get_queue_mapping(skb));
dev_kfree_skb_any(skb);
return NET_RX_DROP;
}
@@ -51,7 +109,7 @@ static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb)
static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb,
struct nsim_rq *rq)
{
- return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb);
+ return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(dev, rq, skb);
}
static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -351,6 +409,7 @@ static int nsim_rcv(struct nsim_rq *rq, int budget)
dev_dstats_rx_dropped(dev);
}
+ nsim_start_peer_tx_queue(dev, rq);
return i;
}
---
base-commit: be4ea6c336b9a4fc1cc4be1c0549b24d0e687488
change-id: 20250630-netdev_flow_control-2b2d37965377
Best regards,
--
Breno Leitao <leitao@debian.org>
On Thu, 03 Jul 2025 06:09:31 -0700 Breno Leitao wrote: > +static int nsim_napi_rx(struct net_device *dev, struct nsim_rq *rq, > + struct sk_buff *skb) > { > if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { > + nsim_stop_peer_tx_queue(dev, rq, skb_get_queue_mapping(skb)); > dev_kfree_skb_any(skb); > return NET_RX_DROP; > } we should probably add: if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) nsim_stop_tx_queue(dev, rq, skb_get_queue_mapping(skb)); after enqueuing the skb, so that we stop the queue before any drops happen > @@ -51,7 +109,7 @@ static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb) > static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, > struct nsim_rq *rq) > { > - return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb); > + return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(dev, rq, skb); > } > > static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) nsim_start_xmit() has both dev and peer_dev, pass them all the way to nsim_stop_peer_tx_queue() so that you don't have to try to dereference the peer again. > + if (dev->real_num_tx_queues != peer_dev->num_rx_queues) given that we compare real_num_tx_queues I think we should also kick the queues in nsim_set_channels(), like we do in unlink_device_store() -- pw-bot: cr
Hello Jakub, On Tue, Jul 08, 2025 at 06:27:18PM -0700, Jakub Kicinski wrote: > On Thu, 03 Jul 2025 06:09:31 -0700 Breno Leitao wrote: > > +static int nsim_napi_rx(struct net_device *dev, struct nsim_rq *rq, > > + struct sk_buff *skb) > > { > > if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { > > + nsim_stop_peer_tx_queue(dev, rq, skb_get_queue_mapping(skb)); > > dev_kfree_skb_any(skb); > > return NET_RX_DROP; > > } > > we should probably add: > > if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) > nsim_stop_tx_queue(dev, rq, skb_get_queue_mapping(skb)); > > after enqueuing the skb, so that we stop the queue before any drops > happen Agree, we can stop the queue when queueing the packets instead. Since we need to check for the queue numbers, we cannot call nsim_stop_tx_queue() straight away. I think we still need to have a helper (nsim_stop_tx_queue). This is what I have in mind: static void nsim_stop_tx_queue(struct net_device *tx_dev, struct net_device *rx_dev, struct nsim_rq *rq, u16 idx) { /* If different queues size, do not stop, since it is not * easy to find which TX queue is mapped here */ if (rx_dev->real_num_tx_queues != tx_dev->num_rx_queues) return; /* rq is the queue on the receive side */ netif_subqueue_try_stop(tx_dev, idx, NSIM_RING_SIZE - skb_queue_len(&rq->skb_queue), NSIM_RING_SIZE / 2); } static int nsim_napi_rx(struct net_device *tx_dev, struct net_device *rx_dev, struct nsim_rq *rq, struct sk_buff *skb) { if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { dev_kfree_skb_any(skb); return NET_RX_DROP; } skb_queue_tail(&rq->skb_queue, skb); /* Stop the peer TX queue avoiding dropping packets later */ if (skb_queue_len(&rq->skb_queue) >= NSIM_RING_SIZE) nsim_stop_tx_queue(tx_dev, rx_dev, rq, skb_get_queue_mapping(skb)); return NET_RX_SUCCESS; } > > @@ -51,7 +109,7 @@ static int nsim_napi_rx(struct nsim_rq *rq, struct sk_buff *skb) > > static int nsim_forward_skb(struct net_device *dev, struct sk_buff *skb, > > struct nsim_rq *rq) > > { > > 
- return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(rq, skb); > > + return __dev_forward_skb(dev, skb) ?: nsim_napi_rx(dev, rq, skb); > > } > > > > static netdev_tx_t nsim_start_xmit(struct sk_buff *skb, struct net_device *dev) > > nsim_start_xmit() has both dev and peer_dev, pass them all the way to > nsim_stop_peer_tx_queue() so that you don't have to try to dereference > the peer again. Sure. This is a good idea. I am using it, as you can see in the snippet above. > > + if (dev->real_num_tx_queues != peer_dev->num_rx_queues) > > given that we compare real_num_tx_queues I think we should also kick > the queues in nsim_set_channels(), like we do in unlink_device_store() Sure. I suppose something like the following. What do you think? nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) { struct netdevsim *ns = netdev_priv(dev); + struct netdevsim *peer; int err; err = netif_set_real_num_queues(dev, ch->combined_count, @@ -113,6 +114,14 @@ nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) return err; ns->ethtool.channels = ch->combined_count; + + synchronize_net(); + netif_tx_wake_all_queues(dev); + rcu_read_lock(); + peer = rcu_dereference(ns->peer); + if (peer) + netif_tx_wake_all_queues(peer->netdev); + rcu_read_unlock(); + return 0; } Also, with this patch, we will eventually get the following critical message: net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); I am wondering if that alert is not valid anymore, and I can simply remove it. Thanks for your review! --breno
On Wed, 9 Jul 2025 03:34:20 -0700 Breno Leitao wrote: > On Tue, Jul 08, 2025 at 06:27:18PM -0700, Jakub Kicinski wrote: > > On Thu, 03 Jul 2025 06:09:31 -0700 Breno Leitao wrote: > > > +static int nsim_napi_rx(struct net_device *dev, struct nsim_rq *rq, > > > + struct sk_buff *skb) > > > { > > > if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) { > > > + nsim_stop_peer_tx_queue(dev, rq, skb_get_queue_mapping(skb)); > > > dev_kfree_skb_any(skb); > > > return NET_RX_DROP; > > > } > > > > we should probably add: > > > > if (skb_queue_len(&rq->skb_queue) > NSIM_RING_SIZE) > > nsim_stop_tx_queue(dev, rq, skb_get_queue_mapping(skb)); > > > > after enqueuing the skb, so that we stop the queue before any drops > > happen > > Agree, we can stop the queue when queueing the packets instead. Since we > need to check for the queue numbers, we cannot call nsim_stop_tx_queue() > straight away. I think we still need to have a helper > (nsim_stop_tx_queue). This is what I have in mind: LGTM! > > > + if (dev->real_num_tx_queues != peer_dev->num_rx_queues) > > > > given that we compare real_num_tx_queues I think we should also kick > > the queues in nsim_set_channels(), like we do in unlink_device_store() > > Sure. I suppose something like the following. What do you think? > > nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) > { > struct netdevsim *ns = netdev_priv(dev); > + struct netdevsim *peer; > int err; > > err = netif_set_real_num_queues(dev, ch->combined_count, > @@ -113,6 +114,14 @@ nsim_set_channels(struct net_device *dev, struct ethtool_channels *ch) > return err; > > ns->ethtool.channels = ch->combined_count; > + > + synchronize_net(); > + netif_tx_wake_all_queues(dev); > + rcu_read_lock(); > + peer = rcu_dereference(ns->peer); > + if (peer) > + netif_tx_wake_all_queues(peer->netdev); > + rcu_read_unlock(); That's sufficiently orthogonal to warrant a dedicated function / helper. In terms of code I think we can skip the whole dance if peer is NULL? 
> return 0; > } > > > Also, with this patch, we will eventually get the following critical > message: > > net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); > > I am wondering if that alert is not valid anymore, and I can simply > remove it. Ah. In nsim_setup() we should remove IFF_NO_QUEUE and stop setting tx_queue_len to 0
On Wed, Jul 09, 2025 at 02:36:27PM -0700, Jakub Kicinski wrote: > On Wed, 9 Jul 2025 03:34:20 -0700 Breno Leitao wrote: > > + > > + synchronize_net(); > > + netif_tx_wake_all_queues(dev); > > + rcu_read_lock(); > > + peer = rcu_dereference(ns->peer); > > + if (peer) > > + netif_tx_wake_all_queues(peer->netdev); > > + rcu_read_unlock(); > > That's sufficiently orthogonal to warrant a dedicated function / helper. > > In terms of code I think we can skip the whole dance if peer is NULL? Sure. We can use rcu_access_pointer() to check if the value is set, and then get into the slow path. if (rcu_access_pointer(ns->peer)) nsim_wake_queues(dev); > > Also, with this patch, we will eventually get the following critical > > message: > > > > net_crit_ratelimited("Virtual device %s asks to queue packet!\n", dev->name); > > > > I am wondering if that alert is not valid anymore, and I can simply > > remove it. > > Ah. In nsim_setup() we should remove IFF_NO_QUEUE and stop setting > tx_queue_len to 0 That makes sense, thanks! --breno
© 2016 - 2025 Red Hat, Inc.