[net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver

Lukasz Majewski posted 7 patches 1 month, 1 week ago
[net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Lukasz Majewski 1 month, 1 week ago
This patch provides callbacks for struct net_device_ops for MTIP
L2 switch.

Signed-off-by: Lukasz Majewski <lukasz.majewski@mailbox.org>

---
Changes for v13:
- New patch - created by excluding some code from large (i.e. v12 and
  earlier) MTIP driver

Changes for v14:
- Add read memory barier (rmb) before reading current descriptor
- Use proper locking primitives

Changes for v15 - v15:
- None

Changes for v16:
- Enable MTIP ports to support bridge offloading
- Use dev_err_ratelimited() instead of plain dev_err()
- Move skb storage and tx ring buffer modifications after
  dma mapping code.
- Do not increase tx_errors when frames are dropped after
  failed dma_mapping.
- Refactor the code for better readability
- Remove legacy call to netif_trans_update()
- Remove not needed rmb() - synchronized data read already assured by
  coherent DMA allocation
- Replace spin_{un}lock() with _bh variant

Changes for v17:
- Add missing _bh() variant of spin_unlock
- Avoid reverse christmas tree in swap_buffer()
- Print error message after unlock
- Add DO_ONCE() and a separate function to print state of switch HW
- Remove dev->stats.tx_errors++

Changes for v18 - v19:
- None
---
 .../net/ethernet/freescale/mtipsw/mtipl2sw.c  | 284 ++++++++++++++++++
 1 file changed, 284 insertions(+)

diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
index 61380f09e2f1..70c624339cc9 100644
--- a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
+++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
@@ -43,6 +43,15 @@
 
 #include "mtipl2sw.h"
 
+static void swap_buffer(void *bufaddr, int len)
+{
+	unsigned int *buf = bufaddr;
+	int i;
+
+	for (i = 0; i < len; i += 4, buf++)
+		swab32s(buf);
+}
+
 /* Set the last buffer to wrap */
 static void mtip_set_last_buf_to_wrap(struct cbd_t *bdp)
 {
@@ -436,6 +445,124 @@ static void mtip_config_switch(struct switch_enet_private *fep)
 	       fep->hwp + ESW_IMR);
 }
 
+static netdev_tx_t mtip_start_xmit_port(struct sk_buff *skb,
+					struct net_device *dev, int port)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	unsigned short status;
+	struct cbd_t *bdp;
+	void *bufaddr;
+
+	spin_lock_bh(&fep->hw_lock);
+
+	if (!fep->link[0] && !fep->link[1]) {
+		/* Link is down or autonegotiation is in progress. */
+		netif_stop_queue(dev);
+		spin_unlock_bh(&fep->hw_lock);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Fill in a Tx ring entry */
+	bdp = fep->cur_tx;
+	status = bdp->cbd_sc;
+
+	if (status & BD_ENET_TX_READY) {
+		/* All transmit buffers are full. Bail out.
+		 * This should not happen, since dev->tbusy should be set.
+		 */
+		netif_stop_queue(dev);
+		spin_unlock_bh(&fep->hw_lock);
+		dev_err_ratelimited(&fep->pdev->dev, "%s: tx queue full!.\n",
+				    dev->name);
+		return NETDEV_TX_BUSY;
+	}
+
+	/* Clear all of the status flags */
+	status &= ~BD_ENET_TX_STATS;
+
+	/* Set buffer length and buffer pointer */
+	bufaddr = skb->data;
+	bdp->cbd_datlen = skb->len;
+
+	/* On some FEC implementations data must be aligned on
+	 * 4-byte boundaries. Use bounce buffers to copy data
+	 * and get it aligned.spin
+	 */
+	if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {
+		unsigned int index;
+
+		index = bdp - fep->tx_bd_base;
+		memcpy(fep->tx_bounce[index], skb->data, skb->len);
+		bufaddr = fep->tx_bounce[index];
+	}
+
+	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
+		swap_buffer(bufaddr, skb->len);
+
+	/* Push the data cache so the CPM does not get stale memory
+	 * data.
+	 */
+	bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr,
+					  MTIP_SWITCH_TX_FRSIZE,
+					  DMA_TO_DEVICE);
+	if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
+		dev_err(&fep->pdev->dev,
+			"Failed to map descriptor tx buffer\n");
+		dev->stats.tx_dropped++;
+		dev_kfree_skb_any(skb);
+		goto err;
+	}
+
+	/* Save skb pointer. */
+	fep->tx_skbuff[fep->skb_cur] = skb;
+	fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK;
+
+	/* Send it on its way.  Tell FEC it's ready, interrupt when done,
+	 * it's the last BD of the frame, and to put the CRC on the end.
+	 */
+
+	status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR | BD_ENET_TX_LAST |
+		   BD_ENET_TX_TC);
+
+	/* Synchronize all descriptor writes */
+	wmb();
+	bdp->cbd_sc = status;
+
+	skb_tx_timestamp(skb);
+
+	/* Trigger transmission start */
+	writel(MCF_ESW_TDAR_X_DES_ACTIVE, fep->hwp + ESW_TDAR);
+
+	dev->stats.tx_bytes += skb->len;
+	/* If this was the last BD in the ring,
+	 * start at the beginning again.
+	 */
+	if (status & BD_ENET_TX_WRAP)
+		bdp = fep->tx_bd_base;
+	else
+		bdp++;
+
+	if (bdp == fep->dirty_tx) {
+		fep->tx_full = 1;
+		netif_stop_queue(dev);
+	}
+
+	fep->cur_tx = bdp;
+ err:
+	spin_unlock_bh(&fep->hw_lock);
+
+	return NETDEV_TX_OK;
+}
+
+static netdev_tx_t mtip_start_xmit(struct sk_buff *skb,
+				   struct net_device *dev)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+
+	return mtip_start_xmit_port(skb, dev, priv->portnum);
+}
+
 static void mtip_configure_enet_mii(struct switch_enet_private *fep, int port)
 {
 	struct phy_device *phydev = fep->phy_dev[port - 1];
@@ -593,6 +720,70 @@ static void mtip_switch_restart(struct net_device *dev, int duplex0,
 	mtip_config_switch(fep);
 }
 
+static void mtip_print_hw_state(struct net_device *dev)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	struct cbd_t *bdp;
+	int i;
+
+	spin_lock_bh(&fep->hw_lock);
+	dev_info(&dev->dev, "%s: transmit timed out.\n", dev->name);
+	dev_info(&dev->dev,
+		 "Ring data: cur_tx 0x%p%s, dirty_tx 0x%p cur_rx: 0x%p\n",
+		 fep->cur_tx, fep->tx_full ? " (full)" : "", fep->dirty_tx,
+		 fep->cur_rx);
+
+	bdp = fep->tx_bd_base;
+	dev_info(&dev->dev, " tx: %u buffers\n", TX_RING_SIZE);
+	for (i = 0; i < TX_RING_SIZE; i++) {
+		dev_info(&dev->dev, "  0x%p: %04x %04x %08x\n",
+			 bdp, bdp->cbd_sc, bdp->cbd_datlen,
+			 (int)bdp->cbd_bufaddr);
+		bdp++;
+	}
+
+	bdp = fep->rx_bd_base;
+	dev_info(&dev->dev, " rx: %lu buffers\n", RX_RING_SIZE);
+	for (i = 0 ; i < RX_RING_SIZE; i++) {
+		dev_info(&dev->dev, "  0x%p: %04x %04x %08x\n",
+			 bdp, bdp->cbd_sc, bdp->cbd_datlen,
+			 (int)bdp->cbd_bufaddr);
+		bdp++;
+	}
+	spin_unlock_bh(&fep->hw_lock);
+}
+
+static void mtip_timeout(struct net_device *dev, unsigned int txqueue)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+
+	dev->stats.tx_errors++;
+	DO_ONCE(mtip_print_hw_state, dev);
+
+	schedule_work(&priv->tx_timeout_work);
+}
+
+static void mtip_timeout_work(struct work_struct *work)
+{
+	struct mtip_ndev_priv *priv =
+		container_of(work, struct mtip_ndev_priv, tx_timeout_work);
+	struct switch_enet_private *fep = priv->fep;
+	struct net_device *dev = priv->dev;
+
+	rtnl_lock();
+	if (netif_device_present(dev) || netif_running(dev)) {
+		napi_disable(&fep->napi);
+		netif_tx_lock_bh(dev);
+		mtip_switch_restart(dev, fep->full_duplex[0],
+				    fep->full_duplex[1]);
+		netif_tx_wake_all_queues(dev);
+		netif_tx_unlock_bh(dev);
+		napi_enable(&fep->napi);
+	}
+	rtnl_unlock();
+}
+
 static irqreturn_t mtip_interrupt(int irq, void *ptr_fep)
 {
 	struct switch_enet_private *fep = ptr_fep;
@@ -1071,6 +1262,92 @@ static int mtip_close(struct net_device *dev)
 	return 0;
 }
 
+#define FEC_HASH_BITS	6		/* #bits in hash */
+static void mtip_set_multicast_list(struct net_device *dev)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	unsigned int hash_high = 0, hash_low = 0, crc;
+	struct switch_enet_private *fep = priv->fep;
+	void __iomem *enet_addr = fep->enet_addr;
+	struct netdev_hw_addr *ha;
+	unsigned char hash;
+
+	if (priv->portnum == 2)
+		enet_addr += MCF_ESW_ENET_PORT_OFFSET;
+
+	if (dev->flags & IFF_PROMISC) {
+		/* Promisc mode is required for switch - it is
+		 * already enabled during driver's probe.
+		 */
+		dev_dbg(&dev->dev, "%s: IFF_PROMISC\n", __func__);
+		return;
+	}
+
+	if (dev->flags & IFF_ALLMULTI) {
+		dev_dbg(&dev->dev, "%s: IFF_ALLMULTI\n", __func__);
+
+		/* Allow all multicast addresses */
+		writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH);
+		writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW);
+
+		return;
+	}
+
+	netdev_for_each_mc_addr(ha, dev) {
+		/* Calculate crc32 value of mac address */
+		crc = ether_crc_le(dev->addr_len, ha->addr);
+
+		/* Only upper 6 bits (FEC_HASH_BITS) are used
+		 * which point to specific bit in the hash registers
+		 */
+		hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3F;
+
+		if (hash > 31)
+			hash_high |= 1 << (hash - 32);
+		else
+			hash_low |= 1 << hash;
+	}
+
+	writel(hash_high, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH);
+	writel(hash_low, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW);
+}
+
+static int mtip_set_mac_address(struct net_device *dev, void *p)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	void __iomem *enet_addr = fep->enet_addr;
+	struct sockaddr *addr = p;
+
+	if (!is_valid_ether_addr(addr->sa_data))
+		return -EADDRNOTAVAIL;
+	eth_hw_addr_set(dev, addr->sa_data);
+
+	if (priv->portnum == 2)
+		enet_addr += MCF_ESW_ENET_PORT_OFFSET;
+
+	writel(dev->dev_addr[3] | (dev->dev_addr[2] << 8) |
+	       (dev->dev_addr[1] << 16) | (dev->dev_addr[0] << 24),
+	       enet_addr + MCF_FEC_PALR);
+	writel((dev->dev_addr[5] << 16) | (dev->dev_addr[4] << 24),
+	       enet_addr + MCF_FEC_PAUR);
+
+	return mtip_update_atable_static((unsigned char *)dev->dev_addr,
+					 7, 7, fep);
+}
+
+static int mtip_get_port_parent_id(struct net_device *ndev,
+				   struct netdev_phys_item_id *ppid)
+{
+	struct mtip_ndev_priv *priv = netdev_priv(ndev);
+	struct switch_enet_private *fep = priv->fep;
+
+	ppid->id_len = sizeof(fep->mac[0]);
+	memcpy(&ppid->id, &fep->mac[0], ppid->id_len);
+
+	return 0;
+}
+
 static const struct ethtool_ops mtip_ethtool_ops = {
 	.get_link_ksettings     = phy_ethtool_get_link_ksettings,
 	.set_link_ksettings     = phy_ethtool_set_link_ksettings,
@@ -1082,6 +1359,11 @@ static const struct ethtool_ops mtip_ethtool_ops = {
 static const struct net_device_ops mtip_netdev_ops = {
 	.ndo_open		= mtip_open,
 	.ndo_stop		= mtip_close,
+	.ndo_start_xmit	= mtip_start_xmit,
+	.ndo_set_rx_mode	= mtip_set_multicast_list,
+	.ndo_tx_timeout	= mtip_timeout,
+	.ndo_set_mac_address	= mtip_set_mac_address,
+	.ndo_get_port_parent_id	= mtip_get_port_parent_id,
 };
 
 bool mtip_is_switch_netdev_port(const struct net_device *ndev)
@@ -1186,6 +1468,8 @@ static int mtip_ndev_init(struct switch_enet_private *fep,
 			goto cleanup_created_ndev;
 		}
 
+		INIT_WORK(&priv->tx_timeout_work, mtip_timeout_work);
+
 		dev_dbg(&fep->ndev[i]->dev, "%s: MTIP eth L2 switch %pM\n",
 			fep->ndev[i]->name, fep->ndev[i]->dev_addr);
 	}
-- 
2.39.5
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Jakub Kicinski 1 month, 1 week ago
On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:
> +
> +	/* On some FEC implementations data must be aligned on
> +	 * 4-byte boundaries. Use bounce buffers to copy data
> +	 * and get it aligned.spin
> +	 */

Almost forgot, the word "spin" appears here rather out of place.
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Łukasz Majewski 3 weeks, 5 days ago
Hi Jakub,

> On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:
> > +
> > +	/* On some FEC implementations data must be aligned on
> > +	 * 4-byte boundaries. Use bounce buffers to copy data
> > +	 * and get it aligned.spin
> > +	 */  
> 
> Almost forgot, the word "spin" appears here rather out of place.

Thanks for pointing this out.

-- 
Best regards,

Łukasz Majewski
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Jakub Kicinski 1 month, 1 week ago
On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:
> +	/* Set buffer length and buffer pointer */
> +	bufaddr = skb->data;

You can't write (swap) skb->data if the skb is a clone..

> +	bdp->cbd_datlen = skb->len;
> +
> +	/* On some FEC implementations data must be aligned on
> +	 * 4-byte boundaries. Use bounce buffers to copy data
> +	 * and get it aligned.spin
> +	 */
> +	if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {

add 
	.. ||
	(fep->quirks & FEC_QUIRK_SWAP_FRAME && skb_cloned(skb))

here to switch to the local buffer for clones ?

> +		unsigned int index;
> +
> +		index = bdp - fep->tx_bd_base;
> +		memcpy(fep->tx_bounce[index], skb->data, skb->len);
> +		bufaddr = fep->tx_bounce[index];
> +	}
> +
> +	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> +		swap_buffer(bufaddr, skb->len);

> +	if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
> +		dev_err(&fep->pdev->dev,
> +			"Failed to map descriptor tx buffer\n");

All per-packet prints must be rate limited

> +		/* Since we have freed up a buffer, the ring is no longer
> +		 * full.
> +		 */
> +		if (fep->tx_full) {
> +			fep->tx_full = 0;
> +			if (netif_queue_stopped(dev))
> +				netif_wake_queue(dev);
> +		}

I must say I'm still quite confused by the netdev management in this
driver. You seem to have 2 netdevs, one per port. There's one
set of queues and one NAPI. Whichever netdev gets up first gets the
NAPI. What makes my head spin is that you seem to record which
netdev/port was doing Rx _last_ and then pass that netdev to
mtip_switch_tx(). Why? Isn't the dev that we're completing Tx for is
best read from skb->dev packet by packet? Also this wake up logic looks
like it will wake up _one_ netdev's queue and then set tx_full = 0, so
presumably it will not wake the other port if both ports queues were
stopped. Why keep tx_full state in the first place? Just check if the
queues is stopped..?
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Łukasz Majewski 3 weeks, 5 days ago
Hi Jakub,

Sorry for late reply.

> On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:
> > +	/* Set buffer length and buffer pointer */
> > +	bufaddr = skb->data;  
> 
> You can't write (swap) skb->data if the skb is a clone..

I do use skb = buld_skb() which, "builds" the SKB around the memory
page (from pool).

Then, I "pass" this data (and swap it) to upper layer of the network
stack.

The same approach is used in the fec_main.c driver:
https://elixir.bootlin.com/linux/v6.17-rc3/source/drivers/net/ethernet/freescale/fec_main.c#L1853

> 
> > +	bdp->cbd_datlen = skb->len;
> > +
> > +	/* On some FEC implementations data must be aligned on
> > +	 * 4-byte boundaries. Use bounce buffers to copy data
> > +	 * and get it aligned.spin
> > +	 */
> > +	if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {  
> 
> add 
> 	.. ||
> 	(fep->quirks & FEC_QUIRK_SWAP_FRAME && skb_cloned(skb))
> 
> here to switch to the local buffer for clones ?

Please see the above comment.

> 
> > +		unsigned int index;
> > +
> > +		index = bdp - fep->tx_bd_base;
> > +		memcpy(fep->tx_bounce[index], skb->data, skb->len);
> > +		bufaddr = fep->tx_bounce[index];
> > +	}
> > +
> > +	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> > +		swap_buffer(bufaddr, skb->len);  
> 
> > +	if (unlikely(dma_mapping_error(&fep->pdev->dev,
> > bdp->cbd_bufaddr))) {
> > +		dev_err(&fep->pdev->dev,
> > +			"Failed to map descriptor tx buffer\n");  
> 
> All per-packet prints must be rate limited

Ok. I will update it globally.

> 
> > +		/* Since we have freed up a buffer, the ring is no
> > longer
> > +		 * full.
> > +		 */
> > +		if (fep->tx_full) {
> > +			fep->tx_full = 0;
> > +			if (netif_queue_stopped(dev))
> > +				netif_wake_queue(dev);
> > +		}  
> 
> I must say I'm still quite confused by the netdev management in this
> driver. You seem to have 2 netdevs, one per port.

Yes.

> There's one
> set of queues and one NAPI.

Yes.

> Whichever netdev gets up first gets the
> NAPI.

Yes.

What I'm trying to do - is to model the HW which I do have...

When switch is enabled I do have ONE uDMA0 which works for both eth
ports (lan0 and lan1).

That is why I do have only one NAPI queue.

> What makes my head spin is that you seem to record which
> netdev/port was doing Rx _last_ and then pass that netdev to
> mtip_switch_tx(). Why?

You may have port == 1 || port == 2 when you receive packet from ingres
ports.
You may also have port == 0xFF when you first time encounter the SA on
the port and port == 0 when you send/receive data from the "host"
interface.

When port 1/2 is "detected" then the net dev for this particular port
is used. In other cases the one for NAPI is used (which is one of those
two - please see comment above).

This was the approach from original NXP (Freescale) driver. It in some
way prevents from "starvation" from net devices when L2 switch is
disabled and I need to provide port separation.

(port separation in fact is achieved by programming L2 switch registers
and is realized in HW).

> Isn't the dev that we're completing Tx for is
> best read from skb->dev packet by packet?

It may be worth to try.... I think that the code, which we do have now,
tries to reuse some kind of "locality".

> Also this wake up logic
> looks like it will wake up _one_ netdev's queue and then set tx_full
> = 0, so presumably it will not wake the other port if both ports
> queues were stopped. Why keep tx_full state in the first place? Just
> check if the queues is stopped..?

As I said - we do have only ONE queue, which corresponds to uDMA0 when
the switch is enabled. This single queue is responsible for handling
transmission for both ports (this is how the HW is designed).



-- 
Best regards,

Łukasz Majewski
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Jakub Kicinski 3 weeks, 3 days ago
On Sun, 7 Sep 2025 18:38:54 +0200 Łukasz Majewski wrote:
> > On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:  
> > > +	/* Set buffer length and buffer pointer */
> > > +	bufaddr = skb->data;    
> > 
> > You can't write (swap) skb->data if the skb is a clone..  
> 
> I do use skb = buld_skb() which, "builds" the SKB around the memory
> page (from pool).
> 
> Then, I "pass" this data (and swap it) to upper layer of the network
> stack.
> 
> The same approach is used in the fec_main.c driver:
> https://elixir.bootlin.com/linux/v6.17-rc3/source/drivers/net/ethernet/freescale/fec_main.c#L1853

I probably cut out too much context. I think I was quoting from Tx,
indeed on Rx this is not an issue.

> What I'm trying to do - is to model the HW which I do have...
> 
> When switch is enabled I do have ONE uDMA0 which works for both eth
> ports (lan0 and lan1).
> 
> That is why I do have only one NAPI queue.
> 
> > What makes my head spin is that you seem to record which
> > netdev/port was doing Rx _last_ and then pass that netdev to
> > mtip_switch_tx(). Why?  
> 
> You may have port == 1 || port == 2 when you receive packet from ingres
> ports.
> You may also have port == 0xFF when you first time encounter the SA on
> the port and port == 0 when you send/receive data from the "host"
> interface.
> 
> When port 1/2 is "detected" then the net dev for this particular port
> is used. In other cases the one for NAPI is used (which is one of those
> two - please see comment above).
> 
> This was the approach from original NXP (Freescale) driver. It in some
> way prevents from "starvation" from net devices when L2 switch is
> disabled and I need to provide port separation.
> 
> (port separation in fact is achieved by programming L2 switch registers
> and is realized in HW).

But what if we have mixed traffic from port 1 and 2?
Does the current code correctly Rx the packets from port 1 on the
netdev from port 1 and packets from port 2 on the netdev from port 2?

> > Isn't the dev that we're completing Tx for is
> > best read from skb->dev packet by packet?  
> 
> It may be worth to try.... I think that the code, which we do have now,
> tries to reuse some kind of "locality".
> 
> > Also this wake up logic
> > looks like it will wake up _one_ netdev's queue and then set tx_full
> > = 0, so presumably it will not wake the other port if both ports
> > queues were stopped. Why keep tx_full state in the first place? Just
> > check if the queues is stopped..?  
> 
> As I said - we do have only ONE queue, which corresponds to uDMA0 when
> the switch is enabled. This single queue is responsible for handling
> transmission for both ports (this is how the HW is designed).

Right but kernel has two SW queues. Which can be independently stopped.
So my concerns is that for example port 1 is very busy so the queue is
full of packets for port 1, port 1's netdev's queue gets stopped.
Then port 2 tries to Tx, queue is shared, and is full, so netdev 2's
SW queue is also stopped. Then we complete the packets, because packets
were for port 1 we wake the queue for port 1. But port 2 also got
stopped, even tho it never put a packet on the ring..

Long story short I think you need to always stop and start queues from
both netdevs.. There's just 2 so not too bad of a hack.
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Łukasz Majewski 3 weeks, 1 day ago
Hi Jakub,

> On Sun, 7 Sep 2025 18:38:54 +0200 Łukasz Majewski wrote:
> > > On Mon, 25 Aug 2025 00:07:33 +0200 Lukasz Majewski wrote:    
> > > > +	/* Set buffer length and buffer pointer */
> > > > +	bufaddr = skb->data;      
> > > 
> > > You can't write (swap) skb->data if the skb is a clone..    
> > 
> > I do use skb = buld_skb() which, "builds" the SKB around the memory
> > page (from pool).
> > 
> > Then, I "pass" this data (and swap it) to upper layer of the network
> > stack.
> > 
> > The same approach is used in the fec_main.c driver:
> > https://elixir.bootlin.com/linux/v6.17-rc3/source/drivers/net/ethernet/freescale/fec_main.c#L1853
> >  
> 
> I probably cut out too much context. I think I was quoting from Tx,
> indeed on Rx this is not an issue.

Ok. No adjustments needed then. Good :)

> 
> > What I'm trying to do - is to model the HW which I do have...
> > 
> > When switch is enabled I do have ONE uDMA0 which works for both eth
> > ports (lan0 and lan1).
> > 
> > That is why I do have only one NAPI queue.
> >   
> > > What makes my head spin is that you seem to record which
> > > netdev/port was doing Rx _last_ and then pass that netdev to
> > > mtip_switch_tx(). Why?    
> > 
> > You may have port == 1 || port == 2 when you receive packet from
> > ingres ports.
> > You may also have port == 0xFF when you first time encounter the SA
> > on the port and port == 0 when you send/receive data from the "host"
> > interface.
> > 
> > When port 1/2 is "detected" then the net dev for this particular
> > port is used. In other cases the one for NAPI is used (which is one
> > of those two - please see comment above).
> > 
> > This was the approach from original NXP (Freescale) driver. It in
> > some way prevents from "starvation" from net devices when L2 switch
> > is disabled and I need to provide port separation.
> > 
> > (port separation in fact is achieved by programming L2 switch
> > registers and is realized in HW).  
> 
> But what if we have mixed traffic from port 1 and 2?
> Does the current code correctly Rx the packets from port 1 on the
> netdev from port 1 and packets from port 2 on the netdev from port 2?

Yes, it does.

In the mtip_rx_napi() you have call to mtip_switch_rx() which accepts
pointer to port variable.

It sets it according to the information provided by the switch IP block
HW and also adjust the skb's ndev.

I'm just wondering if the snippet from mtip_rx_napi():
-------8<--------
if ((port == 1 || port == 2) && fep->ndev[port - 1]
	mtip_switch_tx(fep->ndev[port - 1]);
else
	mtip_switch_tx(napi->dev);
------->8--------

could be replaced just with mtip_switch_tx(napi->dev);
as TX via napi->dev shall be forward to both ports if required.

I will check if this can be done in such a way.

> 
> > > Isn't the dev that we're completing Tx for is
> > > best read from skb->dev packet by packet?    
> > 
> > It may be worth to try.... I think that the code, which we do have
> > now, tries to reuse some kind of "locality".
> >   
> > > Also this wake up logic
> > > looks like it will wake up _one_ netdev's queue and then set
> > > tx_full = 0, so presumably it will not wake the other port if
> > > both ports queues were stopped. Why keep tx_full state in the
> > > first place? Just check if the queues is stopped..?    
> > 
> > As I said - we do have only ONE queue, which corresponds to uDMA0
> > when the switch is enabled. This single queue is responsible for
> > handling transmission for both ports (this is how the HW is
> > designed).  
> 
> Right but kernel has two SW queues.

You mean a separate SW queues for each devices? This is not supported
in the MTIP L2 switch driver. Maybe such high level SW queues
management is available in the upper layers?

> Which can be independently
> stopped.

It supports separate RX and TX HW queues (i.e. ring buffers for
descriptors) for the uDMA0 when switch is enabled.

When you want to send data (no matter from which lan[01] device) the
same mtip_start_xmit() is called, the HW TX descriptor is setup and is
passed via uDMA0 to L2 switch IP block.

For next TX transmission (even from different port) we assign another
descriptor from the ring buffer.

> So my concerns is that for example port 1 is very busy so
> the queue is full of packets for port 1, port 1's netdev's queue gets
> stopped. Then port 2 tries to Tx, queue is shared, and is full, so
> netdev 2's SW queue is also stopped. Then we complete the packets,
> because packets were for port 1 we wake the queue for port 1. But
> port 2 also got stopped, even tho it never put a packet on the ring..
> 

As fair as I can tell - both ports call mtip_start_xmit(), their data
is serialized to the TX queue (via descriptors).

Queued descriptors are sent always at some point (or overridden if
critical error encountered).

> Long story short I think you need to always stop and start queues from
> both netdevs.. There's just 2 so not too bad of a hack.

Maybe there are some peculiarities in for example bridge code (or upper
network stack layers in general), but I think, that I don't need any
extra "queue" management for TX code of MTIP L2 switch.

-- 
Best regards,

Łukasz Majewski
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Jakub Kicinski 3 weeks, 1 day ago
On Wed, 10 Sep 2025 23:15:52 +0200 Łukasz Majewski wrote:
> > > I do use skb = buld_skb() which, "builds" the SKB around the memory
> > > page (from pool).
> > > 
> > > Then, I "pass" this data (and swap it) to upper layer of the network
> > > stack.
> > > 
> > > The same approach is used in the fec_main.c driver:
> > > https://elixir.bootlin.com/linux/v6.17-rc3/source/drivers/net/ethernet/freescale/fec_main.c#L1853
> > >    
> > 
> > I probably cut out too much context. I think I was quoting from Tx,
> > indeed on Rx this is not an issue.  
> 
> Ok. No adjustments needed then. Good :)

No, you were talking about build_skb() which is Rx.
This is the patch that adds Tx. Tx is wrong.
You can't modify the skb unless you call skb_cow().
Or just copy the data out to your local buffer.

> > > You may have port == 1 || port == 2 when you receive packet from
> > > ingres ports.
> > > You may also have port == 0xFF when you first time encounter the SA
> > > on the port and port == 0 when you send/receive data from the "host"
> > > interface.
> > > 
> > > When port 1/2 is "detected" then the net dev for this particular
> > > port is used. In other cases the one for NAPI is used (which is one
> > > of those two - please see comment above).
> > > 
> > > This was the approach from original NXP (Freescale) driver. It in
> > > some way prevents from "starvation" from net devices when L2 switch
> > > is disabled and I need to provide port separation.
> > > 
> > > (port separation in fact is achieved by programming L2 switch
> > > registers and is realized in HW).    
> > 
> > But what if we have mixed traffic from port 1 and 2?
> > Does the current code correctly Rx the packets from port 1 on the
> > netdev from port 1 and packets from port 2 on the netdev from port 2?  
> 
> Yes, it does.
> 
> In the mtip_rx_napi() you have call to mtip_switch_rx() which accepts
> pointer to port variable.
> 
> It sets it according to the information provided by the switch IP block
> HW and also adjust the skb's ndev.
> 
> I'm just wondering if the snippet from mtip_rx_napi():
> -------8<--------
> if ((port == 1 || port == 2) && fep->ndev[port - 1]
> 	mtip_switch_tx(fep->ndev[port - 1]);
> else
> 	mtip_switch_tx(napi->dev);
> ------->8--------  
> 
> could be replaced just with mtip_switch_tx(napi->dev);
> as TX via napi->dev shall be forward to both ports if required.
> 
> I will check if this can be done in such a way.

Not napi->dev. You have to attribute sent packets to the right netdev.

> > > As I said - we do have only ONE queue, which corresponds to uDMA0
> > > when the switch is enabled. This single queue is responsible for
> > > handling transmission for both ports (this is how the HW is
> > > designed).    
> > 
> > Right but kernel has two SW queues.  
> 
> You mean a separate SW queues for each devices? This is not supported
> in the MTIP L2 switch driver. Maybe such high level SW queues
> management is available in the upper layers?

Not possible, each netdev has it's own private qdisc tree.

> > Which can be independently
> > stopped.  
> 
> It supports separate RX and TX HW queues (i.e. ring buffers for
> descriptors) for the uDMA0 when switch is enabled.
> 
> When you want to send data (no matter from which lan[01] device) the
> same mtip_start_xmit() is called, the HW TX descriptor is setup and is
> passed via uDMA0 to L2 switch IP block.
> 
> For next TX transmission (even from different port) we assign another
> descriptor from the ring buffer.
> 
> > So my concerns is that for example port 1 is very busy so
> > the queue is full of packets for port 1, port 1's netdev's queue gets
> > stopped. Then port 2 tries to Tx, queue is shared, and is full, so
> > netdev 2's SW queue is also stopped. Then we complete the packets,
> > because packets were for port 1 we wake the queue for port 1. But
> > port 2 also got stopped, even tho it never put a packet on the ring..
> >   
> 
> As fair as I can tell - both ports call mtip_start_xmit(), their data
> is serialized to the TX queue (via descriptors).
> 
> Queued descriptors are sent always at some point (or overridden if
> critical error encountered).
> 
> > Long story short I think you need to always stop and start queues from
> > both netdevs.. There's just 2 so not too bad of a hack.  
> 
> Maybe there are some peculiarities in for example bridge code (or upper
> network stack layers in general), but I think, that I don't need any
> extra "queue" management for TX code of MTIP L2 switch.

I think I explained this enough times. Next version is v20.
If it's not significantly better than this one, I'm going to have 
to ask you to stop posting this driver.
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Łukasz Majewski 3 weeks ago
Hi Jakub,

> On Wed, 10 Sep 2025 23:15:52 +0200 Łukasz Majewski wrote:
> > > > I do use skb = buld_skb() which, "builds" the SKB around the
> > > > memory page (from pool).
> > > > 
> > > > Then, I "pass" this data (and swap it) to upper layer of the
> > > > network stack.
> > > > 
> > > > The same approach is used in the fec_main.c driver:
> > > > https://elixir.bootlin.com/linux/v6.17-rc3/source/drivers/net/ethernet/freescale/fec_main.c#L1853
> > > >      
> > > 
> > > I probably cut out too much context. I think I was quoting from
> > > Tx, indeed on Rx this is not an issue.    
> > 
> > Ok. No adjustments needed then. Good :)  
> 
> No, you were talking about build_skb() which is Rx.
> This is the patch that adds Tx. Tx is wrong.

The same approach is taken in fec_main.c (@ fec_enet_txq_submit_skb()
function).

> You can't modify the skb unless you call skb_cow().
> Or just copy the data out to your local buffer.

In the mtip_start_xmit_port() I do assign the address to skb->data to
bufaddr pointer.

If alignment is wrong the we copy it to bounce buffer.
Then we do swap the buffer if needed.

Last step is to dma map this memory and assign the pointer to the
descriptor for L2 switch transmission.

> 
> > > > You may have port == 1 || port == 2 when you receive packet from
> > > > ingres ports.
> > > > You may also have port == 0xFF when you first time encounter
> > > > the SA on the port and port == 0 when you send/receive data
> > > > from the "host" interface.
> > > > 
> > > > When port 1/2 is "detected" then the net dev for this particular
> > > > port is used. In other cases the one for NAPI is used (which is
> > > > one of those two - please see comment above).
> > > > 
> > > > This was the approach from original NXP (Freescale) driver. It
> > > > in some way prevents from "starvation" from net devices when L2
> > > > switch is disabled and I need to provide port separation.
> > > > 
> > > > (port separation in fact is achieved by programming L2 switch
> > > > registers and is realized in HW).      
> > > 
> > > But what if we have mixed traffic from port 1 and 2?
> > > Does the current code correctly Rx the packets from port 1 on the
> > > netdev from port 1 and packets from port 2 on the netdev from
> > > port 2?    
> > 
> > Yes, it does.
> > 
> > In the mtip_rx_napi() you have call to mtip_switch_rx() which
> > accepts pointer to port variable.
> > 
> > It sets it according to the information provided by the switch IP
> > block HW and also adjust the skb's ndev.
> > 
> > I'm just wondering if the snippet from mtip_rx_napi():
> > -------8<--------
> > if ((port == 1 || port == 2) && fep->ndev[port - 1]
> > 	mtip_switch_tx(fep->ndev[port - 1]);
> > else
> > 	mtip_switch_tx(napi->dev);  
> > ------->8--------    
> > 
> > could be replaced just with mtip_switch_tx(napi->dev);
> > as TX via napi->dev shall be forward to both ports if required.
> > 
> > I will check if this can be done in such a way.  
> 
> Not napi->dev. You have to attribute sent packets to the right netdev.

And then we do have some issue to solve. To be more specific -
fec_main.c to avoid starvation just from fec_enet_rx_napi() calls
fec_enet_tx() with only one net device (which it supports).

I wanted to mimic such behaviour with L2 switch driver (at
mtip_rx_napi()), but then the question - which network device (from
available two) shall be assigned?

The net device passed to mtip_switch_tx() is only relevant for
"housekeeping/statistical data" as in fact we just provide another
descriptor to the HW to be sent.

Maybe I shall extract the net device pointer from the skb structure?

> 
> > > > As I said - we do have only ONE queue, which corresponds to
> > > > uDMA0 when the switch is enabled. This single queue is
> > > > responsible for handling transmission for both ports (this is
> > > > how the HW is designed).      
> > > 
> > > Right but kernel has two SW queues.    
> > 
> > You mean a separate SW queues for each devices? This is not
> > supported in the MTIP L2 switch driver. Maybe such high level SW
> > queues management is available in the upper layers?  
> 
> Not possible, each netdev has it's own private qdisc tree.

Please correct me if I'm wrong, but aren't packets from those queues
end up with calling ->ndo_start_xmit() function?

> 
> > > Which can be independently
> > > stopped.    
> > 
> > It supports separate RX and TX HW queues (i.e. ring buffers for
> > descriptors) for the uDMA0 when switch is enabled.
> > 
> > When you want to send data (no matter from which lan[01] device) the
> > same mtip_start_xmit() is called, the HW TX descriptor is setup and
> > is passed via uDMA0 to L2 switch IP block.
> > 
> > For next TX transmission (even from different port) we assign
> > another descriptor from the ring buffer.
> >   
> > > So my concerns is that for example port 1 is very busy so
> > > the queue is full of packets for port 1, port 1's netdev's queue
> > > gets stopped. Then port 2 tries to Tx, queue is shared, and is
> > > full, so netdev 2's SW queue is also stopped. Then we complete
> > > the packets, because packets were for port 1 we wake the queue
> > > for port 1. But port 2 also got stopped, even tho it never put a
> > > packet on the ring.. 
> > 
> > As fair as I can tell - both ports call mtip_start_xmit(), their
> > data is serialized to the TX queue (via descriptors).
> > 
> > Queued descriptors are sent always at some point (or overridden if
> > critical error encountered).
> >   
> > > Long story short I think you need to always stop and start queues
> > > from both netdevs.. There's just 2 so not too bad of a hack.    
> > 
> > Maybe there are some peculiarities in for example bridge code (or
> > upper network stack layers in general), but I think, that I don't
> > need any extra "queue" management for TX code of MTIP L2 switch.  
> 
> I think I explained this enough times. Next version is v20.
> If it's not significantly better than this one, I'm going to have 
> to ask you to stop posting this driver.

I don't know how to reply to this comment, really. 

I've spent many hours of my spare time to upstream this driver.
I'm just disappointed (and maybe I will not say more because of high
level of my frustration).




Could you point me to the driver example which provides such queues
management for switchdev driver? Just to show what you expect from me.

One example.


-- 
Best regards,

Łukasz Majewski
Re: [net-next v19 4/7] net: mtip: Add net_device_ops functions to the L2 switch driver
Posted by Jakub Kicinski 3 weeks ago
On Thu, 11 Sep 2025 23:55:47 +0200 Łukasz Majewski wrote:
> > > Ok. No adjustments needed then. Good :)    
> > 
> > No, you were talking about build_skb() which is Rx.
> > This is the patch that adds Tx. Tx is wrong.  
> 
> The same approach is taken in fec_main.c (@ fec_enet_txq_submit_skb()
> function).

FWIW I'm 99% sure we were once investigating a bug in FEC related to
modifying timestamped packets, leading to crashes. Maybe there is more.

> > > could be replaced just with mtip_switch_tx(napi->dev);
> > > as TX via napi->dev shall be forward to both ports if required.
> > > 
> > > I will check if this can be done in such a way.    
> > 
> > Not napi->dev. You have to attribute sent packets to the right netdev.  
> 
> And then we do have some issue to solve. To be more specific -
> fec_main.c to avoid starvation just from fec_enet_rx_napi() calls
> fec_enet_tx() with only one net device (which it supports).
> 
> I wanted to mimic such behaviour with L2 switch driver (at
> mtip_rx_napi()), but then the question - which network device (from
> available two) shall be assigned?
> 
> The net device passed to mtip_switch_tx() is only relevant for
> "housekeeping/statistical data" as in fact we just provide another
> descriptor to the HW to be sent.
> 
> Maybe I shall extract the net device pointer from the skb structure?

Exactly :)

> > > You mean a separate SW queues for each devices? This is not
> > > supported in the MTIP L2 switch driver. Maybe such high level SW
> > > queues management is available in the upper layers?    
> > 
> > Not possible, each netdev has it's own private qdisc tree.  
> 
> Please correct me if I'm wrong, but aren't packets from those queues
> end up with calling ->ndo_start_xmit() function?

Right. I think I'm lost, why does this matter?

> > I think I explained this enough times. Next version is v20.
> > If it's not significantly better than this one, I'm going to have 
> > to ask you to stop posting this driver.  
> 
> I don't know how to reply to this comment, really. 
> 
> I've spent many hours of my spare time to upstream this driver.
> I'm just disappointed (and maybe I will not say more because of high
> level of my frustration).

I believe mlxsw has fewer DMA queues than ports. But TBH I'm not sure
how they handle the congestion. In your case since you only have two
ports (at most) I think you can trivially just always stop and start
both.