Provide callbacks in struct net_device_ops for the MTIP
L2 switch.
Signed-off-by: Lukasz Majewski <lukma@denx.de>
---
Changes for v13:
- New patch - created by extracting code from the large (i.e. v12 and
earlier) MTIP driver
Changes for v14:
- Add read memory barrier (rmb) before reading the current descriptor
- Use proper locking primitives
Changes for v15:
- None
---
.../net/ethernet/freescale/mtipsw/mtipl2sw.c | 277 ++++++++++++++++++
1 file changed, 277 insertions(+)
diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
index b7e9f73fe4fa..3ec76b06f1f1 100644
--- a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
+++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
@@ -43,6 +43,15 @@
#include "mtipl2sw.h"
+static void swap_buffer(void *bufaddr, int len)
+{
+ int i;
+ unsigned int *buf = bufaddr;
+
+ for (i = 0; i < len; i += 4, buf++)
+ swab32s(buf);
+}
+
/* Set the last buffer to wrap */
static void mtip_set_last_buf_to_wrap(struct cbd_t *bdp)
{
@@ -444,6 +453,130 @@ static void mtip_config_switch(struct switch_enet_private *fep)
fep->hwp + ESW_IMR);
}
+static netdev_tx_t mtip_start_xmit_port(struct sk_buff *skb,
+ struct net_device *dev, int port)
+{
+ struct mtip_ndev_priv *priv = netdev_priv(dev);
+ struct switch_enet_private *fep = priv->fep;
+ unsigned short status;
+ struct cbd_t *bdp;
+ void *bufaddr;
+
+ spin_lock(&fep->hw_lock);
+
+ if (!fep->link[0] && !fep->link[1]) {
+ /* Link is down or autonegotiation is in progress. */
+ netif_stop_queue(dev);
+ spin_unlock(&fep->hw_lock);
+ return NETDEV_TX_BUSY;
+ }
+
+ /* Fill in a Tx ring entry */
+ bdp = fep->cur_tx;
+
+ /* Read memory barrier before reading the current transmit descriptor */
+ rmb();
+ status = bdp->cbd_sc;
+
+ if (status & BD_ENET_TX_READY) {
+ /* All transmit buffers are full. Bail out.
+ * This should not happen, since dev->tbusy should be set.
+ */
+ netif_stop_queue(dev);
+ dev_err(&fep->pdev->dev, "%s: tx queue full!\n", dev->name);
+ spin_unlock(&fep->hw_lock);
+ return NETDEV_TX_BUSY;
+ }
+
+ /* Clear all of the status flags */
+ status &= ~BD_ENET_TX_STATS;
+
+ /* Set buffer length and buffer pointer */
+ bufaddr = skb->data;
+ bdp->cbd_datlen = skb->len;
+
+ /* On some FEC implementations data must be aligned on
+ * 4-byte boundaries. Use bounce buffers to copy data
+ * and get it aligned.
+ */
+ if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {
+ unsigned int index;
+
+ index = bdp - fep->tx_bd_base;
+ memcpy(fep->tx_bounce[index],
+ (void *)skb->data, skb->len);
+ bufaddr = fep->tx_bounce[index];
+ }
+
+ if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
+ swap_buffer(bufaddr, skb->len);
+
+ /* Save skb pointer. */
+ fep->tx_skbuff[fep->skb_cur] = skb;
+
+ fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK;
+
+ /* Push the data cache so the CPM does not get stale memory
+ * data.
+ */
+ bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr,
+ MTIP_SWITCH_TX_FRSIZE,
+ DMA_TO_DEVICE);
+ if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
+ dev_err(&fep->pdev->dev,
+ "Failed to map descriptor tx buffer\n");
+ dev->stats.tx_errors++;
+ dev->stats.tx_dropped++;
+ dev_kfree_skb_any(skb);
+ goto err;
+ }
+
+ /* Send it on its way. Tell FEC it's ready, interrupt when done,
+ * it's the last BD of the frame, and to put the CRC on the end.
+ */
+
+ status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR
+ | BD_ENET_TX_LAST | BD_ENET_TX_TC);
+
+ /* Synchronize all descriptor writes */
+ wmb();
+ bdp->cbd_sc = status;
+
+ netif_trans_update(dev);
+ skb_tx_timestamp(skb);
+
+ /* Trigger transmission start */
+ writel(MCF_ESW_TDAR_X_DES_ACTIVE, fep->hwp + ESW_TDAR);
+
+ dev->stats.tx_bytes += skb->len;
+ /* If this was the last BD in the ring,
+ * start at the beginning again.
+ */
+ if (status & BD_ENET_TX_WRAP)
+ bdp = fep->tx_bd_base;
+ else
+ bdp++;
+
+ if (bdp == fep->dirty_tx) {
+ fep->tx_full = 1;
+ netif_stop_queue(dev);
+ }
+
+ fep->cur_tx = bdp;
+ err:
+ spin_unlock(&fep->hw_lock);
+
+ return NETDEV_TX_OK;
+}
+
+static netdev_tx_t mtip_start_xmit(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ struct mtip_ndev_priv *priv = netdev_priv(dev);
+
+ return mtip_start_xmit_port(skb, dev, priv->portnum);
+}
+
static void mtip_configure_enet_mii(struct switch_enet_private *fep, int port)
{
struct phy_device *phydev = fep->phy_dev[port - 1];
@@ -601,6 +734,70 @@ static void mtip_switch_restart(struct net_device *dev, int duplex0,
mtip_config_switch(fep);
}
+static void mtip_timeout(struct net_device *dev, unsigned int txqueue)
+{
+ struct mtip_ndev_priv *priv = netdev_priv(dev);
+ struct switch_enet_private *fep = priv->fep;
+ struct cbd_t *bdp;
+ int i;
+
+ dev->stats.tx_errors++;
+
+ if (IS_ENABLED(CONFIG_SWITCH_DEBUG)) {
+ spin_lock(&fep->hw_lock);
+ dev_info(&dev->dev, "%s: transmit timed out.\n", dev->name);
+ dev_info(&dev->dev,
+ "Ring data: cur_tx %lx%s, dirty_tx %lx cur_rx: %lx\n",
+ (unsigned long)fep->cur_tx,
+ fep->tx_full ? " (full)" : "",
+ (unsigned long)fep->dirty_tx,
+ (unsigned long)fep->cur_rx);
+
+ bdp = fep->tx_bd_base;
+ dev_info(&dev->dev, " tx: %u buffers\n", TX_RING_SIZE);
+ for (i = 0; i < TX_RING_SIZE; i++) {
+ dev_info(&dev->dev, " %08lx: %04x %04x %08x\n",
+ (kernel_ulong_t)bdp, bdp->cbd_sc,
+ bdp->cbd_datlen, (int)bdp->cbd_bufaddr);
+ bdp++;
+ }
+
+ bdp = fep->rx_bd_base;
+ dev_info(&dev->dev, " rx: %lu buffers\n",
+ (unsigned long)RX_RING_SIZE);
+ for (i = 0 ; i < RX_RING_SIZE; i++) {
+ dev_info(&dev->dev, " %08lx: %04x %04x %08x\n",
+ (kernel_ulong_t)bdp,
+ bdp->cbd_sc, bdp->cbd_datlen,
+ (int)bdp->cbd_bufaddr);
+ bdp++;
+ }
+ spin_unlock(&fep->hw_lock);
+ }
+
+ schedule_work(&priv->tx_timeout_work);
+}
+
+static void mtip_timeout_work(struct work_struct *work)
+{
+ struct mtip_ndev_priv *priv =
+ container_of(work, struct mtip_ndev_priv, tx_timeout_work);
+ struct switch_enet_private *fep = priv->fep;
+ struct net_device *dev = priv->dev;
+
+ rtnl_lock();
+ if (netif_device_present(dev) || netif_running(dev)) {
+ napi_disable(&fep->napi);
+ netif_tx_lock_bh(dev);
+ mtip_switch_restart(dev, fep->full_duplex[0],
+ fep->full_duplex[1]);
+ netif_tx_wake_all_queues(dev);
+ netif_tx_unlock_bh(dev);
+ napi_enable(&fep->napi);
+ }
+ rtnl_unlock();
+}
+
static irqreturn_t mtip_interrupt(int irq, void *ptr_fep)
{
struct switch_enet_private *fep = ptr_fep;
@@ -1079,6 +1276,80 @@ static int mtip_close(struct net_device *dev)
return 0;
}
+#define FEC_HASH_BITS 6 /* #bits in hash */
+static void mtip_set_multicast_list(struct net_device *dev)
+{
+ struct mtip_ndev_priv *priv = netdev_priv(dev);
+ unsigned int hash_high = 0, hash_low = 0, crc;
+ struct switch_enet_private *fep = priv->fep;
+ void __iomem *enet_addr = fep->enet_addr;
+ struct netdev_hw_addr *ha;
+ unsigned char hash;
+
+ if (priv->portnum == 2)
+ enet_addr += MCF_ESW_ENET_PORT_OFFSET;
+
+ if (dev->flags & IFF_PROMISC) {
+ /* Promisc mode is required for switch - it is
+ * already enabled during driver's probe.
+ */
+ dev_dbg(&dev->dev, "%s: IFF_PROMISC\n", __func__);
+ return;
+ }
+
+ if (dev->flags & IFF_ALLMULTI) {
+ dev_dbg(&dev->dev, "%s: IFF_ALLMULTI\n", __func__);
+
+ /* Allow all multicast addresses */
+ writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH);
+ writel(0xFFFFFFFF, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW);
+
+ return;
+ }
+
+ netdev_for_each_mc_addr(ha, dev) {
+ /* Calculate crc32 value of mac address */
+ crc = ether_crc_le(dev->addr_len, ha->addr);
+
+ /* Only upper 6 bits (FEC_HASH_BITS) are used
+ * which point to specific bit in the hash registers
+ */
+ hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3F;
+
+ if (hash > 31)
+ hash_high |= 1 << (hash - 32);
+ else
+ hash_low |= 1 << hash;
+ }
+
+ writel(hash_high, enet_addr + MCF_FEC_GRP_HASH_TABLE_HIGH);
+ writel(hash_low, enet_addr + MCF_FEC_GRP_HASH_TABLE_LOW);
+}
+
+static int mtip_set_mac_address(struct net_device *dev, void *p)
+{
+ struct mtip_ndev_priv *priv = netdev_priv(dev);
+ struct switch_enet_private *fep = priv->fep;
+ void __iomem *enet_addr = fep->enet_addr;
+ struct sockaddr *addr = p;
+
+ if (!is_valid_ether_addr(addr->sa_data))
+ return -EADDRNOTAVAIL;
+ eth_hw_addr_set(dev, addr->sa_data);
+
+ if (priv->portnum == 2)
+ enet_addr += MCF_ESW_ENET_PORT_OFFSET;
+
+ writel(dev->dev_addr[3] | (dev->dev_addr[2] << 8) |
+ (dev->dev_addr[1] << 16) | (dev->dev_addr[0] << 24),
+ enet_addr + MCF_FEC_PALR);
+ writel((dev->dev_addr[5] << 16) | (dev->dev_addr[4] << 24),
+ enet_addr + MCF_FEC_PAUR);
+
+ return mtip_update_atable_static((unsigned char *)dev->dev_addr,
+ 7, 7, fep);
+}
+
static const struct ethtool_ops mtip_ethtool_ops = {
.get_link_ksettings = phy_ethtool_get_link_ksettings,
.set_link_ksettings = phy_ethtool_set_link_ksettings,
@@ -1090,6 +1361,10 @@ static const struct ethtool_ops mtip_ethtool_ops = {
static const struct net_device_ops mtip_netdev_ops = {
.ndo_open = mtip_open,
.ndo_stop = mtip_close,
+ .ndo_start_xmit = mtip_start_xmit,
+ .ndo_set_rx_mode = mtip_set_multicast_list,
+ .ndo_tx_timeout = mtip_timeout,
+ .ndo_set_mac_address = mtip_set_mac_address,
};
bool mtip_is_switch_netdev_port(const struct net_device *ndev)
@@ -1191,6 +1466,8 @@ static int mtip_ndev_init(struct switch_enet_private *fep,
break;
}
+ INIT_WORK(&priv->tx_timeout_work, mtip_timeout_work);
+
dev_dbg(&fep->ndev[i]->dev, "%s: MTIP eth L2 switch %pM\n",
fep->ndev[i]->name, fep->ndev[i]->dev_addr);
}
--
2.39.5
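For readers unfamiliar with the FEC-style group hash used in mtip_set_multicast_list() above: the top FEC_HASH_BITS (6) bits of the little-endian CRC-32 of the MAC address select one bit out of the 64-bit GRP_HASH_TABLE_HIGH/LOW register pair. The following self-contained userspace sketch reproduces that selection logic; the bitwise crc32_le() below mirrors the kernel's ether_crc_le() (which is defined as crc32_le(~0, data, len)), and the demo MAC address is an arbitrary choice, not anything taken from the driver.

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Bitwise little-endian CRC-32 (polynomial 0xedb88320). The kernel's
 * ether_crc_le(len, data) is crc32_le(~0, data, len), i.e. this routine
 * seeded with 0xffffffff and with no final inversion.
 */
static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320u : 0);
	}
	return crc;
}

#define FEC_HASH_BITS 6 /* #bits in hash, as in the patch */

int main(void)
{
	/* Arbitrary demo multicast MAC (the IPv4 group 224.1.1.1). */
	const uint8_t mac[6] = { 0x01, 0x00, 0x5e, 0x01, 0x01, 0x01 };
	uint32_t crc = crc32_le(~0u, mac, sizeof(mac));
	unsigned int hash = (crc >> (32 - FEC_HASH_BITS)) & 0x3f;

	/* hash > 31 selects a bit in GRP_HASH_TABLE_HIGH, else in LOW. */
	printf("crc=0x%08x hash=%u -> %s bit %u\n", crc, hash,
	       hash > 31 ? "HIGH" : "LOW", hash > 31 ? hash - 32 : hash);
	return 0;
}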
On Wed, 16 Jul 2025 23:47:25 +0200 Lukasz Majewski wrote:
> +static netdev_tx_t mtip_start_xmit_port(struct sk_buff *skb,
> +					struct net_device *dev, int port)
> +{
> +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> +	struct switch_enet_private *fep = priv->fep;
> +	unsigned short status;
> +	struct cbd_t *bdp;
> +	void *bufaddr;
> +
> +	spin_lock(&fep->hw_lock);

I see some inconsistencies in how you take this lock. Bunch of bare
spin_lock() calls from BH context, but there's also a _irqsave() call
in mtip_adjust_link(). Please align to the strictest context (not sure
if the irqsave is actually needed, at a glance, IOW whether the lock
is taken from an IRQ)

> +	if (!fep->link[0] && !fep->link[1]) {
> +		/* Link is down or autonegotiation is in progress. */
> +		netif_stop_queue(dev);
> +		spin_unlock(&fep->hw_lock);
> +		return NETDEV_TX_BUSY;
> +	}
> +
> +	/* Fill in a Tx ring entry */
> +	bdp = fep->cur_tx;
> +
> +	/* Read memory barrier before reading the current transmit descriptor */

Barriers are between things. What is this barrier separating, and what
write barrier does it pair with? As far as I can tell cur_tx is just
a value in memory, and accesses are under ->hw_lock, so there should
be no ordering concerns.

> +	rmb();
> +	status = bdp->cbd_sc;
> +
> +	if (status & BD_ENET_TX_READY) {
> +		/* All transmit buffers are full. Bail out.
> +		 * This should not happen, since dev->tbusy should be set.
> +		 */
> +		netif_stop_queue(dev);
> +		dev_err(&fep->pdev->dev, "%s: tx queue full!\n", dev->name);

This needs to be rate limited, we don't want to flood the logs in case
there's a bug.

Also at a glance it seems like you have one fep for multiple netdevs.
So stopping one netdev's Tx queue when fep fills up will not stop the
other ports from pushing frames, right?

> +		spin_unlock(&fep->hw_lock);
> +		return NETDEV_TX_BUSY;
> +	}
> +
> +	/* Clear all of the status flags */
> +	status &= ~BD_ENET_TX_STATS;
> +
> +	/* Set buffer length and buffer pointer */
> +	bufaddr = skb->data;
> +	bdp->cbd_datlen = skb->len;
> +
> +	/* On some FEC implementations data must be aligned on
> +	 * 4-byte boundaries. Use bounce buffers to copy data
> +	 * and get it aligned.
> +	 */
> +	if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {

I think you should add

	if ... ||
	    fep->quirks & FEC_QUIRK_SWAP_FRAME)

here. You can't modify skb->data without calling skb_cow_data()
but you already have buffers allocated so can as well use them.

> +		unsigned int index;
> +
> +		index = bdp - fep->tx_bd_base;
> +		memcpy(fep->tx_bounce[index],
> +		       (void *)skb->data, skb->len);

this fits on one 80 char line BTW, quite easily:

		memcpy(fep->tx_bounce[index], (void *)skb->data, skb->len);

Also the cast to void * is not necessary in C.

> +		bufaddr = fep->tx_bounce[index];
> +	}
> +
> +	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> +		swap_buffer(bufaddr, skb->len);
> +
> +	/* Save skb pointer. */
> +	fep->tx_skbuff[fep->skb_cur] = skb;
> +
> +	fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK;

Not sure if this is buggy, but maybe delay updating things until the
mapping succeeds? Fewer things to unwind.

> +	/* Push the data cache so the CPM does not get stale memory
> +	 * data.
> +	 */
> +	bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr,
> +					  MTIP_SWITCH_TX_FRSIZE,
> +					  DMA_TO_DEVICE);
> +	if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
> +		dev_err(&fep->pdev->dev,
> +			"Failed to map descriptor tx buffer\n");
> +		dev->stats.tx_errors++;
> +		dev->stats.tx_dropped++;

dropped and errors are two different counters
I'd stick to dropped

> +		dev_kfree_skb_any(skb);
> +		goto err;
> +	}
> +
> +	/* Send it on its way. Tell FEC it's ready, interrupt when done,
> +	 * it's the last BD of the frame, and to put the CRC on the end.
> +	 */
> +
> +	status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR
> +		   | BD_ENET_TX_LAST | BD_ENET_TX_TC);

The | goes at the end of the previous line, start of new line adjusts
to the opening brackets.

> +
> +	/* Synchronize all descriptor writes */
> +	wmb();
> +	bdp->cbd_sc = status;
> +
> +	netif_trans_update(dev);

Is this call necessary?

> +	skb_tx_timestamp(skb);
> +
> +	/* Trigger transmission start */
> +	writel(MCF_ESW_TDAR_X_DES_ACTIVE, fep->hwp + ESW_TDAR);
> +
> +	dev->stats.tx_bytes += skb->len;
> +	/* If this was the last BD in the ring,
> +	 * start at the beginning again.
> +	 */
> +	if (status & BD_ENET_TX_WRAP)
> +		bdp = fep->tx_bd_base;
> +	else
> +		bdp++;
> +
> +	if (bdp == fep->dirty_tx) {
> +		fep->tx_full = 1;
> +		netif_stop_queue(dev);
> +	}
> +
> +	fep->cur_tx = bdp;
> + err:
> +	spin_unlock(&fep->hw_lock);
> +
> +	return NETDEV_TX_OK;
> +}
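Pulling the review points together, the Tx path being asked for could look roughly like the fragment below. This is only a sketch against the identifiers used in the patch (fep, bdp, bufaddr, MTIP_ALIGNMENT, ...), not the actual follow-up code; in particular, net_ratelimit() is just one of several possible ways to rate-limit the error message.

	if (status & BD_ENET_TX_READY) {
		netif_stop_queue(dev);
		/* Rate-limit: a stuck ring must not flood the log */
		if (net_ratelimit())
			dev_err(&fep->pdev->dev, "%s: tx queue full!\n",
				dev->name);
		spin_unlock(&fep->hw_lock);
		return NETDEV_TX_BUSY;
	}

	/* Bounce when unaligned OR when the frame must be byte-swapped:
	 * skb->data may not be modified in place without skb_cow_data(),
	 * and the bounce buffers are already allocated anyway.
	 */
	if ((unsigned long)bufaddr & MTIP_ALIGNMENT ||
	    fep->quirks & FEC_QUIRK_SWAP_FRAME) {
		unsigned int index = bdp - fep->tx_bd_base;

		memcpy(fep->tx_bounce[index], skb->data, skb->len);
		bufaddr = fep->tx_bounce[index];
	}

	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
		swap_buffer(bufaddr, skb->len);

	bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr,
					  MTIP_SWITCH_TX_FRSIZE,
					  DMA_TO_DEVICE);
	if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
		dev->stats.tx_dropped++;	/* dropped, not errors */
		dev_kfree_skb_any(skb);
		goto err;
	}

	/* Only now commit the ring state: with the skb pointer and
	 * skb_cur updated after the mapping succeeds, a mapping failure
	 * leaves nothing to unwind.
	 */
	fep->tx_skbuff[fep->skb_cur] = skb;
	fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK;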
Hi Jakub,

> On Wed, 16 Jul 2025 23:47:25 +0200 Lukasz Majewski wrote:
> > +static netdev_tx_t mtip_start_xmit_port(struct sk_buff *skb,
> > +					struct net_device *dev, int port)
> > +{
> > +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> > +	struct switch_enet_private *fep = priv->fep;
> > +	unsigned short status;
> > +	struct cbd_t *bdp;
> > +	void *bufaddr;
> > +
> > +	spin_lock(&fep->hw_lock);
>
> I see some inconsistencies in how you take this lock.
> Bunch of bare spin_lock() calls from BH context, but there's also
> a _irqsave() call in mtip_adjust_link().

In the legacy NXP (Freescale) code for this IP block (i.e. the MTIP
switch) the recommended way to re-setup it, when link or duplex
changes, is to reset and reconfigure it.

It requires setting up interrupts as well... In that situation, IMHO
disabling system interrupts is required to avoid some undefined
behaviour.

> Please align to the strictest
> context (not sure if the irqsave is actually needed, at a glance, IOW
> whether the lock is taken from an IRQ)

The spin_lock() in the xmit path is similar to what is done in
fec_main.c. As this switch uses a single uDMA for both ports, and
there is no support for (or need of) multiple queues, it can be
omitted.

> > +	if (!fep->link[0] && !fep->link[1]) {
> > +		/* Link is down or autonegotiation is in progress. */
> > +		netif_stop_queue(dev);
> > +		spin_unlock(&fep->hw_lock);
> > +		return NETDEV_TX_BUSY;
> > +	}
> > +
> > +	/* Fill in a Tx ring entry */
> > +	bdp = fep->cur_tx;
> > +
> > +	/* Read memory barrier before reading the current transmit
> > descriptor */
>
> Barriers are between things. What is this barrier separating, and what
> write barrier does it pair with? As far as I can tell cur_tx is just
> a value in memory, and accesses are under ->hw_lock, so there should
> be no ordering concerns.

The bdp is the uDMA descriptor (memory allocated in the coherent DMA
area). It is used by the uDMA when data is transferred to the MTIP
switch's internal buffer.

The bdp->cbd_sc is a half word, which is modified by the uDMA engine
to indicate whether there are errors or the transfer has ended.

The rmb() shall improve robustness - it ensures that the status
corresponds to what was set by the uDMA. On the other hand, the DMA
coherent allocation shall guarantee this as well.

fec_main.c places rmb() in similar spots, so I followed that approach.

> > +	rmb();
> > +	status = bdp->cbd_sc;
> > +
> > +	if (status & BD_ENET_TX_READY) {
> > +		/* All transmit buffers are full. Bail out.
> > +		 * This should not happen, since dev->tbusy should be set.
> > +		 */
> > +		netif_stop_queue(dev);
> > +		dev_err(&fep->pdev->dev, "%s: tx queue full!\n", dev->name);
>
> This needs to be rate limited, we don't want to flood the logs in case
> there's a bug.

+1

> Also at a glance it seems like you have one fep for multiple netdevs.

Yes.

> So stopping one netdev's Tx queue when fep fills up will not stop the
> other ports from pushing frames, right?

This is a bit more complicated...

Other solutions - like cpsw_new - are conceptually simple; there are
two DMAs to two separate eth IP blocks. During startup two separate
devices are created. When one wants to enable bridging (i.e. start
in-hw offloading) - just a single bit is set and ... that's it.

With vf610 / imx287 and MTIP it is a bit different (imx287 is even
worse, as the second ETH interface has incomplete functionality by
design).

When the switch is not active - you have two uDMA ports to two ENET IP
blocks. Full separation. That is what is done with the fec_main.c
driver.

When you enable the MTIP switch - then you have just a single uDMA0
active for "both" ports. In fact you "bridge" two ports into a single
one - that is why the Freescale/NXP driver (for 2.6.y) had just eth0
to "model" the bridged interfaces. That was "simpler" (PHY management
was done in the driver as well).

Now, in this driver, we do have two network devices, which are
"bridged" (so there is br0). And of course there must be separation
between lan0/1 when this driver is used but a bridge is not (yet)
created. This works :-)

So I do have 2x netdevs (handled by a single uDMA0) + 2 PHYs + br0 +
NAPI + switchdev (to avoid broadcast frame storms) + {R}STP + FDB
(WIP).

Just pure fun :-) to model it all ... and make all maintainers
happy :-)

> > +		spin_unlock(&fep->hw_lock);
> > +		return NETDEV_TX_BUSY;
> > +	}
> > +
> > +	/* Clear all of the status flags */
> > +	status &= ~BD_ENET_TX_STATS;
> > +
> > +	/* Set buffer length and buffer pointer */
> > +	bufaddr = skb->data;
> > +	bdp->cbd_datlen = skb->len;
> > +
> > +	/* On some FEC implementations data must be aligned on
> > +	 * 4-byte boundaries. Use bounce buffers to copy data
> > +	 * and get it aligned.
> > +	 */
> > +	if ((unsigned long)bufaddr & MTIP_ALIGNMENT) {
>
> I think you should add
>
> 	if ... ||
> 	    fep->quirks & FEC_QUIRK_SWAP_FRAME)
>
> here. You can't modify skb->data without calling skb_cow_data()
> but you already have buffers allocated so can as well use them.

The vf610 doesn't need the frame to be swapped, but it has the
alignment requirements as well.

I would keep things as they are now, as that improves readability.

Please keep in mind that this version only supports imx287, but the
plan is to add vf610 as well (to be more specific - this driver also
works on vf610, but I plan to add those patches after this one is
accepted and pulled).

> > +		unsigned int index;
> > +
> > +		index = bdp - fep->tx_bd_base;
> > +		memcpy(fep->tx_bounce[index],
> > +		       (void *)skb->data, skb->len);
>
> this fits on one 80 char line BTW, quite easily:
>
> 		memcpy(fep->tx_bounce[index], (void *)skb->data, skb->len);
>
> Also the cast to void * is not necessary in C.

+1

> > +		bufaddr = fep->tx_bounce[index];
> > +	}
> > +
> > +	if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> > +		swap_buffer(bufaddr, skb->len);
> > +
> > +	/* Save skb pointer. */
> > +	fep->tx_skbuff[fep->skb_cur] = skb;
> > +
> > +	fep->skb_cur = (fep->skb_cur + 1) & TX_RING_MOD_MASK;
>
> Not sure if this is buggy, but maybe delay updating things until the
> mapping succeeds? Fewer things to unwind.

Yes, the skb storage as well as the ring buffer modification can be
done after the DMA mapping code.

> > +	/* Push the data cache so the CPM does not get stale memory
> > +	 * data.
> > +	 */
> > +	bdp->cbd_bufaddr = dma_map_single(&fep->pdev->dev, bufaddr,
> > +					  MTIP_SWITCH_TX_FRSIZE,
> > +					  DMA_TO_DEVICE);
> > +	if (unlikely(dma_mapping_error(&fep->pdev->dev, bdp->cbd_bufaddr))) {
> > +		dev_err(&fep->pdev->dev,
> > +			"Failed to map descriptor tx buffer\n");
> > +		dev->stats.tx_errors++;
> > +		dev->stats.tx_dropped++;
>
> dropped and errors are two different counters
> I'd stick to dropped

Ok.

> > +		dev_kfree_skb_any(skb);
> > +		goto err;
> > +	}
> > +
> > +	/* Send it on its way. Tell FEC it's ready, interrupt when done,
> > +	 * it's the last BD of the frame, and to put the CRC on the end.
> > +	 */
> > +
> > +	status |= (BD_ENET_TX_READY | BD_ENET_TX_INTR
> > +		   | BD_ENET_TX_LAST | BD_ENET_TX_TC);
>
> The | goes at the end of the previous line, start of new line adjusts
> to the opening brackets.

I've refactored it.

> > +
> > +	/* Synchronize all descriptor writes */
> > +	wmb();
> > +	bdp->cbd_sc = status;
> > +
> > +	netif_trans_update(dev);
>
> Is this call necessary?

I've added it when I was forward porting the old driver. It can be
removed.

> > +	skb_tx_timestamp(skb);
> > +
> > +	/* Trigger transmission start */
> > +	writel(MCF_ESW_TDAR_X_DES_ACTIVE, fep->hwp + ESW_TDAR);
> > +
> > +	dev->stats.tx_bytes += skb->len;
> > +	/* If this was the last BD in the ring,
> > +	 * start at the beginning again.
> > +	 */
> > +	if (status & BD_ENET_TX_WRAP)
> > +		bdp = fep->tx_bd_base;
> > +	else
> > +		bdp++;
> > +
> > +	if (bdp == fep->dirty_tx) {
> > +		fep->tx_full = 1;
> > +		netif_stop_queue(dev);
> > +	}
> > +
> > +	fep->cur_tx = bdp;
> > + err:
> > +	spin_unlock(&fep->hw_lock);
> > +
> > +	return NETDEV_TX_OK;
> > +}

Thanks for the feedback.

Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH, Managing Director: Johanna Denk, Tabea Lutz
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-59 Fax: (+49)-8142-66989-80 Email: lukma@denx.de
Hi Jakub, Paolo,

Do you have more comments and questions regarding this driver after my
explanation?

Shall I do something more?

Thanks in advance for your feedback.

[...]

Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH, Managing Director: Johanna Denk, Tabea Lutz
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-59 Fax: (+49)-8142-66989-80 Email: lukma@denx.de
On Wed, 23 Jul 2025 22:05:17 +0200 Lukasz Majewski wrote:
> Do you have more comments and questions regarding this driver after my
> explanation?
>
> Shall I do something more?

Addressing the comments may be more useful than explaining them away.
Hi Jakub,

> On Wed, 23 Jul 2025 22:05:17 +0200 Lukasz Majewski wrote:
> > Do you have more comments and questions regarding this driver after
> > my explanation?
> >
> > Shall I do something more?
>
> Addressing the comments may be more useful than explaining them away.

As I've already described the issues I'm trying to solve (and some
design decisions), I will correct them (where applicable) and prepare
the next version of the code.

Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH, Managing Director: Johanna Denk, Tabea Lutz
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-59 Fax: (+49)-8142-66989-80 Email: lukma@denx.de