[net-next v13 06/11] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver

[net-next v13 06/11] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Lukasz Majewski 3 months, 2 weeks ago
This patch provides the mtip_switch_tx() and mtip_switch_rx()
functions for the MTIP L2 switch.

Signed-off-by: Lukasz Majewski <lukma@denx.de>
---
Changes for v13:
- New patch - created by excluding some code from the large (i.e. v12
  and earlier) MTIP driver
---
 .../net/ethernet/freescale/mtipsw/mtipl2sw.c  | 252 ++++++++++++++++++
 1 file changed, 252 insertions(+)

diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
index 813cd39d6d56..a4e38e0d773e 100644
--- a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
+++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
@@ -228,6 +228,39 @@ struct mtip_port_info *mtip_portinfofifo_read(struct switch_enet_private *fep)
 	return info;
 }
 
+static void mtip_atable_get_entry_port_number(struct switch_enet_private *fep,
+					      unsigned char *mac_addr, u8 *port)
+{
+	int block_index, block_index_end, entry;
+	u32 mac_addr_lo, mac_addr_hi;
+	u32 read_lo, read_hi;
+
+	mac_addr_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] << 16) |
+			    (mac_addr[1] << 8) | mac_addr[0]);
+	mac_addr_hi = (u32)((mac_addr[5] << 8) | (mac_addr[4]));
+
+	block_index = GET_BLOCK_PTR(crc8_calc(mac_addr));
+	block_index_end = block_index + ATABLE_ENTRY_PER_SLOT;
+
+	/* now search all the entries in the selected block */
+	for (entry = block_index; entry < block_index_end; entry++) {
+		mtip_read_atable(fep, entry, &read_lo, &read_hi);
+		*port = MTIP_PORT_FORWARDING_INIT;
+
+		if (read_lo == mac_addr_lo &&
+		    ((read_hi & 0x0000FFFF) ==
+		     (mac_addr_hi & 0x0000FFFF))) {
+			/* found the correct address */
+			if ((read_hi & (1 << 16)) && (!(read_hi & (1 << 17))))
+				*port = FIELD_GET(AT_PORT_MASK, read_hi);
+			break;
+		}
+	}
+
+	dev_dbg(&fep->pdev->dev, "%s: MAC: %pM PORT: 0x%x\n", __func__,
+		mac_addr, *port);
+}
+
 /* Clear complete MAC Look Up Table */
 void mtip_clear_atable(struct switch_enet_private *fep)
 {
@@ -820,10 +853,229 @@ static irqreturn_t mtip_interrupt(int irq, void *ptr_fep)
 
 static void mtip_switch_tx(struct net_device *dev)
 {
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	unsigned short status;
+	struct sk_buff *skb;
+	unsigned long flags;
+	struct cbd_t *bdp;
+
+	spin_lock_irqsave(&fep->hw_lock, flags);
+	bdp = fep->dirty_tx;
+
+	while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) {
+		if (bdp == fep->cur_tx && fep->tx_full == 0)
+			break;
+
+		dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
+				 MTIP_SWITCH_TX_FRSIZE, DMA_TO_DEVICE);
+		bdp->cbd_bufaddr = 0;
+		skb = fep->tx_skbuff[fep->skb_dirty];
+		/* Check for errors */
+		if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC |
+				   BD_ENET_TX_RL | BD_ENET_TX_UN |
+				   BD_ENET_TX_CSL)) {
+			dev->stats.tx_errors++;
+			if (status & BD_ENET_TX_HB)  /* No heartbeat */
+				dev->stats.tx_heartbeat_errors++;
+			if (status & BD_ENET_TX_LC)  /* Late collision */
+				dev->stats.tx_window_errors++;
+			if (status & BD_ENET_TX_RL)  /* Retrans limit */
+				dev->stats.tx_aborted_errors++;
+			if (status & BD_ENET_TX_UN)  /* Underrun */
+				dev->stats.tx_fifo_errors++;
+			if (status & BD_ENET_TX_CSL) /* Carrier lost */
+				dev->stats.tx_carrier_errors++;
+		} else {
+			dev->stats.tx_packets++;
+		}
+
+		if (status & BD_ENET_TX_READY)
+			dev_err(&fep->pdev->dev,
+				"Enet xmit interrupt and TX_READY.\n");
+
+		/* Deferred means some collisions occurred during transmit,
+		 * but we eventually sent the packet OK.
+		 */
+		if (status & BD_ENET_TX_DEF)
+			dev->stats.collisions++;
+
+		/* Free the sk buffer associated with this last transmit */
+		dev_consume_skb_irq(skb);
+		fep->tx_skbuff[fep->skb_dirty] = NULL;
+		fep->skb_dirty = (fep->skb_dirty + 1) & TX_RING_MOD_MASK;
+
+		/* Update pointer to next buffer descriptor to be transmitted */
+		if (status & BD_ENET_TX_WRAP)
+			bdp = fep->tx_bd_base;
+		else
+			bdp++;
+
+		/* Since we have freed up a buffer, the ring is no longer
+		 * full.
+		 */
+		if (fep->tx_full) {
+			fep->tx_full = 0;
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+		}
+	}
+	fep->dirty_tx = bdp;
+	spin_unlock_irqrestore(&fep->hw_lock, flags);
 }
 
+/* During a receive, the cur_rx points to the current incoming buffer.
+ * When we update through the ring, if the next incoming buffer has
+ * not been given to the system, we just set the empty indicator,
+ * effectively tossing the packet.
+ */
 static int mtip_switch_rx(struct net_device *dev, int budget, int *port)
 {
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	u8 *data, rx_port = MTIP_PORT_FORWARDING_INIT;
+	struct switch_enet_private *fep = priv->fep;
+	unsigned short status, pkt_len;
+	struct net_device *pndev;
+	struct ethhdr *eth_hdr;
+	int pkt_received = 0;
+	struct sk_buff *skb;
+	struct cbd_t *bdp;
+	struct page *page;
+
+	spin_lock_bh(&fep->hw_lock);
+
+	/* First, grab all of the stats for the incoming packet.
+	 * These get messed up if we get called due to a busy condition.
+	 */
+	bdp = fep->cur_rx;
+
+	while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
+		if (pkt_received >= budget)
+			break;
+
+		pkt_received++;
+		/* Since we have allocated space to hold a complete frame,
+		 * the last indicator should be set.
+		 */
+		if ((status & BD_ENET_RX_LAST) == 0)
+			dev_warn_ratelimited(&dev->dev,
+					     "SWITCH ENET: rcv is not +last\n");
+
+		if (!fep->usage_count)
+			goto rx_processing_done;
+
+		/* Check for errors. */
+		if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO |
+			      BD_ENET_RX_CR | BD_ENET_RX_OV)) {
+			dev->stats.rx_errors++;
+			if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH)) {
+				/* Frame too long or too short. */
+				dev->stats.rx_length_errors++;
+			}
+			if (status & BD_ENET_RX_NO)	/* Frame alignment */
+				dev->stats.rx_frame_errors++;
+			if (status & BD_ENET_RX_CR)	/* CRC Error */
+				dev->stats.rx_crc_errors++;
+			if (status & BD_ENET_RX_OV)	/* FIFO overrun */
+				dev->stats.rx_fifo_errors++;
+		}
+
+		/* Report late collisions as a frame error.
+		 * On this error, the BD is closed, but we don't know what we
+		 * have in the buffer.  So, just drop this frame on the floor.
+		 */
+		if (status & BD_ENET_RX_CL) {
+			dev->stats.rx_errors++;
+			dev->stats.rx_frame_errors++;
+			goto rx_processing_done;
+		}
+
+		/* Get correct RX page */
+		page = fep->page[bdp - fep->rx_bd_base];
+		/* Process the incoming frame */
+		pkt_len = bdp->cbd_datlen;
+		data = (__u8 *)__va(bdp->cbd_bufaddr);
+
+		dma_sync_single_for_cpu(&fep->pdev->dev, bdp->cbd_bufaddr,
+					pkt_len, DMA_FROM_DEVICE);
+		prefetch(page_address(page));
+
+		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
+			swap_buffer(data, pkt_len);
+
+		if (data) {
+			eth_hdr = (struct ethhdr *)data;
+			mtip_atable_get_entry_port_number(fep,
+							  eth_hdr->h_source,
+							  &rx_port);
+			if (rx_port == MTIP_PORT_FORWARDING_INIT)
+				mtip_atable_dynamicms_learn_migration(fep,
+								      fep->curr_time,
+								      eth_hdr->h_source,
+								      &rx_port);
+		}
+
+		if ((rx_port == 1 || rx_port == 2) && fep->ndev[rx_port - 1])
+			pndev = fep->ndev[rx_port - 1];
+		else
+			pndev = dev;
+
+		*port = rx_port;
+
+		/* This does 16 byte alignment, exactly what we need.
+		 * The packet length includes FCS, but we don't want to
+		 * include that when passing upstream as it messes up
+		 * bridging applications.
+		 */
+		skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
+		if (unlikely(!skb)) {
+			dev_dbg(&fep->pdev->dev,
+				"%s: Memory squeeze, dropping packet.\n",
+				pndev->name);
+			page_pool_recycle_direct(fep->page_pool, page);
+			pndev->stats.rx_dropped++;
+			goto err_mem;
+		} else {
+			skb_reserve(skb, NET_IP_ALIGN);
+			skb_put(skb, pkt_len);      /* Make room */
+			skb_copy_to_linear_data(skb, data, pkt_len);
+			skb->protocol = eth_type_trans(skb, pndev);
+			napi_gro_receive(&fep->napi, skb);
+		}
+
+		pndev->stats.rx_packets++;
+		pndev->stats.rx_bytes += pkt_len;
+
+ rx_processing_done:
+		/* Clear the status flags for this buffer */
+		status &= ~BD_ENET_RX_STATS;
+
+		/* Mark the buffer empty */
+		status |= BD_ENET_RX_EMPTY;
+		/* Make sure that updates to the descriptor are performed */
+		wmb();
+		bdp->cbd_sc = status;
+
+		/* Update BD pointer to next entry */
+		if (status & BD_ENET_RX_WRAP)
+			bdp = fep->rx_bd_base;
+		else
+			bdp++;
+
+		/* Doing this here will keep the FEC running while we process
+		 * incoming frames.  On a heavily loaded network, we should be
+		 * able to keep up at the expense of system resources.
+		 */
+		writel(MCF_ESW_RDAR_R_DES_ACTIVE, fep->hwp + ESW_RDAR);
+	} /* while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) */
+
+	fep->cur_rx = bdp;
+	spin_unlock_bh(&fep->hw_lock);
+
+	return pkt_received;
+
+ err_mem:
+	spin_unlock_bh(&fep->hw_lock);
 	return -ENOMEM;
 }
 
-- 
2.39.5
Re: [net-next v13 06/11] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Paolo Abeni 3 months, 2 weeks ago
On 6/22/25 11:37 AM, Lukasz Majewski wrote:
> This patch provides the mtip_switch_tx() and mtip_switch_rx()
> functions for the MTIP L2 switch.
> 
> Signed-off-by: Lukasz Majewski <lukma@denx.de>
> ---
> Changes for v13:
> - New patch - created by excluding some code from the large (i.e. v12
>   and earlier) MTIP driver
> ---
>  .../net/ethernet/freescale/mtipsw/mtipl2sw.c  | 252 ++++++++++++++++++
>  1 file changed, 252 insertions(+)
> 
> diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
> index 813cd39d6d56..a4e38e0d773e 100644
> --- a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
> +++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
> @@ -228,6 +228,39 @@ struct mtip_port_info *mtip_portinfofifo_read(struct switch_enet_private *fep)
>  	return info;
>  }
>  
> +static void mtip_atable_get_entry_port_number(struct switch_enet_private *fep,
> +					      unsigned char *mac_addr, u8 *port)
> +{
> +	int block_index, block_index_end, entry;
> +	u32 mac_addr_lo, mac_addr_hi;
> +	u32 read_lo, read_hi;
> +
> +	mac_addr_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] << 16) |
> +			    (mac_addr[1] << 8) | mac_addr[0]);
> +	mac_addr_hi = (u32)((mac_addr[5] << 8) | (mac_addr[4]));
> +
> +	block_index = GET_BLOCK_PTR(crc8_calc(mac_addr));
> +	block_index_end = block_index + ATABLE_ENTRY_PER_SLOT;
> +
> +	/* now search all the entries in the selected block */
> +	for (entry = block_index; entry < block_index_end; entry++) {
> +		mtip_read_atable(fep, entry, &read_lo, &read_hi);
> +		*port = MTIP_PORT_FORWARDING_INIT;
> +
> +		if (read_lo == mac_addr_lo &&
> +		    ((read_hi & 0x0000FFFF) ==
> +		     (mac_addr_hi & 0x0000FFFF))) {
> +			/* found the correct address */
> +			if ((read_hi & (1 << 16)) && (!(read_hi & (1 << 17))))
> +				*port = FIELD_GET(AT_PORT_MASK, read_hi);
> +			break;
> +		}
> +	}
> +
> +	dev_dbg(&fep->pdev->dev, "%s: MAC: %pM PORT: 0x%x\n", __func__,
> +		mac_addr, *port);
> +}
> +
>  /* Clear complete MAC Look Up Table */
>  void mtip_clear_atable(struct switch_enet_private *fep)
>  {
> @@ -820,10 +853,229 @@ static irqreturn_t mtip_interrupt(int irq, void *ptr_fep)
>  
>  static void mtip_switch_tx(struct net_device *dev)
>  {
> +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> +	struct switch_enet_private *fep = priv->fep;
> +	unsigned short status;
> +	struct sk_buff *skb;
> +	unsigned long flags;
> +	struct cbd_t *bdp;
> +
> +	spin_lock_irqsave(&fep->hw_lock, flags);

This is called from napi (bh) context, and every other caller is/should
be BH, too. You should use

	spin_lock_bh()

Also please test your patches with CONFIG_LOCKDEP and
CONFIG_DEBUG_SPINLOCK enabled, they will help find this kind of issue.
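
For illustration, a minimal sketch of the suggested change, reusing
the names from the patch above (untested):

	spin_lock_bh(&fep->hw_lock);
	bdp = fep->dirty_tx;
	/* ... reclaim completed TX buffer descriptors ... */
	fep->dirty_tx = bdp;
	spin_unlock_bh(&fep->hw_lock);

The 'flags' local and the irqsave/irqrestore pair then go away.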

/P

> +	bdp = fep->dirty_tx;
> +
> +	while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) {
> +		if (bdp == fep->cur_tx && fep->tx_full == 0)
> +			break;
> +
> +		dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
> +				 MTIP_SWITCH_TX_FRSIZE, DMA_TO_DEVICE);
> +		bdp->cbd_bufaddr = 0;
> +		skb = fep->tx_skbuff[fep->skb_dirty];
> +		/* Check for errors */
> +		if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC |
> +				   BD_ENET_TX_RL | BD_ENET_TX_UN |
> +				   BD_ENET_TX_CSL)) {
> +			dev->stats.tx_errors++;
> +			if (status & BD_ENET_TX_HB)  /* No heartbeat */
> +				dev->stats.tx_heartbeat_errors++;
> +			if (status & BD_ENET_TX_LC)  /* Late collision */
> +				dev->stats.tx_window_errors++;
> +			if (status & BD_ENET_TX_RL)  /* Retrans limit */
> +				dev->stats.tx_aborted_errors++;
> +			if (status & BD_ENET_TX_UN)  /* Underrun */
> +				dev->stats.tx_fifo_errors++;
> +			if (status & BD_ENET_TX_CSL) /* Carrier lost */
> +				dev->stats.tx_carrier_errors++;
> +		} else {
> +			dev->stats.tx_packets++;
> +		}
> +
> +		if (status & BD_ENET_TX_READY)
> +			dev_err(&fep->pdev->dev,
> +				"Enet xmit interrupt and TX_READY.\n");
> +
> +		/* Deferred means some collisions occurred during transmit,
> +		 * but we eventually sent the packet OK.
> +		 */
> +		if (status & BD_ENET_TX_DEF)
> +			dev->stats.collisions++;
> +
> +		/* Free the sk buffer associated with this last transmit */
> +		dev_consume_skb_irq(skb);
> +		fep->tx_skbuff[fep->skb_dirty] = NULL;
> +		fep->skb_dirty = (fep->skb_dirty + 1) & TX_RING_MOD_MASK;
> +
> +		/* Update pointer to next buffer descriptor to be transmitted */
> +		if (status & BD_ENET_TX_WRAP)
> +			bdp = fep->tx_bd_base;
> +		else
> +			bdp++;
> +
> +		/* Since we have freed up a buffer, the ring is no longer
> +		 * full.
> +		 */
> +		if (fep->tx_full) {
> +			fep->tx_full = 0;
> +			if (netif_queue_stopped(dev))
> +				netif_wake_queue(dev);
> +		}
> +	}
> +	fep->dirty_tx = bdp;
> +	spin_unlock_irqrestore(&fep->hw_lock, flags);
>  }
>  
> +/* During a receive, the cur_rx points to the current incoming buffer.
> + * When we update through the ring, if the next incoming buffer has
> + * not been given to the system, we just set the empty indicator,
> + * effectively tossing the packet.
> + */
>  static int mtip_switch_rx(struct net_device *dev, int budget, int *port)
>  {
> +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> +	u8 *data, rx_port = MTIP_PORT_FORWARDING_INIT;
> +	struct switch_enet_private *fep = priv->fep;
> +	unsigned short status, pkt_len;
> +	struct net_device *pndev;
> +	struct ethhdr *eth_hdr;
> +	int pkt_received = 0;
> +	struct sk_buff *skb;
> +	struct cbd_t *bdp;
> +	struct page *page;
> +
> +	spin_lock_bh(&fep->hw_lock);
> +
> +	/* First, grab all of the stats for the incoming packet.
> +	 * These get messed up if we get called due to a busy condition.
> +	 */
> +	bdp = fep->cur_rx;
> +
> +	while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
> +		if (pkt_received >= budget)
> +			break;
> +
> +		pkt_received++;
> +		/* Since we have allocated space to hold a complete frame,
> +		 * the last indicator should be set.
> +		 */
> +		if ((status & BD_ENET_RX_LAST) == 0)
> +			dev_warn_ratelimited(&dev->dev,
> +					     "SWITCH ENET: rcv is not +last\n");

Probably you want to break the loop when this condition is hit.
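
E.g. (untested sketch):

	if ((status & BD_ENET_RX_LAST) == 0) {
		dev_warn_ratelimited(&dev->dev,
				     "SWITCH ENET: rcv is not +last\n");
		break;
	}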

> +
> +		if (!fep->usage_count)
> +			goto rx_processing_done;
> +
> +		/* Check for errors. */
> +		if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO |
> +			      BD_ENET_RX_CR | BD_ENET_RX_OV)) {
> +			dev->stats.rx_errors++;
> +			if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH)) {
> +				/* Frame too long or too short. */
> +				dev->stats.rx_length_errors++;
> +			}
> +			if (status & BD_ENET_RX_NO)	/* Frame alignment */
> +				dev->stats.rx_frame_errors++;
> +			if (status & BD_ENET_RX_CR)	/* CRC Error */
> +				dev->stats.rx_crc_errors++;
> +			if (status & BD_ENET_RX_OV)	/* FIFO overrun */
> +				dev->stats.rx_fifo_errors++;
> +		}
> +
> +		/* Report late collisions as a frame error.
> +		 * On this error, the BD is closed, but we don't know what we
> +		 * have in the buffer.  So, just drop this frame on the floor.
> +		 */
> +		if (status & BD_ENET_RX_CL) {
> +			dev->stats.rx_errors++;
> +			dev->stats.rx_frame_errors++;
> +			goto rx_processing_done;
> +		}
> +
> +		/* Get correct RX page */
> +		page = fep->page[bdp - fep->rx_bd_base];
> +		/* Process the incoming frame */
> +		pkt_len = bdp->cbd_datlen;
> +		data = (__u8 *)__va(bdp->cbd_bufaddr);
> +
> +		dma_sync_single_for_cpu(&fep->pdev->dev, bdp->cbd_bufaddr,
> +					pkt_len, DMA_FROM_DEVICE);
> +		prefetch(page_address(page));

Use net_prefetch() instead
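
i.e., assuming the page data is what should be warmed up (sketch):

	net_prefetch(page_address(page));

net_prefetch() also pulls in a second cache line on systems with
cache lines smaller than 128 bytes, which suits Ethernet headers
better than a bare prefetch().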

> +
> +		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> +			swap_buffer(data, pkt_len);
> +
> +		if (data) {
> +			eth_hdr = (struct ethhdr *)data;
> +			mtip_atable_get_entry_port_number(fep,
> +							  eth_hdr->h_source,
> +							  &rx_port);
> +			if (rx_port == MTIP_PORT_FORWARDING_INIT)
> +				mtip_atable_dynamicms_learn_migration(fep,
> +								      fep->curr_time,
> +								      eth_hdr->h_source,
> +								      &rx_port);
> +		}
> +
> +		if ((rx_port == 1 || rx_port == 2) && fep->ndev[rx_port - 1])
> +			pndev = fep->ndev[rx_port - 1];
> +		else
> +			pndev = dev;
> +
> +		*port = rx_port;

Do I read correctly that several network devices use the same napi
instance? That will break napi assumptions and will incorrectly allow
napi to merge packets coming from different devices.

> +
> +		/* This does 16 byte alignment, exactly what we need.
> +		 * The packet length includes FCS, but we don't want to
> +		 * include that when passing upstream as it messes up
> +		 * bridging applications.
> +		 */
> +		skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
> +		if (unlikely(!skb)) {
> +			dev_dbg(&fep->pdev->dev,
> +				"%s: Memory squeeze, dropping packet.\n",
> +				pndev->name);
> +			page_pool_recycle_direct(fep->page_pool, page);
> +			pndev->stats.rx_dropped++;
> +			goto err_mem;
> +		} else {

No need for the else statement above.
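
i.e. the success path can simply follow the error branch (sketch of
the same code from the patch, restructured):

	skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
	if (unlikely(!skb)) {
		dev_dbg(&fep->pdev->dev,
			"%s: Memory squeeze, dropping packet.\n",
			pndev->name);
		page_pool_recycle_direct(fep->page_pool, page);
		pndev->stats.rx_dropped++;
		goto err_mem;
	}
	skb_reserve(skb, NET_IP_ALIGN);
	skb_put(skb, pkt_len);
	skb_copy_to_linear_data(skb, data, pkt_len);
	skb->protocol = eth_type_trans(skb, pndev);
	napi_gro_receive(&fep->napi, skb);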

/P
Re: [net-next v13 06/11] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Lukasz Majewski 3 months, 1 week ago
Hi Paolo,

> >  static void mtip_switch_tx(struct net_device *dev)
> >  {
> > +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> > +	struct switch_enet_private *fep = priv->fep;
> > +	unsigned short status;
> > +	struct sk_buff *skb;
> > +	unsigned long flags;
> > +	struct cbd_t *bdp;
> > +
> > +	spin_lock_irqsave(&fep->hw_lock, flags);  
> 
> This is called from napi (bh) context, and every other caller
> is/should be BH, too. You should use
> 
> 	spin_lock_bh()

I've double-checked the spin locking in the driver - I've also
consulted fec_main.c.

It looks like mtip_switch_rx() and fec_enet_rx() do not use explicit
locks and rely on NAPI locking.

On the other hand, fec_enet_tx() (and the corresponding MTIP variant)
uses spin_lock(), not the _bh() variant.
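
Applied to the MTIP TX path, that pattern would look roughly like
this (untested sketch, based on my reading of fec_main.c):

	/* TX completion runs in softirq context, so a plain
	 * spin_lock() pairs with the other BH-context users.
	 */
	spin_lock(&fep->hw_lock);
	bdp = fep->dirty_tx;
	/* ... reclaim completed TX buffer descriptors ... */
	fep->dirty_tx = bdp;
	spin_unlock(&fep->hw_lock);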

> 
> Also please test your patches with CONFIG_LOCKDEP and
> CONFIG_DEBUG_SPINLOCK enabled, they will help find this kind of
> issue.

This was enabled by default. After changing all locks to _bh(),
deadlocks were observed.

In the MTIP driver (due to this HW IP block) there are some locks
which must disable interrupts:

1. One is taken when mtip_adjust_link() is called - as it is the same
for both switch ports. Moreover, in some use cases it is required
that the switch IP block is reset.

2. The mtip_atable_dynamicms_learn_migration() - it changes the
content of the dynamic switching table. An IRQ from the switch must
not be possible at this time, as it can be called from
mtip_switch_rx() (from NAPI) and from a timer/kthread (at a specified
period).


To sum up:

I'm going to prepare v14 with changes around the timer/kthread
running mtip_atable_dynamicms_learn_migration() and use timestamps
extracted from jiffies.

Locks will be optimized, following the paradigm used in the
fec_main.c driver.
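
For the jiffies based period check I have something like this in mind
(hypothetical names, just to show the idea):

	/* MTIP_ATABLE_PERIOD is a placeholder for the chosen interval */
	if (time_after(jiffies, fep->atable_last_scan + MTIP_ATABLE_PERIOD)) {
		fep->atable_last_scan = jiffies;
		/* migrate / age dynamic ATABLE entries here */
	}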

> 
> /P


Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH,      Managing Director: Erika Unter
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-59 Fax: (+49)-8142-66989-80 Email: lukma@denx.de
Re: [net-next v13 06/11] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Lukasz Majewski 3 months, 2 weeks ago
Hi Paolo,

> On 6/22/25 11:37 AM, Lukasz Majewski wrote:
> > This patch provides the mtip_switch_tx() and mtip_switch_rx()
> > functions for the MTIP L2 switch.
> > 
> > Signed-off-by: Lukasz Majewski <lukma@denx.de>
> > ---
> > Changes for v13:
> > - New patch - created by excluding some code from the large (i.e.
> > v12 and earlier) MTIP driver
> > ---
> >  .../net/ethernet/freescale/mtipsw/mtipl2sw.c  | 252
> > ++++++++++++++++++ 1 file changed, 252 insertions(+)
> > 
> > diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
> > b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c index
> > 813cd39d6d56..a4e38e0d773e 100644 ---
> > a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c +++
> > b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c @@ -228,6
> > +228,39 @@ struct mtip_port_info *mtip_portinfofifo_read(struct
> > switch_enet_private *fep) return info; }
> >  
> > +static void mtip_atable_get_entry_port_number(struct
> > switch_enet_private *fep,
> > +					      unsigned char
> > *mac_addr, u8 *port) +{
> > +	int block_index, block_index_end, entry;
> > +	u32 mac_addr_lo, mac_addr_hi;
> > +	u32 read_lo, read_hi;
> > +
> > +	mac_addr_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] <<
> > 16) |
> > +			    (mac_addr[1] << 8) | mac_addr[0]);
> > +	mac_addr_hi = (u32)((mac_addr[5] << 8) | (mac_addr[4]));
> > +
> > +	block_index = GET_BLOCK_PTR(crc8_calc(mac_addr));
> > +	block_index_end = block_index + ATABLE_ENTRY_PER_SLOT;
> > +
> > +	/* now search all the entries in the selected block */
> > +	for (entry = block_index; entry < block_index_end;
> > entry++) {
> > +		mtip_read_atable(fep, entry, &read_lo, &read_hi);
> > +		*port = MTIP_PORT_FORWARDING_INIT;
> > +
> > +		if (read_lo == mac_addr_lo &&
> > +		    ((read_hi & 0x0000FFFF) ==
> > +		     (mac_addr_hi & 0x0000FFFF))) {
> > +			/* found the correct address */
> > +			if ((read_hi & (1 << 16)) && (!(read_hi &
> > (1 << 17))))
> > +				*port = FIELD_GET(AT_PORT_MASK,
> > read_hi);
> > +			break;
> > +		}
> > +	}
> > +
> > +	dev_dbg(&fep->pdev->dev, "%s: MAC: %pM PORT: 0x%x\n",
> > __func__,
> > +		mac_addr, *port);
> > +}
> > +
> >  /* Clear complete MAC Look Up Table */
> >  void mtip_clear_atable(struct switch_enet_private *fep)
> >  {
> > @@ -820,10 +853,229 @@ static irqreturn_t mtip_interrupt(int irq,
> > void *ptr_fep) 
> >  static void mtip_switch_tx(struct net_device *dev)
> >  {
> > +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> > +	struct switch_enet_private *fep = priv->fep;
> > +	unsigned short status;
> > +	struct sk_buff *skb;
> > +	unsigned long flags;
> > +	struct cbd_t *bdp;
> > +
> > +	spin_lock_irqsave(&fep->hw_lock, flags);  
> 
> This is called from napi (bh) context, and every other caller
> is/should be BH, too. You should use
> 
> 	spin_lock_bh()
> 

Ok.

> Also please test your patches with CONFIG_LOCKDEP and
> CONFIG_DEBUG_SPINLOCK enabled, they will help find this kind of
> issue.

Ok. I will check with lockdep.

> 
> /P
> 
> > +	bdp = fep->dirty_tx;
> > +
> > +	while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) {
> > +		if (bdp == fep->cur_tx && fep->tx_full == 0)
> > +			break;
> > +
> > +		dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
> > +				 MTIP_SWITCH_TX_FRSIZE,
> > DMA_TO_DEVICE);
> > +		bdp->cbd_bufaddr = 0;
> > +		skb = fep->tx_skbuff[fep->skb_dirty];
> > +		/* Check for errors */
> > +		if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC |
> > +				   BD_ENET_TX_RL | BD_ENET_TX_UN |
> > +				   BD_ENET_TX_CSL)) {
> > +			dev->stats.tx_errors++;
> > +			if (status & BD_ENET_TX_HB)  /* No
> > heartbeat */
> > +				dev->stats.tx_heartbeat_errors++;
> > +			if (status & BD_ENET_TX_LC)  /* Late
> > collision */
> > +				dev->stats.tx_window_errors++;
> > +			if (status & BD_ENET_TX_RL)  /* Retrans
> > limit */
> > +				dev->stats.tx_aborted_errors++;
> > +			if (status & BD_ENET_TX_UN)  /* Underrun */
> > +				dev->stats.tx_fifo_errors++;
> > +			if (status & BD_ENET_TX_CSL) /* Carrier
> > lost */
> > +				dev->stats.tx_carrier_errors++;
> > +		} else {
> > +			dev->stats.tx_packets++;
> > +		}
> > +
> > +		if (status & BD_ENET_TX_READY)
> > +			dev_err(&fep->pdev->dev,
> > +				"Enet xmit interrupt and
> > TX_READY.\n"); +
> > +		/* Deferred means some collisions occurred during
> > transmit,
> > +		 * but we eventually sent the packet OK.
> > +		 */
> > +		if (status & BD_ENET_TX_DEF)
> > +			dev->stats.collisions++;
> > +
> > +		/* Free the sk buffer associated with this last
> > transmit */
> > +		dev_consume_skb_irq(skb);
> > +		fep->tx_skbuff[fep->skb_dirty] = NULL;
> > +		fep->skb_dirty = (fep->skb_dirty + 1) &
> > TX_RING_MOD_MASK; +
> > +		/* Update pointer to next buffer descriptor to be
> > transmitted */
> > +		if (status & BD_ENET_TX_WRAP)
> > +			bdp = fep->tx_bd_base;
> > +		else
> > +			bdp++;
> > +
> > +		/* Since we have freed up a buffer, the ring is no
> > longer
> > +		 * full.
> > +		 */
> > +		if (fep->tx_full) {
> > +			fep->tx_full = 0;
> > +			if (netif_queue_stopped(dev))
> > +				netif_wake_queue(dev);
> > +		}
> > +	}
> > +	fep->dirty_tx = bdp;
> > +	spin_unlock_irqrestore(&fep->hw_lock, flags);
> >  }
> >  
> > +/* During a receive, the cur_rx points to the current incoming
> > buffer.
> > + * When we update through the ring, if the next incoming buffer has
> > + * not been given to the system, we just set the empty indicator,
> > + * effectively tossing the packet.
> > + */
> >  static int mtip_switch_rx(struct net_device *dev, int budget, int
> > *port) {
> > +	struct mtip_ndev_priv *priv = netdev_priv(dev);
> > +	u8 *data, rx_port = MTIP_PORT_FORWARDING_INIT;
> > +	struct switch_enet_private *fep = priv->fep;
> > +	unsigned short status, pkt_len;
> > +	struct net_device *pndev;
> > +	struct ethhdr *eth_hdr;
> > +	int pkt_received = 0;
> > +	struct sk_buff *skb;
> > +	struct cbd_t *bdp;
> > +	struct page *page;
> > +
> > +	spin_lock_bh(&fep->hw_lock);
> > +
> > +	/* First, grab all of the stats for the incoming packet.
> > +	 * These get messed up if we get called due to a busy
> > condition.
> > +	 */
> > +	bdp = fep->cur_rx;
> > +
> > +	while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
> > +		if (pkt_received >= budget)
> > +			break;
> > +
> > +		pkt_received++;
> > +		/* Since we have allocated space to hold a
> > complete frame,
> > +		 * the last indicator should be set.
> > +		 */
> > +		if ((status & BD_ENET_RX_LAST) == 0)
> > +			dev_warn_ratelimited(&dev->dev,
> > +					     "SWITCH ENET: rcv is
> > not +last\n");  
> 
> Probably you want to break the loop when this condition is hit.

Ok.

> 
> > +
> > +		if (!fep->usage_count)
> > +			goto rx_processing_done;
> > +
> > +		/* Check for errors. */
> > +		if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH |
> > BD_ENET_RX_NO |
> > +			      BD_ENET_RX_CR | BD_ENET_RX_OV)) {
> > +			dev->stats.rx_errors++;
> > +			if (status & (BD_ENET_RX_LG |
> > BD_ENET_RX_SH)) {
> > +				/* Frame too long or too short. */
> > +				dev->stats.rx_length_errors++;
> > +			}
> > +			if (status & BD_ENET_RX_NO)	/*
> > Frame alignment */
> > +				dev->stats.rx_frame_errors++;
> > +			if (status & BD_ENET_RX_CR)	/* CRC
> > Error */
> > +				dev->stats.rx_crc_errors++;
> > +			if (status & BD_ENET_RX_OV)	/* FIFO
> > overrun */
> > +				dev->stats.rx_fifo_errors++;
> > +		}
> > +
> > +		/* Report late collisions as a frame error.
> > +		 * On this error, the BD is closed, but we don't
> > know what we
> > +		 * have in the buffer.  So, just drop this frame
> > on the floor.
> > +		 */
> > +		if (status & BD_ENET_RX_CL) {
> > +			dev->stats.rx_errors++;
> > +			dev->stats.rx_frame_errors++;
> > +			goto rx_processing_done;
> > +		}
> > +
> > +		/* Get correct RX page */
> > +		page = fep->page[bdp - fep->rx_bd_base];
> > +		/* Process the incoming frame */
> > +		pkt_len = bdp->cbd_datlen;
> > +		data = (__u8 *)__va(bdp->cbd_bufaddr);
> > +
> > +		dma_sync_single_for_cpu(&fep->pdev->dev,
> > bdp->cbd_bufaddr,
> > +					pkt_len, DMA_FROM_DEVICE);
> > +		prefetch(page_address(page));  
> 
> Use net_prefetch() instead
> 

Ok.

> > +
> > +		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> > +			swap_buffer(data, pkt_len);
> > +
> > +		if (data) {
> > +			eth_hdr = (struct ethhdr *)data;
> > +			mtip_atable_get_entry_port_number(fep,
> > +
> > eth_hdr->h_source,
> > +
> > &rx_port);
> > +			if (rx_port == MTIP_PORT_FORWARDING_INIT)
> > +
> > mtip_atable_dynamicms_learn_migration(fep,
> > +
> >    fep->curr_time,
> > +
> >    eth_hdr->h_source,
> > +
> >    &rx_port);
> > +		}
> > +
> > +		if ((rx_port == 1 || rx_port == 2) &&
> > fep->ndev[rx_port - 1])
> > +			pndev = fep->ndev[rx_port - 1];
> > +		else
> > +			pndev = dev;
> > +
> > +		*port = rx_port;  
> 
> Do I read correctly that several network devices use the same napi
> instance? That will break napi assumptions and will incorrectly allow
> napi to merge packets coming from different devices.

Isn't that what the L2 switch is for? :-)


To present the problem with this IP block:

1. I will put aside the "separate" mode, which is only used until the
bridge is created [*]. In this mode we have separate uDMAs for each
port (i.e. uDMA0 and uDMA1).

2. When offloading L2 frame switching to HW we have a single uDMA0
for the bridge (other such devices have separate DMAs for each port -
like TI's cpsw - there the offloading is enabled by setting just one
bit, i.e. that bit enables switching without touching the DMA
configuration/setup).

That is why NAPI is defined as a single instance inside
struct switch_enet_private. It reflects the single uDMA0 used when
switch offloading is activated.
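
Roughly (sketch of the current layout, not a proposal):

	struct switch_enet_private {
		...
		/* Single NAPI instance - it matches the single uDMA0
		 * used when switching is offloaded to HW.
		 */
		struct napi_struct napi;
		struct net_device *ndev[2];	/* lan0 / lan1 */
		...
	};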

> 
> > +
> > +		/* This does 16 byte alignment, exactly what we
> > need.
> > +		 * The packet length includes FCS, but we don't
> > want to
> > +		 * include that when passing upstream as it messes
> > up
> > +		 * bridging applications.
> > +		 */
> > +		skb = netdev_alloc_skb(pndev, pkt_len +
> > NET_IP_ALIGN);
> > +		if (unlikely(!skb)) {
> > +			dev_dbg(&fep->pdev->dev,
> > +				"%s: Memory squeeze, dropping
> > packet.\n",
> > +				pndev->name);
> > +			page_pool_recycle_direct(fep->page_pool,
> > page);
> > +			pndev->stats.rx_dropped++;
> > +			goto err_mem;
> > +		} else {  
> 
> No need for the else statement above.
> 

Ok.

> /P
> 

Note:

[*]:

ip link add name br0 type bridge;
ip link set lan0 master br0; 
ip link set lan1 master br0; 


Best regards,

Lukasz Majewski

--

DENX Software Engineering GmbH,      Managing Director: Erika Unter
HRB 165235 Munich, Office: Kirchenstr.5, D-82194 Groebenzell, Germany
Phone: (+49)-8142-66989-59 Fax: (+49)-8142-66989-80 Email: lukma@denx.de