[net-next v18 5/7] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver

Lukasz Majewski posted 7 patches 4 months ago
There is a newer version of this series
[net-next v18 5/7] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Lukasz Majewski 4 months ago
This patch provides the implementation of the mtip_switch_tx and
mtip_switch_rx functions for the MTIP L2 switch.

Signed-off-by: Lukasz Majewski <lukasz.majewski@mailbox.org>
---
Changes for v13:
- New patch - created by excluding some code from large (i.e. v12 and
  earlier) MTIP driver

Changes for v14:
- Rewrite RX error handling code
- Remove } else { from if (unlikely(!skb)) { condition in mtip_switch_rx()
- Remove locking from RX path (done under NAPI API and similar to fec_main.c
  driver)
- Use net_prefetch() instead of prefetch()

Changes for v15:
- Use page_address() instead of __va()
- Remove the check that data is not NULL, as it cannot be NULL (those buffers
  are guaranteed to be allocated earlier on the RX path).

Changes for v16:
- Disable RX interrupt when in switch RX function
- Set offload_fwd_mark when L2 offloading is enabled (fix broadcast flooding)
- Replace spin_{un}lock() with _bh variant

Changes for v17 - v18:
- None
---
 .../net/ethernet/freescale/mtipsw/mtipl2sw.c  | 240 +++++++++++++++++-
 1 file changed, 239 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
index 39a4997fc8fe..0f7d2f479161 100644
--- a/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
+++ b/drivers/net/ethernet/freescale/mtipsw/mtipl2sw.c
@@ -228,6 +228,39 @@ struct mtip_port_info *mtip_portinfofifo_read(struct switch_enet_private *fep)
 	return info;
 }
 
+static void mtip_atable_get_entry_port_number(struct switch_enet_private *fep,
+					      unsigned char *mac_addr, u8 *port)
+{
+	int block_index, block_index_end, entry;
+	u32 mac_addr_lo, mac_addr_hi;
+	u32 read_lo, read_hi;
+
+	mac_addr_lo = (u32)((mac_addr[3] << 24) | (mac_addr[2] << 16) |
+			    (mac_addr[1] << 8) | mac_addr[0]);
+	mac_addr_hi = (u32)((mac_addr[5] << 8) | (mac_addr[4]));
+
+	block_index = GET_BLOCK_PTR(crc8_calc(mac_addr));
+	block_index_end = block_index + ATABLE_ENTRY_PER_SLOT;
+
+	/* now search all the entries in the selected block */
+	for (entry = block_index; entry < block_index_end; entry++) {
+		mtip_read_atable(fep, entry, &read_lo, &read_hi);
+		*port = MTIP_PORT_FORWARDING_INIT;
+
+		if (read_lo == mac_addr_lo &&
+		    ((read_hi & 0x0000FFFF) ==
+		     (mac_addr_hi & 0x0000FFFF))) {
+			/* found the correct address */
+			if ((read_hi & (1 << 16)) && (!(read_hi & (1 << 17))))
+				*port = FIELD_GET(AT_PORT_MASK, read_hi);
+			break;
+		}
+	}
+
+	dev_dbg(&fep->pdev->dev, "%s: MAC: %pM PORT: 0x%x\n", __func__,
+		mac_addr, *port);
+}
+
 /* Clear complete MAC Look Up Table */
 void mtip_clear_atable(struct switch_enet_private *fep)
 {
@@ -810,11 +843,216 @@ static irqreturn_t mtip_interrupt(int irq, void *ptr_fep)
 
 static void mtip_switch_tx(struct net_device *dev)
 {
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	struct switch_enet_private *fep = priv->fep;
+	unsigned short status;
+	struct sk_buff *skb;
+	struct cbd_t *bdp;
+
+	spin_lock_bh(&fep->hw_lock);
+	bdp = fep->dirty_tx;
+
+	while (((status = bdp->cbd_sc) & BD_ENET_TX_READY) == 0) {
+		if (bdp == fep->cur_tx && fep->tx_full == 0)
+			break;
+
+		dma_unmap_single(&fep->pdev->dev, bdp->cbd_bufaddr,
+				 MTIP_SWITCH_TX_FRSIZE, DMA_TO_DEVICE);
+		bdp->cbd_bufaddr = 0;
+		skb = fep->tx_skbuff[fep->skb_dirty];
+		/* Check for errors */
+		if (status & (BD_ENET_TX_HB | BD_ENET_TX_LC |
+				   BD_ENET_TX_RL | BD_ENET_TX_UN |
+				   BD_ENET_TX_CSL)) {
+			dev->stats.tx_errors++;
+			if (status & BD_ENET_TX_HB)  /* No heartbeat */
+				dev->stats.tx_heartbeat_errors++;
+			if (status & BD_ENET_TX_LC)  /* Late collision */
+				dev->stats.tx_window_errors++;
+			if (status & BD_ENET_TX_RL)  /* Retrans limit */
+				dev->stats.tx_aborted_errors++;
+			if (status & BD_ENET_TX_UN)  /* Underrun */
+				dev->stats.tx_fifo_errors++;
+			if (status & BD_ENET_TX_CSL) /* Carrier lost */
+				dev->stats.tx_carrier_errors++;
+		} else {
+			dev->stats.tx_packets++;
+		}
+
+		if (status & BD_ENET_TX_READY)
+			dev_err(&fep->pdev->dev,
+				"Enet xmit interrupt and TX_READY.\n");
+
+		/* Deferred means some collisions occurred during transmit,
+		 * but we eventually sent the packet OK.
+		 */
+		if (status & BD_ENET_TX_DEF)
+			dev->stats.collisions++;
+
+		/* Free the sk buffer associated with this last transmit */
+		dev_consume_skb_irq(skb);
+		fep->tx_skbuff[fep->skb_dirty] = NULL;
+		fep->skb_dirty = (fep->skb_dirty + 1) & TX_RING_MOD_MASK;
+
+		/* Update pointer to next buffer descriptor to be transmitted */
+		if (status & BD_ENET_TX_WRAP)
+			bdp = fep->tx_bd_base;
+		else
+			bdp++;
+
+		/* Since we have freed up a buffer, the ring is no longer
+		 * full.
+		 */
+		if (fep->tx_full) {
+			fep->tx_full = 0;
+			if (netif_queue_stopped(dev))
+				netif_wake_queue(dev);
+		}
+	}
+	fep->dirty_tx = bdp;
+	spin_unlock_bh(&fep->hw_lock);
 }
 
+/* During a receive, the cur_rx points to the current incoming buffer.
+ * When we update through the ring, if the next incoming buffer has
+ * not been given to the system, we just set the empty indicator,
+ * effectively tossing the packet.
+ */
 static int mtip_switch_rx(struct net_device *dev, int budget, int *port)
 {
-	return -ENOMEM;
+	struct mtip_ndev_priv *priv = netdev_priv(dev);
+	u8 *data, rx_port = MTIP_PORT_FORWARDING_INIT;
+	struct switch_enet_private *fep = priv->fep;
+	unsigned short status, pkt_len;
+	struct net_device *pndev;
+	struct ethhdr *eth_hdr;
+	int pkt_received = 0;
+	struct sk_buff *skb;
+	struct cbd_t *bdp;
+	struct page *page;
+
+	/* First, grab all of the stats for the incoming packet.
+	 * These get messed up if we get called due to a busy condition.
+	 */
+	bdp = fep->cur_rx;
+
+	while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
+		if (pkt_received >= budget)
+			break;
+
+		pkt_received++;
+
+		writel(MCF_ESW_IMR_RXF, fep->hwp + ESW_ISR);
+		if (!fep->usage_count)
+			goto rx_processing_done;
+
+		status ^= BD_ENET_RX_LAST;
+		/* Check for errors. */
+		if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO |
+			      BD_ENET_RX_CR | BD_ENET_RX_OV | BD_ENET_RX_LAST |
+			      BD_ENET_RX_CL)) {
+			dev->stats.rx_errors++;
+			if (status & BD_ENET_RX_OV) {
+				/* FIFO overrun */
+				dev->stats.rx_fifo_errors++;
+				goto rx_processing_done;
+			}
+			if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH
+				      | BD_ENET_RX_LAST)) {
+				/* Frame too long or too short. */
+				dev->stats.rx_length_errors++;
+				if (status & BD_ENET_RX_LAST)
+					netdev_err(dev, "rcv is not +last\n");
+			}
+			if (status & BD_ENET_RX_CR)	/* CRC Error */
+				dev->stats.rx_crc_errors++;
+
+			/* Report late collisions as a frame error. */
+			if (status & (BD_ENET_RX_NO | BD_ENET_RX_CL))
+				dev->stats.rx_frame_errors++;
+			goto rx_processing_done;
+		}
+
+		/* Get correct RX page */
+		page = fep->page[bdp - fep->rx_bd_base];
+		/* Process the incoming frame */
+		pkt_len = bdp->cbd_datlen;
+
+		dma_sync_single_for_cpu(&fep->pdev->dev, bdp->cbd_bufaddr,
+					pkt_len, DMA_FROM_DEVICE);
+		net_prefetch(page_address(page));
+		data = page_address(page);
+
+		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
+			swap_buffer(data, pkt_len);
+
+		eth_hdr = (struct ethhdr *)data;
+		mtip_atable_get_entry_port_number(fep, eth_hdr->h_source,
+						  &rx_port);
+		if (rx_port == MTIP_PORT_FORWARDING_INIT)
+			mtip_atable_dynamicms_learn_migration(fep,
+							      mtip_get_time(),
+							      eth_hdr->h_source,
+							      &rx_port);
+
+		if ((rx_port == 1 || rx_port == 2) && fep->ndev[rx_port - 1])
+			pndev = fep->ndev[rx_port - 1];
+		else
+			pndev = dev;
+
+		*port = rx_port;
+
+		/* This does 16 byte alignment, exactly what we need.
+		 * The packet length includes FCS, but we don't want to
+		 * include that when passing upstream as it messes up
+		 * bridging applications.
+		 */
+		skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
+		if (unlikely(!skb)) {
+			dev_dbg(&fep->pdev->dev,
+				"%s: Memory squeeze, dropping packet.\n",
+				pndev->name);
+			page_pool_recycle_direct(fep->page_pool, page);
+			pndev->stats.rx_dropped++;
+			return -ENOMEM;
+		}
+
+		skb_reserve(skb, NET_IP_ALIGN);
+		skb_put(skb, pkt_len);      /* Make room */
+		skb_copy_to_linear_data(skb, data, pkt_len);
+		skb->protocol = eth_type_trans(skb, pndev);
+		skb->offload_fwd_mark = fep->br_offload;
+		napi_gro_receive(&fep->napi, skb);
+
+		pndev->stats.rx_packets++;
+		pndev->stats.rx_bytes += pkt_len;
+
+ rx_processing_done:
+		/* Clear the status flags for this buffer */
+		status &= ~BD_ENET_RX_STATS;
+
+		/* Mark the buffer empty */
+		status |= BD_ENET_RX_EMPTY;
+		/* Make sure that updates to the descriptor are performed */
+		wmb();
+		bdp->cbd_sc = status;
+
+		/* Update BD pointer to next entry */
+		if (status & BD_ENET_RX_WRAP)
+			bdp = fep->rx_bd_base;
+		else
+			bdp++;
+
+		/* Doing this here will keep the FEC running while we process
+		 * incoming frames.  On a heavily loaded network, we should be
+		 * able to keep up at the expense of system resources.
+		 */
+		writel(MCF_ESW_RDAR_R_DES_ACTIVE, fep->hwp + ESW_RDAR);
+	} /* while (!((status = bdp->cbd_sc) & BD_ENET_RX_EMPTY)) */
+
+	fep->cur_rx = bdp;
+
+	return pkt_received;
 }
 
 static void mtip_adjust_link(struct net_device *dev)
-- 
2.39.5
Re: [net-next v18 5/7] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Jakub Kicinski 4 months ago
On Wed, 13 Aug 2025 09:07:53 +0200 Lukasz Majewski wrote:
> +		page = fep->page[bdp - fep->rx_bd_base];
> +		/* Process the incoming frame */
> +		pkt_len = bdp->cbd_datlen;
> +
> +		dma_sync_single_for_cpu(&fep->pdev->dev, bdp->cbd_bufaddr,
> +					pkt_len, DMA_FROM_DEVICE);
> +		net_prefetch(page_address(page));
> +		data = page_address(page);
> +
> +		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> +			swap_buffer(data, pkt_len);
> +
> +		eth_hdr = (struct ethhdr *)data;
> +		mtip_atable_get_entry_port_number(fep, eth_hdr->h_source,
> +						  &rx_port);
> +		if (rx_port == MTIP_PORT_FORWARDING_INIT)
> +			mtip_atable_dynamicms_learn_migration(fep,
> +							      mtip_get_time(),
> +							      eth_hdr->h_source,
> +							      &rx_port);
> +
> +		if ((rx_port == 1 || rx_port == 2) && fep->ndev[rx_port - 1])
> +			pndev = fep->ndev[rx_port - 1];
> +		else
> +			pndev = dev;
> +
> +		*port = rx_port;
> +
> +		/* This does 16 byte alignment, exactly what we need.
> +		 * The packet length includes FCS, but we don't want to
> +		 * include that when passing upstream as it messes up
> +		 * bridging applications.
> +		 */
> +		skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
> +		if (unlikely(!skb)) {
> +			dev_dbg(&fep->pdev->dev,
> +				"%s: Memory squeeze, dropping packet.\n",
> +				pndev->name);
> +			page_pool_recycle_direct(fep->page_pool, page);
> +			pndev->stats.rx_dropped++;
> +			return -ENOMEM;
> +		}
> +
> +		skb_reserve(skb, NET_IP_ALIGN);
> +		skb_put(skb, pkt_len);      /* Make room */
> +		skb_copy_to_linear_data(skb, data, pkt_len);
> +		skb->protocol = eth_type_trans(skb, pndev);
> +		skb->offload_fwd_mark = fep->br_offload;
> +		napi_gro_receive(&fep->napi, skb);

The rx buffer circulation is very odd. You seem to pre-allocate buffers
for the full ring from a page_pool. And then copy the data out of those
pages. The normal process is that after packet is received a new page is
allocated to give to HW, and old is attached to an skb, and sent up the
stack.

Also you are releasing the page to be recycled without clearing it from
the ring. I think you'd free it again on shutdown, so it's a
double-free.
-- 
pw-bot: cr
Re: [net-next v18 5/7] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Łukasz Majewski 3 months, 3 weeks ago
Hi Jakub,

> On Wed, 13 Aug 2025 09:07:53 +0200 Lukasz Majewski wrote:
> > +		page = fep->page[bdp - fep->rx_bd_base];
> > +		/* Process the incoming frame */
> > +		pkt_len = bdp->cbd_datlen;
> > +
> > +		dma_sync_single_for_cpu(&fep->pdev->dev,
> > bdp->cbd_bufaddr,
> > +					pkt_len, DMA_FROM_DEVICE);
> > +		net_prefetch(page_address(page));
> > +		data = page_address(page);
> > +
> > +		if (fep->quirks & FEC_QUIRK_SWAP_FRAME)
> > +			swap_buffer(data, pkt_len);
> > +
> > +		eth_hdr = (struct ethhdr *)data;
> > +		mtip_atable_get_entry_port_number(fep,
> > eth_hdr->h_source,
> > +						  &rx_port);
> > +		if (rx_port == MTIP_PORT_FORWARDING_INIT)
> > +			mtip_atable_dynamicms_learn_migration(fep,
> > +
> > mtip_get_time(),
> > +
> > eth_hdr->h_source,
> > +
> > &rx_port); +
> > +		if ((rx_port == 1 || rx_port == 2) &&
> > fep->ndev[rx_port - 1])
> > +			pndev = fep->ndev[rx_port - 1];
> > +		else
> > +			pndev = dev;
> > +
> > +		*port = rx_port;
> > +
> > +		/* This does 16 byte alignment, exactly what we
> > need.
> > +		 * The packet length includes FCS, but we don't
> > want to
> > +		 * include that when passing upstream as it messes
> > up
> > +		 * bridging applications.
> > +		 */
> > +		skb = netdev_alloc_skb(pndev, pkt_len +
> > NET_IP_ALIGN);
> > +		if (unlikely(!skb)) {
> > +			dev_dbg(&fep->pdev->dev,
> > +				"%s: Memory squeeze, dropping
> > packet.\n",
> > +				pndev->name);
> > +			page_pool_recycle_direct(fep->page_pool,
> > page);
> > +			pndev->stats.rx_dropped++;
> > +			return -ENOMEM;
> > +		}
> > +
> > +		skb_reserve(skb, NET_IP_ALIGN);
> > +		skb_put(skb, pkt_len);      /* Make room */
> > +		skb_copy_to_linear_data(skb, data, pkt_len);
> > +		skb->protocol = eth_type_trans(skb, pndev);
> > +		skb->offload_fwd_mark = fep->br_offload;
> > +		napi_gro_receive(&fep->napi, skb);  
> 
> The rx buffer circulation is very odd.

The fec_main.c uses page_pool_alloc_pages() to allocate RX page from
the pool.

At the RX function the __build_skb(data, ...) is called to create skb.

Last step with the RX function is to call skb_mark_for_recycle(skb),
which sets skb->pp_recycle = 1.

And yes, in the MTIP I do copy the data to the newly created skb in RX
function (anyway, I need to swap bytes in the buffer). 

It seems like an extra copy is performed in the RX function.

> You seem to pre-allocate
> buffers for the full ring from a page_pool. And then copy the data
> out of those pages.

Yes, correct.

> The normal process is that after packet is
> received a new page is allocated to give to HW, and old is attached
> to an skb, and sent up the stack.

Ok.

> 
> Also you are releasing the page to be recycled without clearing it
> from the ring. I think you'd free it again on shutdown, so it's a
> double-free.

No, the page is persistent. It will be removed when the driver is
closed and memory for pages and descriptors is released.

-- 
Best regards,

Łukasz Majewski
Re: [net-next v18 5/7] net: mtip: Add mtip_switch_{rx|tx} functions to the L2 switch driver
Posted by Jakub Kicinski 3 months, 3 weeks ago
On Tue, 19 Aug 2025 10:31:19 +0200 Łukasz Majewski wrote:
> > The rx buffer circulation is very odd.  
> 
> The fec_main.c uses page_pool_alloc_pages() to allocate RX page from
> the pool.
> 
> At the RX function the __build_skb(data, ...) is called to create skb.
> 
> Last step with the RX function is to call skb_mark_for_recycle(skb),
> which sets skb->pp_recycle = 1.
> 
> And yes, in the MTIP I do copy the data to the newly created skb in RX
> function (anyway, I need to swap bytes in the buffer). 
> 
> It seems like extra copy is performed in the RX function.

Right, so the use of page pool is entirely pointless.
The strength of the page pool is recycling the pages.
If you don't free / allocate pages on the fast path you're 
just paying the extra overhead (of having a populated cache)

> > Also you are releasing the page to be recycled without clearing it
> > from the ring. I think you'd free it again on shutdown, so it's a
> > double-free.  
> 
> No, the page is persistent. It will be removed when the driver is
> closed and memory for pages and descriptors is released.

So remove the page_pool_recycle_direct() call, please:

  static int mtip_switch_rx(struct net_device *dev, int budget, int *port)
	...
	skb = netdev_alloc_skb(pndev, pkt_len + NET_IP_ALIGN);
	if (unlikely(!skb)) {
			...
			page_pool_recycle_direct(fep->page_pool, page);