[RFC PATCH 6/6] net: ethernet: ti: am65-cpsw-nuss: Enable batch processing for TX / TX CMPL

Siddharth Vadapalli posted 6 patches 1 week, 1 day ago
[RFC PATCH 6/6] net: ethernet: ti: am65-cpsw-nuss: Enable batch processing for TX / TX CMPL
Posted by Siddharth Vadapalli 1 week, 1 day ago
Enable batch processing on the transmit and transmit-completion paths:
submit packet descriptors to hardware in batches on transmit, and
likewise dequeue completed packet descriptors in batches on transmit
completion.

Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
---
 drivers/net/ethernet/ti/am65-cpsw-nuss.c | 201 +++++++++++++++++++----
 drivers/net/ethernet/ti/am65-cpsw-nuss.h |  12 ++
 2 files changed, 178 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
index fc165579a479..2b354af14cb7 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c
@@ -1624,14 +1624,14 @@ static inline void am65_cpsw_nuss_xmit_recycle(struct am65_cpsw_tx_chn *tx_chn,
 	am65_cpsw_nuss_put_tx_desc(tx_chn, first_desc);
 }
 
-static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
-					   int chn, unsigned int budget, bool *tdown)
+static int am65_cpsw_nuss_tx_cmpl_free_batch(struct am65_cpsw_common *common, int chn,
+					     u32 batch_size, unsigned int budget,
+					     bool *tdown)
 {
 	bool single_port = AM65_CPSW_IS_CPSW2G(common);
 	enum am65_cpsw_tx_buf_type buf_type;
 	struct am65_cpsw_tx_swdata *swdata;
 	struct cppi5_host_desc_t *desc_tx;
-	struct device *dev = common->dev;
 	struct am65_cpsw_tx_chn *tx_chn;
 	struct netdev_queue *netif_txq;
 	unsigned int total_bytes = 0;
@@ -1640,21 +1640,13 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
 	unsigned int pkt_len;
 	struct sk_buff *skb;
 	dma_addr_t desc_dma;
-	int res, num_tx = 0;
+	int num_tx = 0, i;
 
 	tx_chn = &common->tx_chns[chn];
 
-	while (true) {
-		if (!single_port)
-			spin_lock(&tx_chn->lock);
-		res = k3_udma_glue_pop_tx_chn(tx_chn->tx_chn, &desc_dma);
-		if (!single_port)
-			spin_unlock(&tx_chn->lock);
-
-		if (res == -ENODATA)
-			break;
-
-		if (cppi5_desc_is_tdcm(desc_dma)) {
+	for (i = 0; i < batch_size; i++) {
+		desc_dma = tx_chn->cmpl_desc_dma_array[i];
+		if (unlikely(cppi5_desc_is_tdcm(desc_dma))) {
 			if (atomic_dec_and_test(&common->tdown_cnt))
 				complete(&common->tdown_complete);
 			*tdown = true;
@@ -1701,7 +1693,34 @@ static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
 		am65_cpsw_nuss_tx_wake(tx_chn, ndev, netif_txq);
 	}
 
-	dev_dbg(dev, "%s:%u pkt:%d\n", __func__, chn, num_tx);
+	return num_tx;
+}
+
+static int am65_cpsw_nuss_tx_compl_packets(struct am65_cpsw_common *common,
+					   int chn, unsigned int budget, bool *tdown)
+{
+	bool single_port = AM65_CPSW_IS_CPSW2G(common);
+	struct am65_cpsw_tx_chn *tx_chn;
+	u32 batch_size = 0;
+	int res, num_tx;
+
+	tx_chn = &common->tx_chns[chn];
+
+	if (!single_port)
+		spin_lock(&tx_chn->lock);
+
+	res = k3_udma_glue_pop_tx_chn_batch(tx_chn->tx_chn, tx_chn->cmpl_desc_dma_array,
+					    &batch_size, AM65_CPSW_TX_BATCH_SIZE);
+	if (!batch_size) {
+		if (!single_port)
+			spin_unlock(&tx_chn->lock);
+		return 0;
+	}
+
+	num_tx = am65_cpsw_nuss_tx_cmpl_free_batch(common, chn, batch_size, budget, tdown);
+
+	if (!single_port)
+		spin_unlock(&tx_chn->lock);
 
 	return num_tx;
 }
@@ -1760,18 +1779,48 @@ static irqreturn_t am65_cpsw_nuss_tx_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+static void am65_cpsw_nuss_submit_ndev_batch(struct am65_cpsw_common *common)
+{
+	bool single_port = AM65_CPSW_IS_CPSW2G(common);
+	struct am65_cpsw_tx_desc_batch *tx_desc_batch;
+	struct am65_cpsw_tx_chn *tx_chn;
+	int ret, i;
+
+	/* Submit the pending batch on every TX channel (covers all netdevs) */
+	for (i = 0; i < AM65_CPSW_MAX_QUEUES; i++) {
+		if (common->tx_desc_batch[i].tx_batch_idx) {
+			tx_chn = &common->tx_chns[i];
+			tx_desc_batch = &common->tx_desc_batch[i];
+			if (!single_port)
+				spin_lock_bh(&tx_chn->lock);
+			ret = k3_udma_glue_push_tx_chn_batch(tx_chn->tx_chn,
+							     tx_desc_batch->desc_tx_array,
+							     tx_desc_batch->desc_dma_array,
+							     tx_desc_batch->tx_batch_idx);
+			if (!single_port)
+				spin_unlock_bh(&tx_chn->lock);
+			if (ret)
+				dev_err(common->dev, "failed to push %u pkts on queue %d\n",
+					tx_desc_batch->tx_batch_idx, i);
+			tx_desc_batch->tx_batch_idx = 0;
+		}
+	}
+	atomic_set(&common->tx_batch_count, 0);
+}
+
 static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb,
 						 struct net_device *ndev)
 {
 	struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
 	struct cppi5_host_desc_t *first_desc, *next_desc, *cur_desc;
 	struct am65_cpsw_port *port = am65_ndev_to_port(ndev);
+	struct am65_cpsw_tx_desc_batch *tx_desc_batch;
 	struct am65_cpsw_tx_swdata *swdata;
 	struct device *dev = common->dev;
 	struct am65_cpsw_tx_chn *tx_chn;
 	struct netdev_queue *netif_txq;
 	dma_addr_t desc_dma, buf_dma;
-	int ret, q_idx, i;
+	int q_idx, i;
 	u32 *psdata;
 	u32 pkt_len;
 
@@ -1883,20 +1932,31 @@ static netdev_tx_t am65_cpsw_nuss_ndo_slave_xmit(struct sk_buff *skb,
 
 	cppi5_hdesc_set_pktlen(first_desc, pkt_len);
 	desc_dma = k3_cppi_desc_pool_virt2dma(tx_chn->desc_pool, first_desc);
-	if (AM65_CPSW_IS_CPSW2G(common)) {
-		ret = k3_udma_glue_push_tx_chn(tx_chn->tx_chn, first_desc, desc_dma);
-	} else {
-		spin_lock_bh(&tx_chn->lock);
-		ret = k3_udma_glue_push_tx_chn(tx_chn->tx_chn, first_desc, desc_dma);
-		spin_unlock_bh(&tx_chn->lock);
-	}
-	if (ret) {
-		dev_err(dev, "can't push desc %d\n", ret);
-		/* inform bql */
-		netdev_tx_completed_queue(netif_txq, 1, pkt_len);
-		ndev->stats.tx_errors++;
-		goto err_free_descs;
-	}
+
+	/* Batch processing begins */
+	spin_lock_bh(&common->tx_batch_lock);
+
+	tx_desc_batch = &common->tx_desc_batch[q_idx];
+	tx_desc_batch->desc_tx_array[tx_desc_batch->tx_batch_idx] = first_desc;
+	tx_desc_batch->desc_dma_array[tx_desc_batch->tx_batch_idx] = desc_dma;
+	tx_desc_batch->tx_batch_idx++;
+
+	/* Push the batch across all queues and all netdevs in any of the
+	 * following scenarios:
+	 * 1. If we reach the batch size
+	 * 2. If queue is stopped
+	 * 3. No more packets are expected for ndev
+	 * 4. We do not have sufficient free descriptors for upcoming packets
+	 *    and need to push the batch to reclaim them via completion
+	 */
+	if ((atomic_inc_return(&common->tx_batch_count) == AM65_CPSW_TX_BATCH_SIZE) ||
+	    netif_xmit_stopped(netif_txq) ||
+	    !netdev_xmit_more() ||
+	    (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS))
+		am65_cpsw_nuss_submit_ndev_batch(common);
+
+	/* Batch processing ends */
+	spin_unlock_bh(&common->tx_batch_lock);
 
 	if (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS) {
 		netif_tx_stop_queue(netif_txq);
@@ -2121,19 +2181,88 @@ static int am65_cpsw_ndo_xdp_xmit(struct net_device *ndev, int n,
 				  struct xdp_frame **frames, u32 flags)
 {
 	struct am65_cpsw_common *common = am65_ndev_to_common(ndev);
+	struct am65_cpsw_port *port = am65_ndev_to_port(ndev);
+	struct am65_cpsw_tx_desc_batch *tx_desc_batch;
+	struct cppi5_host_desc_t *host_desc;
+	struct am65_cpsw_tx_swdata *swdata;
 	struct am65_cpsw_tx_chn *tx_chn;
 	struct netdev_queue *netif_txq;
+	dma_addr_t dma_desc, dma_buf;
 	int cpu = smp_processor_id();
-	int i, nxmit = 0;
+	int i, q_idx, nxmit = 0;
+	struct xdp_frame *xdpf;
+	u32 pkt_len;
 
-	tx_chn = &common->tx_chns[cpu % common->tx_ch_num];
+	q_idx = cpu % common->tx_ch_num;
+	tx_chn = &common->tx_chns[q_idx];
 	netif_txq = netdev_get_tx_queue(ndev, tx_chn->id);
 
 	__netif_tx_lock(netif_txq, cpu);
 	for (i = 0; i < n; i++) {
-		if (am65_cpsw_xdp_tx_frame(ndev, tx_chn, frames[i],
-					   AM65_CPSW_TX_BUF_TYPE_XDP_NDO))
+		host_desc = am65_cpsw_nuss_get_tx_desc(tx_chn);
+		if (unlikely(!host_desc)) {
+			ndev->stats.tx_dropped++;
+			break;
+		}
+
+		xdpf = frames[i];
+		pkt_len = xdpf->len;
+
+		am65_cpsw_nuss_set_buf_type(tx_chn, host_desc, AM65_CPSW_TX_BUF_TYPE_XDP_NDO);
+
+		dma_buf = dma_map_single(tx_chn->dma_dev, xdpf->data,
+					 pkt_len, DMA_TO_DEVICE);
+		if (unlikely(dma_mapping_error(tx_chn->dma_dev, dma_buf))) {
+			ndev->stats.tx_dropped++;
+			am65_cpsw_nuss_put_tx_desc(tx_chn, host_desc);
 			break;
+		}
+
+		cppi5_hdesc_init(host_desc, CPPI5_INFO0_HDESC_EPIB_PRESENT,
+				 AM65_CPSW_NAV_PS_DATA_SIZE);
+		cppi5_hdesc_set_pkttype(host_desc, AM65_CPSW_CPPI_TX_PKT_TYPE);
+		cppi5_hdesc_set_pktlen(host_desc, pkt_len);
+		cppi5_desc_set_pktids(&host_desc->hdr, 0, AM65_CPSW_CPPI_TX_FLOW_ID);
+		cppi5_desc_set_tags_ids(&host_desc->hdr, 0, port->port_id);
+
+		k3_udma_glue_tx_dma_to_cppi5_addr(tx_chn->tx_chn, &dma_buf);
+		cppi5_hdesc_attach_buf(host_desc, dma_buf, pkt_len, dma_buf, pkt_len);
+
+		swdata = cppi5_hdesc_get_swdata(host_desc);
+		swdata->ndev = ndev;
+		swdata->xdpf = xdpf;
+
+		/* Report BQL before sending the packet */
+		netif_txq = netdev_get_tx_queue(ndev, tx_chn->id);
+		netdev_tx_sent_queue(netif_txq, pkt_len);
+
+		dma_desc = k3_cppi_desc_pool_virt2dma(tx_chn->desc_pool, host_desc);
+
+		/* Batch processing begins */
+		spin_lock_bh(&common->tx_batch_lock);
+
+		tx_desc_batch = &common->tx_desc_batch[q_idx];
+		tx_desc_batch->desc_tx_array[tx_desc_batch->tx_batch_idx] = host_desc;
+		tx_desc_batch->desc_dma_array[tx_desc_batch->tx_batch_idx] = dma_desc;
+		tx_desc_batch->tx_batch_idx++;
+
+		/* Push the batch across all queues and all netdevs in any of the
+		 * following scenarios:
+		 * 1. If we reach the batch size
+		 * 2. If queue is stopped
+		 * 3. We are at the last XDP frame in the batch
+		 * 4. We do not have sufficient free descriptors for upcoming packets
+		 *    and need to push the batch to reclaim them via completion
+		 */
+		if ((atomic_inc_return(&common->tx_batch_count) == AM65_CPSW_TX_BATCH_SIZE) ||
+		    netif_xmit_stopped(netif_txq) ||
+		    (i == (n - 1)) ||
+		    (am65_cpsw_nuss_num_free_tx_desc(tx_chn) < MAX_SKB_FRAGS))
+			am65_cpsw_nuss_submit_ndev_batch(common);
+
+		/* Batch processing ends */
+		spin_unlock_bh(&common->tx_batch_lock);
+
 		nxmit++;
 	}
 	__netif_tx_unlock(netif_txq);
@@ -2497,6 +2626,8 @@ static int am65_cpsw_nuss_init_tx_chns(struct am65_cpsw_common *common)
 			 dev_name(dev), tx_chn->id);
 	}
 
+	atomic_set(&common->tx_batch_count, 0);
+
 	ret = am65_cpsw_nuss_ndev_add_tx_napi(common);
 	if (ret) {
 		dev_err(dev, "Failed to add tx NAPI %d\n", ret);
diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.h b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
index e64b4cfd6f2c..81405e3bed79 100644
--- a/drivers/net/ethernet/ti/am65-cpsw-nuss.h
+++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.h
@@ -28,6 +28,8 @@ struct am65_cpts;
 #define AM65_CPSW_MAX_TX_DESC	500
 #define AM65_CPSW_MAX_RX_DESC	500
 
+#define AM65_CPSW_TX_BATCH_SIZE	128
+
 #define AM65_CPSW_PORT_VLAN_REG_OFFSET	0x014
 
 struct am65_cpsw_slave_data {
@@ -93,6 +95,7 @@ struct am65_cpsw_tx_chn {
 	struct k3_cppi_desc_pool *desc_pool;
 	struct k3_udma_glue_tx_channel *tx_chn;
 	spinlock_t lock; /* protect TX rings in multi-port mode */
+	dma_addr_t cmpl_desc_dma_array[AM65_CPSW_TX_BATCH_SIZE];
 	struct am65_cpsw_tx_ring tx_ring;
 	struct hrtimer tx_hrtimer;
 	unsigned long tx_pace_timeout;
@@ -165,6 +168,12 @@ struct am65_cpsw_devlink {
 	struct am65_cpsw_common *common;
 };
 
+struct am65_cpsw_tx_desc_batch {
+	struct cppi5_host_desc_t *desc_tx_array[AM65_CPSW_TX_BATCH_SIZE];
+	dma_addr_t desc_dma_array[AM65_CPSW_TX_BATCH_SIZE];
+	u8 tx_batch_idx;
+};
+
 struct am65_cpsw_common {
 	struct device		*dev;
 	struct device		*mdio_dev;
@@ -188,6 +197,9 @@ struct am65_cpsw_common {
 	struct am65_cpsw_tx_chn	tx_chns[AM65_CPSW_MAX_QUEUES];
 	struct completion	tdown_complete;
 	atomic_t		tdown_cnt;
+	atomic_t		tx_batch_count;
+	spinlock_t		tx_batch_lock; /* protect TX batch operations */
+	struct am65_cpsw_tx_desc_batch tx_desc_batch[AM65_CPSW_MAX_QUEUES];
 
 	int			rx_ch_num_flows;
 	struct am65_cpsw_rx_chn	rx_chns;
-- 
2.51.1