[PATCH net-next v2] net: bcmasp: Switch to page pool for RX path

Florian Fainelli posted 1 patch 2 months, 1 week ago
drivers/net/ethernet/broadcom/Kconfig         |   1 +
drivers/net/ethernet/broadcom/asp2/bcmasp.h   |   8 +-
.../net/ethernet/broadcom/asp2/bcmasp_intf.c  | 125 +++++++++++++++---
.../ethernet/broadcom/asp2/bcmasp_intf_defs.h |   4 +
4 files changed, 115 insertions(+), 23 deletions(-)
[PATCH net-next v2] net: bcmasp: Switch to page pool for RX path
Posted by Florian Fainelli 2 months, 1 week ago
This shows an improvement of 1.9% in reducing the CPU cycles and data
cache misses.

Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
---
Changes in v2:

- addressed Nicolai's comments by setting the .netdev and .napi members
  and dropped the useless comment and .dma_dir initialization

 drivers/net/ethernet/broadcom/Kconfig         |   1 +
 drivers/net/ethernet/broadcom/asp2/bcmasp.h   |   8 +-
 .../net/ethernet/broadcom/asp2/bcmasp_intf.c  | 125 +++++++++++++++---
 .../ethernet/broadcom/asp2/bcmasp_intf_defs.h |   4 +
 4 files changed, 115 insertions(+), 23 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/Kconfig b/drivers/net/ethernet/broadcom/Kconfig
index dd164acafd01..4287edc7ddd6 100644
--- a/drivers/net/ethernet/broadcom/Kconfig
+++ b/drivers/net/ethernet/broadcom/Kconfig
@@ -272,6 +272,7 @@ config BCMASP
 	depends on OF
 	select PHYLIB
 	select MDIO_BCM_UNIMAC
+	select PAGE_POOL
 	help
 	  This configuration enables the Broadcom ASP 2.0 Ethernet controller
 	  driver which is present in Broadcom STB SoCs such as 72165.
diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp.h b/drivers/net/ethernet/broadcom/asp2/bcmasp.h
index 29cd87335ec8..8c8ffaeadc79 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp.h
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp.h
@@ -6,6 +6,7 @@
 #include <linux/phy.h>
 #include <linux/io-64-nonatomic-hi-lo.h>
 #include <uapi/linux/ethtool.h>
+#include <net/page_pool/helpers.h>
 
 #define ASP_INTR2_OFFSET			0x1000
 #define  ASP_INTR2_STATUS			0x0
@@ -298,16 +299,19 @@ struct bcmasp_intf {
 	void __iomem			*rx_edpkt_cfg;
 	void __iomem			*rx_edpkt_dma;
 	int				rx_edpkt_index;
-	int				rx_buf_order;
 	struct bcmasp_desc		*rx_edpkt_cpu;
 	dma_addr_t			rx_edpkt_dma_addr;
 	dma_addr_t			rx_edpkt_dma_read;
 	dma_addr_t			rx_edpkt_dma_valid;
 
-	/* RX buffer prefetcher ring*/
+	/* Streaming RX data ring (RBUF_4K mode) */
 	void				*rx_ring_cpu;
 	dma_addr_t			rx_ring_dma;
 	dma_addr_t			rx_ring_dma_valid;
+	int				rx_buf_order;
+
+	/* Page pool for recycling RX SKB data pages */
+	struct page_pool		*rx_page_pool;
 	struct napi_struct		rx_napi;
 
 	struct bcmasp_res		res;
diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
index b368ec2fea43..ec63f50a849e 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c
@@ -15,6 +15,7 @@
 #include <linux/platform_device.h>
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/page_pool/helpers.h>
 
 #include "bcmasp.h"
 #include "bcmasp_intf_defs.h"
@@ -482,10 +483,14 @@ static int bcmasp_rx_poll(struct napi_struct *napi, int budget)
 	struct bcmasp_desc *desc;
 	struct sk_buff *skb;
 	dma_addr_t valid;
+	struct page *page;
 	void *data;
 	u64 flags;
 	u32 len;
 
+	/* Hardware advances DMA_VALID as it writes each descriptor
+	 * (RBUF_4K streaming mode); software chases with rx_edpkt_dma_read.
+	 */
 	valid = rx_edpkt_dma_rq(intf, RX_EDPKT_DMA_VALID) + 1;
 	if (valid == intf->rx_edpkt_dma_addr + DESC_RING_SIZE)
 		valid = intf->rx_edpkt_dma_addr;
@@ -493,12 +498,12 @@ static int bcmasp_rx_poll(struct napi_struct *napi, int budget)
 	while ((processed < budget) && (valid != intf->rx_edpkt_dma_read)) {
 		desc = &intf->rx_edpkt_cpu[intf->rx_edpkt_index];
 
-		/* Ensure that descriptor has been fully written to DRAM by
-		 * hardware before reading by the CPU
+		/* Ensure the descriptor has been fully written to DRAM by
+		 * the hardware before the CPU reads it.
 		 */
 		rmb();
 
-		/* Calculate virt addr by offsetting from physical addr */
+		/* Locate the packet data inside the streaming ring buffer. */
 		data = intf->rx_ring_cpu +
 			(DESC_ADDR(desc->buf) - intf->rx_ring_dma);
 
@@ -524,19 +529,38 @@ static int bcmasp_rx_poll(struct napi_struct *napi, int budget)
 
 		len = desc->size;
 
-		skb = napi_alloc_skb(napi, len);
-		if (!skb) {
+		/* Allocate a page pool page as the SKB data area so the
+		 * kernel can recycle it efficiently after the packet is
+		 * consumed, avoiding repeated slab allocations.
+		 */
+		page = page_pool_dev_alloc_pages(intf->rx_page_pool);
+		if (!page) {
 			u64_stats_update_begin(&stats->syncp);
 			u64_stats_inc(&stats->rx_dropped);
 			u64_stats_update_end(&stats->syncp);
 			intf->mib.alloc_rx_skb_failed++;
+			goto next;
+		}
 
+		skb = napi_build_skb(page_address(page), PAGE_SIZE);
+		if (!skb) {
+			u64_stats_update_begin(&stats->syncp);
+			u64_stats_inc(&stats->rx_dropped);
+			u64_stats_update_end(&stats->syncp);
+			intf->mib.alloc_rx_skb_failed++;
+			page_pool_recycle_direct(intf->rx_page_pool, page);
 			goto next;
 		}
 
+		/* Reserve headroom then copy the full descriptor payload
+		 * (hardware prepends a 2-byte alignment pad at the start).
+		 */
+		skb_reserve(skb, NET_SKB_PAD);
 		skb_put(skb, len);
 		memcpy(skb->data, data, len);
+		skb_mark_for_recycle(skb);
 
+		/* Skip the 2-byte hardware alignment pad. */
 		skb_pull(skb, 2);
 		len -= 2;
 		if (likely(intf->crc_fwd)) {
@@ -558,6 +582,7 @@ static int bcmasp_rx_poll(struct napi_struct *napi, int budget)
 		u64_stats_update_end(&stats->syncp);
 
 next:
+		/* Return this portion of the streaming ring buffer to HW. */
 		rx_edpkt_cfg_wq(intf, (DESC_ADDR(desc->buf) + desc->size),
 				RX_EDPKT_RING_BUFFER_READ);
 
@@ -661,12 +686,31 @@ static void bcmasp_adj_link(struct net_device *dev)
 		phy_print_status(phydev);
 }
 
-static int bcmasp_alloc_buffers(struct bcmasp_intf *intf)
+static struct page_pool *
+bcmasp_rx_page_pool_create(struct bcmasp_intf *intf)
+{
+	struct page_pool_params pp_params = {
+		.order		= 0,
+		.flags		= 0,
+		.pool_size	= NUM_4K_BUFFERS,
+		.nid		= NUMA_NO_NODE,
+		.dev		= &intf->parent->pdev->dev,
+		.napi		= &intf->rx_napi,
+		.netdev		= intf->ndev,
+		.offset		= 0,
+		.max_len	= PAGE_SIZE,
+	};
+
+	return page_pool_create(&pp_params);
+}
+
+static int bcmasp_alloc_rx_buffers(struct bcmasp_intf *intf)
 {
 	struct device *kdev = &intf->parent->pdev->dev;
 	struct page *buffer_pg;
+	int ret;
 
-	/* Alloc RX */
+	/* Contiguous streaming ring that hardware writes packet data into. */
 	intf->rx_buf_order = get_order(RING_BUFFER_SIZE);
 	buffer_pg = alloc_pages(GFP_KERNEL, intf->rx_buf_order);
 	if (!buffer_pg)
@@ -675,13 +719,55 @@ static int bcmasp_alloc_buffers(struct bcmasp_intf *intf)
 	intf->rx_ring_cpu = page_to_virt(buffer_pg);
 	intf->rx_ring_dma = dma_map_page(kdev, buffer_pg, 0, RING_BUFFER_SIZE,
 					 DMA_FROM_DEVICE);
-	if (dma_mapping_error(kdev, intf->rx_ring_dma))
-		goto free_rx_buffer;
+	if (dma_mapping_error(kdev, intf->rx_ring_dma)) {
+		ret = -ENOMEM;
+		goto free_ring_pages;
+	}
+
+	/* Page pool for SKB data areas (copy targets, not DMA buffers). */
+	intf->rx_page_pool = bcmasp_rx_page_pool_create(intf);
+	if (IS_ERR(intf->rx_page_pool)) {
+		ret = PTR_ERR(intf->rx_page_pool);
+		intf->rx_page_pool = NULL;
+		goto free_ring_dma;
+	}
+
+	return 0;
+
+free_ring_dma:
+	dma_unmap_page(kdev, intf->rx_ring_dma, RING_BUFFER_SIZE,
+		       DMA_FROM_DEVICE);
+free_ring_pages:
+	__free_pages(buffer_pg, intf->rx_buf_order);
+	return ret;
+}
+
+static void bcmasp_reclaim_rx_buffers(struct bcmasp_intf *intf)
+{
+	struct device *kdev = &intf->parent->pdev->dev;
+
+	page_pool_destroy(intf->rx_page_pool);
+	intf->rx_page_pool = NULL;
+	dma_unmap_page(kdev, intf->rx_ring_dma, RING_BUFFER_SIZE,
+		       DMA_FROM_DEVICE);
+	__free_pages(virt_to_page(intf->rx_ring_cpu), intf->rx_buf_order);
+}
+
+static int bcmasp_alloc_buffers(struct bcmasp_intf *intf)
+{
+	struct device *kdev = &intf->parent->pdev->dev;
+	int ret;
+
+	/* Alloc RX */
+	ret = bcmasp_alloc_rx_buffers(intf);
+	if (ret)
+		return ret;
 
 	intf->rx_edpkt_cpu = dma_alloc_coherent(kdev, DESC_RING_SIZE,
-						&intf->rx_edpkt_dma_addr, GFP_KERNEL);
+						&intf->rx_edpkt_dma_addr,
+						GFP_KERNEL);
 	if (!intf->rx_edpkt_cpu)
-		goto free_rx_buffer_dma;
+		goto free_rx_buffers;
 
 	/* Alloc TX */
 	intf->tx_spb_cpu = dma_alloc_coherent(kdev, DESC_RING_SIZE,
@@ -701,11 +787,8 @@ static int bcmasp_alloc_buffers(struct bcmasp_intf *intf)
 free_rx_edpkt_dma:
 	dma_free_coherent(kdev, DESC_RING_SIZE, intf->rx_edpkt_cpu,
 			  intf->rx_edpkt_dma_addr);
-free_rx_buffer_dma:
-	dma_unmap_page(kdev, intf->rx_ring_dma, RING_BUFFER_SIZE,
-		       DMA_FROM_DEVICE);
-free_rx_buffer:
-	__free_pages(buffer_pg, intf->rx_buf_order);
+free_rx_buffers:
+	bcmasp_reclaim_rx_buffers(intf);
 
 	return -ENOMEM;
 }
@@ -717,9 +800,7 @@ static void bcmasp_reclaim_free_buffers(struct bcmasp_intf *intf)
 	/* RX buffers */
 	dma_free_coherent(kdev, DESC_RING_SIZE, intf->rx_edpkt_cpu,
 			  intf->rx_edpkt_dma_addr);
-	dma_unmap_page(kdev, intf->rx_ring_dma, RING_BUFFER_SIZE,
-		       DMA_FROM_DEVICE);
-	__free_pages(virt_to_page(intf->rx_ring_cpu), intf->rx_buf_order);
+	bcmasp_reclaim_rx_buffers(intf);
 
 	/* TX buffers */
 	dma_free_coherent(kdev, DESC_RING_SIZE, intf->tx_spb_cpu,
@@ -738,7 +819,7 @@ static void bcmasp_init_rx(struct bcmasp_intf *intf)
 	/* Make sure channels are disabled */
 	rx_edpkt_cfg_wl(intf, 0x0, RX_EDPKT_CFG_ENABLE);
 
-	/* Rx SPB */
+	/* Streaming data ring: hardware writes raw packet bytes here. */
 	rx_edpkt_cfg_wq(intf, intf->rx_ring_dma, RX_EDPKT_RING_BUFFER_READ);
 	rx_edpkt_cfg_wq(intf, intf->rx_ring_dma, RX_EDPKT_RING_BUFFER_WRITE);
 	rx_edpkt_cfg_wq(intf, intf->rx_ring_dma, RX_EDPKT_RING_BUFFER_BASE);
@@ -747,7 +828,9 @@ static void bcmasp_init_rx(struct bcmasp_intf *intf)
 	rx_edpkt_cfg_wq(intf, intf->rx_ring_dma_valid,
 			RX_EDPKT_RING_BUFFER_VALID);
 
-	/* EDPKT */
+	/* EDPKT descriptor ring: hardware fills descriptors pointing into
+	 * the streaming ring buffer above (RBUF_4K mode).
+	 */
 	rx_edpkt_cfg_wl(intf, (RX_EDPKT_CFG_CFG0_RBUF_4K <<
 			RX_EDPKT_CFG_CFG0_DBUF_SHIFT) |
 		       (RX_EDPKT_CFG_CFG0_64_ALN <<
diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h
index af7418348e81..0318f257452a 100644
--- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h
+++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf_defs.h
@@ -246,6 +246,10 @@
 	((((intf)->channel - 6) * 0x14) + 0xa2000)
 #define  RX_SPB_TOP_BLKOUT		0x00
 
+/*
+ * Number of 4 KB pages that make up the contiguous RBUF_4K streaming ring
+ * and the page pool used as copy-target SKB data areas.
+ */
 #define NUM_4K_BUFFERS			32
 #define RING_BUFFER_SIZE		(PAGE_SIZE * NUM_4K_BUFFERS)
 
-- 
2.34.1
Re: [PATCH net-next v2] net: bcmasp: Switch to page pool for RX path
Posted by Nicolai Buchwitz 2 months, 1 week ago
On 8.4.2026 02:18, Florian Fainelli wrote:
> This shows an improvement of 1.9% in reducing the CPU cycles and data
> cache misses.
> 
> Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
> ---
> Changes in v2:
> 
> - addressed Nicolai's comments by setting the .netdev and .napi members
>   and dropped the useless comment and .dma_dir initialization

> [...]

Reviewed-by: Nicolai Buchwitz <nb@tipi-net.de>
Re: [PATCH net-next v2] net: bcmasp: Switch to page pool for RX path
Posted by Justin Chen 2 months, 1 week ago

On 4/7/2026 5:18 PM, Florian Fainelli wrote:
> This shows an improvement of 1.9% in reducing the CPU cycles and data
> cache misses.
> 
> Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>

Reviewed-by: Justin Chen <justin.chen@broadcom.com>