[v7, net-next 06/10] bng_en: Allocate packet buffers
Posted by Bhargava Marreddy 2 weeks, 6 days ago
Populate the RX and AGG rings with packet buffers while the rings are
being initialized.

Signed-off-by: Bhargava Marreddy <bhargava.marreddy@broadcom.com>
Reviewed-by: Vikas Gupta <vikas.gupta@broadcom.com>
Reviewed-by: Rajashekar Hudumula <rajashekar.hudumula@broadcom.com>
---
 .../net/ethernet/broadcom/bnge/bnge_netdev.c  | 223 ++++++++++++++++++
 1 file changed, 223 insertions(+)

diff --git a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c
index 77bd8f6ce39..ee7cf8596cd 100644
--- a/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c
+++ b/drivers/net/ethernet/broadcom/bnge/bnge_netdev.c
@@ -257,6 +257,76 @@ static bool bnge_separate_head_pool(struct bnge_rx_ring_info *rxr)
 	return rxr->need_head_pool || PAGE_SIZE > BNGE_RX_PAGE_SIZE;
 }
 
+static void bnge_free_one_rx_ring(struct bnge_net *bn,
+				  struct bnge_rx_ring_info *rxr)
+{
+	int i, max_idx;
+
+	if (!rxr->rx_buf_ring)
+		return;
+
+	max_idx = bn->rx_nr_pages * RX_DESC_CNT;
+
+	for (i = 0; i < max_idx; i++) {
+		struct bnge_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[i];
+		void *data = rx_buf->data;
+
+		if (!data)
+			continue;
+
+		rx_buf->data = NULL;
+		page_pool_free_va(rxr->head_pool, data, true);
+	}
+}
+
+static void bnge_free_one_rx_agg_ring(struct bnge_net *bn,
+				      struct bnge_rx_ring_info *rxr)
+{
+	int i, max_idx;
+
+	if (!rxr->rx_agg_buf_ring)
+		return;
+
+	max_idx = bn->rx_agg_nr_pages * RX_DESC_CNT;
+
+	for (i = 0; i < max_idx; i++) {
+		struct bnge_sw_rx_agg_bd *rx_agg_buf = &rxr->rx_agg_buf_ring[i];
+		netmem_ref netmem = rx_agg_buf->netmem;
+
+		if (!netmem)
+			continue;
+
+		rx_agg_buf->netmem = 0;
+		__clear_bit(i, rxr->rx_agg_bmap);
+
+		page_pool_recycle_direct_netmem(rxr->page_pool, netmem);
+	}
+}
+
+static void bnge_free_one_rx_pkt_mem(struct bnge_net *bn,
+				     struct bnge_rx_ring_info *rxr)
+{
+	bnge_free_one_rx_ring(bn, rxr);
+	bnge_free_one_rx_agg_ring(bn, rxr);
+}
+
+static void bnge_free_rx_pkt_bufs(struct bnge_net *bn)
+{
+	struct bnge_dev *bd = bn->bd;
+	int i;
+
+	if (!bn->rx_ring)
+		return;
+
+	for (i = 0; i < bd->rx_nr_rings; i++)
+		bnge_free_one_rx_pkt_mem(bn, &bn->rx_ring[i]);
+}
+
+static void bnge_free_pkts_mem(struct bnge_net *bn)
+{
+	bnge_free_rx_pkt_bufs(bn);
+}
+
 static void bnge_free_rx_rings(struct bnge_net *bn)
 {
 	struct bnge_dev *bd = bn->bd;
@@ -737,6 +807,156 @@ static void bnge_init_nq_tree(struct bnge_net *bn)
 	}
 }
 
+static netmem_ref __bnge_alloc_rx_netmem(struct bnge_net *bn,
+					 dma_addr_t *mapping,
+					 struct bnge_rx_ring_info *rxr,
+					 unsigned int *offset,
+					 gfp_t gfp)
+{
+	netmem_ref netmem;
+
+	if (PAGE_SIZE > BNGE_RX_PAGE_SIZE) {
+		netmem = page_pool_alloc_frag_netmem(rxr->page_pool, offset,
+						     BNGE_RX_PAGE_SIZE, gfp);
+	} else {
+		netmem = page_pool_alloc_netmems(rxr->page_pool, gfp);
+		*offset = 0;
+	}
+	if (!netmem)
+		return 0;
+
+	*mapping = page_pool_get_dma_addr_netmem(netmem) + *offset;
+	return netmem;
+}
+
+static u8 *__bnge_alloc_rx_frag(struct bnge_net *bn, dma_addr_t *mapping,
+				struct bnge_rx_ring_info *rxr,
+				gfp_t gfp)
+{
+	unsigned int offset;
+	struct page *page;
+
+	page = page_pool_alloc_frag(rxr->head_pool, &offset,
+				    bn->rx_buf_size, gfp);
+	if (!page)
+		return NULL;
+
+	*mapping = page_pool_get_dma_addr(page) + bn->rx_dma_offset + offset;
+	return page_address(page) + offset;
+}
+
+static int bnge_alloc_rx_data(struct bnge_net *bn,
+			      struct bnge_rx_ring_info *rxr,
+			      u16 prod, gfp_t gfp)
+{
+	struct bnge_sw_rx_bd *rx_buf = &rxr->rx_buf_ring[RING_RX(bn, prod)];
+	struct rx_bd *rxbd;
+	dma_addr_t mapping;
+	u8 *data;
+
+	rxbd = &rxr->rx_desc_ring[RX_RING(bn, prod)][RX_IDX(prod)];
+	data = __bnge_alloc_rx_frag(bn, &mapping, rxr, gfp);
+	if (!data)
+		return -ENOMEM;
+
+	rx_buf->data = data;
+	rx_buf->data_ptr = data + bn->rx_offset;
+	rx_buf->mapping = mapping;
+
+	rxbd->rx_bd_haddr = cpu_to_le64(mapping);
+
+	return 0;
+}
+
+static void bnge_alloc_one_rx_pkt_mem(struct bnge_net *bn,
+				      struct bnge_rx_ring_info *rxr,
+				      int ring_nr)
+{
+	u32 prod;
+	int i;
+
+	prod = rxr->rx_prod;
+	for (i = 0; i < bn->rx_ring_size; i++) {
+		if (bnge_alloc_rx_data(bn, rxr, prod, GFP_KERNEL)) {
+			netdev_warn(bn->netdev, "init'ed rx ring %d with %d/%d skbs only\n",
+				    ring_nr, i, bn->rx_ring_size);
+			break;
+		}
+		prod = NEXT_RX(prod);
+	}
+	rxr->rx_prod = prod;
+}
+
+static u16 bnge_find_next_agg_idx(struct bnge_rx_ring_info *rxr, u16 idx)
+{
+	u16 next, max = rxr->rx_agg_bmap_size;
+
+	next = find_next_zero_bit(rxr->rx_agg_bmap, max, idx);
+	if (next >= max)
+		next = find_first_zero_bit(rxr->rx_agg_bmap, max);
+	return next;
+}
+
+static int bnge_alloc_rx_netmem(struct bnge_net *bn,
+				struct bnge_rx_ring_info *rxr,
+				u16 prod, gfp_t gfp)
+{
+	struct bnge_sw_rx_agg_bd *rx_agg_buf;
+	u16 sw_prod = rxr->rx_sw_agg_prod;
+	unsigned int offset = 0;
+	struct rx_bd *rxbd;
+	dma_addr_t mapping;
+	netmem_ref netmem;
+
+	rxbd = &rxr->rx_agg_desc_ring[RX_AGG_RING(bn, prod)][RX_IDX(prod)];
+	netmem = __bnge_alloc_rx_netmem(bn, &mapping, rxr, &offset, gfp);
+	if (!netmem)
+		return -ENOMEM;
+
+	if (unlikely(test_bit(sw_prod, rxr->rx_agg_bmap)))
+		sw_prod = bnge_find_next_agg_idx(rxr, sw_prod);
+
+	__set_bit(sw_prod, rxr->rx_agg_bmap);
+	rx_agg_buf = &rxr->rx_agg_buf_ring[sw_prod];
+	rxr->rx_sw_agg_prod = RING_RX_AGG(bn, NEXT_RX_AGG(sw_prod));
+
+	rx_agg_buf->netmem = netmem;
+	rx_agg_buf->offset = offset;
+	rx_agg_buf->mapping = mapping;
+	rxbd->rx_bd_haddr = cpu_to_le64(mapping);
+	rxbd->rx_bd_opaque = sw_prod;
+	return 0;
+}
+
+static void bnge_alloc_one_rx_ring_netmem(struct bnge_net *bn,
+					  struct bnge_rx_ring_info *rxr,
+					  int ring_nr)
+{
+	u32 prod;
+	int i;
+
+	prod = rxr->rx_agg_prod;
+	for (i = 0; i < bn->rx_agg_ring_size; i++) {
+		if (bnge_alloc_rx_netmem(bn, rxr, prod, GFP_KERNEL)) {
+			netdev_warn(bn->netdev, "init'ed rx agg ring %d with %d/%d pages only\n",
+				    ring_nr, i, bn->rx_agg_ring_size);
+			break;
+		}
+		prod = NEXT_RX_AGG(prod);
+	}
+	rxr->rx_agg_prod = prod;
+}
+
+static void bnge_alloc_one_rx_ring(struct bnge_net *bn, int ring_nr)
+{
+	struct bnge_rx_ring_info *rxr = &bn->rx_ring[ring_nr];
+
+	bnge_alloc_one_rx_pkt_mem(bn, rxr, ring_nr);
+
+	if (bnge_is_agg_reqd(bn->bd))
+		bnge_alloc_one_rx_ring_netmem(bn, rxr, ring_nr);
+}
+
 static void bnge_init_rxbd_pages(struct bnge_ring_struct *ring, u32 type)
 {
 	struct rx_bd **rx_desc_ring;
@@ -799,6 +1019,8 @@ static void bnge_init_one_rx_ring(struct bnge_net *bn, int ring_nr)
 			     &rxr->bnapi->napi);
 
 	bnge_init_one_rx_agg_ring_rxbd(bn, rxr);
+
+	bnge_alloc_one_rx_ring(bn, ring_nr);
 }
 
 static void bnge_init_rx_rings(struct bnge_net *bn)
@@ -1106,6 +1328,7 @@ static void bnge_close_core(struct bnge_net *bn)
 	struct bnge_dev *bd = bn->bd;
 
 	clear_bit(BNGE_STATE_OPEN, &bd->state);
+	bnge_free_pkts_mem(bn);
 	bnge_free_irq(bn);
 	bnge_del_napi(bn);
 
-- 
2.47.3
Re: [v7, net-next 06/10] bng_en: Allocate packet buffers
Posted by Jakub Kicinski 2 weeks, 3 days ago
On Fri, 12 Sep 2025 01:05:01 +0530 Bhargava Marreddy wrote:
> +static void bnge_alloc_one_rx_pkt_mem(struct bnge_net *bn,
> +				      struct bnge_rx_ring_info *rxr,
> +				      int ring_nr)
> +{
> +	u32 prod;
> +	int i;
> +
> +	prod = rxr->rx_prod;
> +	for (i = 0; i < bn->rx_ring_size; i++) {
> +		if (bnge_alloc_rx_data(bn, rxr, prod, GFP_KERNEL)) {
> +			netdev_warn(bn->netdev, "init'ed rx ring %d with %d/%d skbs only\n",
> +				    ring_nr, i, bn->rx_ring_size);
> +			break;
> +		}
> +		prod = NEXT_RX(prod);
> +	}
> +	rxr->rx_prod = prod;

You should have some sort of minimal fill level of the Rx rings.
Right now ndo_open will succeed even when Rx rings are completely empty.
Looks like you made even more functions void since v6, this is going in
the wrong direction. Most drivers actually expect the entire ring to be
filled. You can have a partial fill, but knowing bnxt I'm worried the
driver will actually never try to fill the rings back up.
-- 
pw-bot: cr
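
As an illustration of the minimal fill level suggested above, the loop
in bnge_alloc_one_rx_pkt_mem() could report failure when it posts fewer
than some threshold of buffers. This is only a sketch: the int return,
the netdev_err() message, and the BNGE_RX_RING_MIN_FILL threshold are
assumptions, not code from the posted series.

#define BNGE_RX_RING_MIN_FILL(bn)	((bn)->rx_ring_size / 2)

static int bnge_alloc_one_rx_pkt_mem(struct bnge_net *bn,
				     struct bnge_rx_ring_info *rxr,
				     int ring_nr)
{
	u32 prod = rxr->rx_prod;
	int i;

	for (i = 0; i < bn->rx_ring_size; i++) {
		if (bnge_alloc_rx_data(bn, rxr, prod, GFP_KERNEL))
			break;
		prod = NEXT_RX(prod);
	}
	rxr->rx_prod = prod;

	/* Fail the open path instead of coming up with a nearly empty ring. */
	if (i < BNGE_RX_RING_MIN_FILL(bn)) {
		netdev_err(bn->netdev, "rx ring %d only filled %d/%d bufs\n",
			   ring_nr, i, bn->rx_ring_size);
		return -ENOMEM;
	}

	return 0;
}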
Re: [v7, net-next 06/10] bng_en: Allocate packet buffers
Posted by Bhargava Chenna Marreddy 2 weeks, 3 days ago
On Mon, Sep 15, 2025 at 2:01 AM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Fri, 12 Sep 2025 01:05:01 +0530 Bhargava Marreddy wrote:
> > +static void bnge_alloc_one_rx_pkt_mem(struct bnge_net *bn,
> > +                                   struct bnge_rx_ring_info *rxr,
> > +                                   int ring_nr)
> > +{
> > +     u32 prod;
> > +     int i;
> > +
> > +     prod = rxr->rx_prod;
> > +     for (i = 0; i < bn->rx_ring_size; i++) {
> > +             if (bnge_alloc_rx_data(bn, rxr, prod, GFP_KERNEL)) {
> > +                     netdev_warn(bn->netdev, "init'ed rx ring %d with %d/%d skbs only\n",
> > +                                 ring_nr, i, bn->rx_ring_size);
> > +                     break;
> > +             }
> > +             prod = NEXT_RX(prod);
> > +     }
> > +     rxr->rx_prod = prod;
>
> You should have some sort of minimal fill level of the Rx rings.
> Right now ndo_open will succeed even when Rx rings are completely empty.
> Looks like you made even more functions void since v6, this is going in
I changed those functions to void only because in this patchset they can’t fail.
> the wrong direction. Most drivers actually expect the entire ring to be
> filled. You can have a partial fill, but knowing bnxt I'm worried the
> driver will actually never try to fill the rings back up.
I believe the driver should return an error if any buffer allocation
fails and handle the unwinding accordingly.
What do you think?

Thanks,
Bhargava Marreddy
> --
> pw-bot: cr
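
The error-and-unwind approach proposed here could take roughly the
following shape, assuming bnge_alloc_one_rx_pkt_mem() and
bnge_alloc_one_rx_ring_netmem() are converted back to return an error
code (a sketch, not code from the series):

static int bnge_alloc_one_rx_ring(struct bnge_net *bn, int ring_nr)
{
	struct bnge_rx_ring_info *rxr = &bn->rx_ring[ring_nr];
	int rc;

	rc = bnge_alloc_one_rx_pkt_mem(bn, rxr, ring_nr);
	if (rc)
		return rc;

	if (bnge_is_agg_reqd(bn->bd)) {
		rc = bnge_alloc_one_rx_ring_netmem(bn, rxr, ring_nr);
		if (rc) {
			/* Unwind with the existing free helper. */
			bnge_free_one_rx_ring(bn, rxr);
			return rc;
		}
	}

	return 0;
}

bnge_init_one_rx_ring() and bnge_init_rx_rings() would then propagate
the error up to the open path so it can unwind the rings allocated so
far.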
Re: [v7, net-next 06/10] bng_en: Allocate packet buffers
Posted by Jakub Kicinski 2 weeks, 2 days ago
On Mon, 15 Sep 2025 23:26:07 +0530 Bhargava Chenna Marreddy wrote:
> > You should have some sort of minimal fill level of the Rx rings.
> > Right now ndo_open will succeed even when Rx rings are completely empty.
> > Looks like you made even more functions void since v6, this is going in  
> I changed those functions to void only because in this patchset they can’t fail.
> > the wrong direction. Most drivers actually expect the entire ring to be
> > filled. You can have a partial fill, but knowing bnxt I'm worried the
> > driver will actually never try to fill the rings back up.  
> I believe the driver should return an error if any buffer allocation
> fails and handle the unwinding accordingly.

Yes, that's also my preference. I think allowing Rx buffer lists to not
be completely filled is okay if the driver author prefers that, but in
that case there needs to be some minimal "fill level" which makes the
device operational.

Speaking of Rx fill -- bnxt drops packets when it can't allocate a
replacement buffer. This used to be the recommended way of handling
allocation failures years ago. In modern drivers I believe it's better
to let the queue run dry and have a watchdog / service task which
periodically checks for complete depletion and kicks NAPI back in.

Getting constantly interrupted with new packets when the machine is trying
to recover from a hard OOM is not very helpful.

That's just a future note, I don't think this series itself contains
much of Rx.
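
For the record, the watchdog idea could look something like the sketch
below. None of these names exist in bng_en today; the service_work
field, the bnge_rx_ring_empty() helper, and the HZ period are all
assumptions made for illustration.

/* Periodic service task: if an Rx ring has run completely dry, kick
 * NAPI so the normal refill path runs again once memory is available.
 */
static void bnge_service_task(struct work_struct *work)
{
	struct bnge_net *bn = container_of(work, struct bnge_net,
					   service_work.work); /* assumed field */
	struct bnge_dev *bd = bn->bd;
	int i;

	if (!test_bit(BNGE_STATE_OPEN, &bd->state))
		return;

	for (i = 0; i < bd->rx_nr_rings; i++) {
		struct bnge_rx_ring_info *rxr = &bn->rx_ring[i];

		/* How "empty" is detected is driver-specific; a
		 * producer/consumer comparison would be one option.
		 */
		if (bnge_rx_ring_empty(rxr)) /* assumed helper */
			napi_schedule(&rxr->bnapi->napi);
	}

	schedule_delayed_work(&bn->service_work, HZ);
}

The delayed work would be set up with INIT_DELAYED_WORK() at probe time
and first scheduled from the open path.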