[net-next v5 09/12] net: bnxt: Add SW GSO completion and teardown support

Posted by Joe Damato 1 week, 3 days ago
Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO
segments:

- MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free
  (the skb is shared across all segments and freed only once)

- LAST segments: if the DMA IOVA path was used, use dma_iova_destroy to
  tear down the contiguous mapping. On the fallback path, payload DMA
  unmapping is handled by the existing per-BD dma_unmap_len walk.

Both MID and LAST completions advance tx_inline_cons to release the
segment's inline header slot back to the ring.

is_sw_gso is initialized to zero, so the new code paths are not run.
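
For reviewers, a rough sketch of the inline header slot lifecycle this
relies on (not driver code: tx_inline_prod and the slot math below are
illustrative shorthand for what earlier patches in the series set up;
only tx_inline_cons and the buffer sizing appear in this patch):

	/* Sketch: the inline buffer is a flat array of
	 * BNXT_SW_USO_MAX_SEGS header slots of TSO_HEADER_SIZE bytes.
	 * xmit claims slot (tx_inline_prod++ % BNXT_SW_USO_MAX_SEGS);
	 * each MID/LAST completion does tx_inline_cons++ to release one.
	 */
	static u8 *bnxt_inline_hdr_slot(struct bnxt_tx_ring_info *txr, u16 idx)
	{
		return txr->tx_inline_buf +
		       (idx % BNXT_SW_USO_MAX_SEGS) * TSO_HEADER_SIZE;
	}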

Suggested-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Joe Damato <joe@dama.to>
---
 v5:
   - Added Pavan's Reviewed-by. No functional changes.

 v3:
   - completion paths updated to use DMA IOVA APIs to tear down mappings.

 rfcv2:
   - Update the shared header buffer consumer on TX completion.

 drivers/net/ethernet/broadcom/bnxt/bnxt.c     | 82 +++++++++++++++++--
 .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++-
 2 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2759a4e2b148..40a16f96feba 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -74,6 +74,8 @@
 #include "bnxt_debugfs.h"
 #include "bnxt_coredump.h"
 #include "bnxt_hwmon.h"
+#include "bnxt_gso.h"
+#include <net/tso.h>
 
 #define BNXT_TX_TIMEOUT		(5 * HZ)
 #define BNXT_DEF_MSG_ENABLE	(NETIF_MSG_DRV | NETIF_MSG_HW | \
@@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
 	bool rc = false;
 
 	while (RING_TX(bp, cons) != hw_cons) {
-		struct bnxt_sw_tx_bd *tx_buf;
+		struct bnxt_sw_tx_bd *tx_buf, *head_buf;
 		struct sk_buff *skb;
 		bool is_ts_pkt;
 		int j, last;
 
 		tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
+		head_buf = tx_buf;
 		skb = tx_buf->skb;
 
 		if (unlikely(!skb)) {
@@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
 							    DMA_TO_DEVICE, 0);
 			}
 		}
+
+		if (unlikely(head_buf->is_sw_gso)) {
+			txr->tx_inline_cons++;
+			if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+				if (dma_use_iova(&head_buf->iova_state))
+					dma_iova_destroy(&pdev->dev,
+							 &head_buf->iova_state,
+							 head_buf->iova_total_len,
+							 DMA_TO_DEVICE, 0);
+			} else {
+				tx_pkts--;
+				tx_bytes -= skb->len;
+				skb = NULL;
+			}
+			head_buf->is_sw_gso = 0;
+		}
+
 		if (unlikely(is_ts_pkt)) {
 			if (BNXT_CHIP_P5(bp)) {
 				/* PTP worker takes ownership of the skb */
@@ -3420,6 +3440,7 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
 
 	for (i = 0; i < max_idx;) {
 		struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i];
+		struct bnxt_sw_tx_bd *head_buf = tx_buf;
 		struct sk_buff *skb;
 		int j, last;
 
@@ -3472,7 +3493,20 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
 							    DMA_TO_DEVICE, 0);
 			}
 		}
-		dev_kfree_skb(skb);
+		if (head_buf->is_sw_gso) {
+			txr->tx_inline_cons++;
+			if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+				if (dma_use_iova(&head_buf->iova_state))
+					dma_iova_destroy(&pdev->dev,
+							 &head_buf->iova_state,
+							 head_buf->iova_total_len,
+							 DMA_TO_DEVICE, 0);
+			} else {
+				skb = NULL;
+			}
+		}
+		if (skb)
+			dev_kfree_skb(skb);
 	}
 	netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx));
 }
@@ -3998,9 +4032,9 @@ static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr,
 	txr->tx_inline_size = 0;
 }
 
-static int __maybe_unused bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr,
-						   struct pci_dev *pdev,
-						   unsigned int size)
+static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr,
+				    struct pci_dev *pdev,
+				    unsigned int size)
 {
 	txr->tx_inline_buf = kmalloc(size, GFP_KERNEL);
 	if (!txr->tx_inline_buf)
@@ -4103,6 +4137,14 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
 				sizeof(struct tx_push_bd);
 			txr->data_mapping = cpu_to_le64(mapping);
 		}
+		if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+		    (bp->dev->features & NETIF_F_GSO_UDP_L4)) {
+			rc = bnxt_alloc_tx_inline_buf(txr, pdev,
+						      BNXT_SW_USO_MAX_SEGS *
+						      TSO_HEADER_SIZE);
+			if (rc)
+				return rc;
+		}
 		qidx = bp->tc_to_qidx[j];
 		ring->queue_id = bp->q_info[qidx].queue_id;
 		spin_lock_init(&txr->xdp_tx_lock);
@@ -4645,6 +4687,10 @@ static int bnxt_init_tx_rings(struct bnxt *bp)
 
 	bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
 				   BNXT_MIN_TX_DESC_CNT);
+	if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+	    (bp->dev->features & NETIF_F_GSO_UDP_L4))
+		bp->tx_wake_thresh = max_t(int, bp->tx_wake_thresh,
+					   BNXT_SW_USO_MAX_DESCS);
 
 	for (i = 0; i < bp->tx_nr_rings; i++) {
 		struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
@@ -13833,6 +13879,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev,
 	if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false))
 		features &= ~NETIF_F_NTUPLE;
 
+	if ((features & NETIF_F_GSO_UDP_L4) &&
+	    !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+	    bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS)
+		features &= ~NETIF_F_GSO_UDP_L4;
+
 	if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog)
 		features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW);
 
@@ -13878,6 +13929,15 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
 	int rc = 0;
 	bool re_init = false;
 
+	if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+		if (features & NETIF_F_GSO_UDP_L4)
+			bp->tx_wake_thresh = max_t(int, bp->tx_wake_thresh,
+						   BNXT_SW_USO_MAX_DESCS);
+		else
+			bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
+						   BNXT_MIN_TX_DESC_CNT);
+	}
+
 	flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS;
 	if (features & NETIF_F_GRO_HW)
 		flags |= BNXT_FLAG_GRO;
@@ -16881,8 +16941,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 			   NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
 			   NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH |
 			   NETIF_F_RXCSUM | NETIF_F_GRO;
-	if (bp->flags & BNXT_FLAG_UDP_GSO_CAP)
-		dev->hw_features |= NETIF_F_GSO_UDP_L4;
+	dev->hw_features |= NETIF_F_GSO_UDP_L4;
 
 	if (BNXT_SUPPORTS_TPA(bp))
 		dev->hw_features |= NETIF_F_LRO;
@@ -16915,8 +16974,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	dev->priv_flags |= IFF_UNICAST_FLT;
 
 	netif_set_tso_max_size(dev, GSO_MAX_SIZE);
-	if (bp->tso_max_segs)
+	if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+		u16 max_segs = BNXT_SW_USO_MAX_SEGS;
+
+		if (bp->tso_max_segs)
+			max_segs = min_t(u16, max_segs, bp->tso_max_segs);
+		netif_set_tso_max_segs(dev, max_segs);
+	} else if (bp->tso_max_segs) {
 		netif_set_tso_max_segs(dev, bp->tso_max_segs);
+	}
 
 	dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
 			    NETDEV_XDP_ACT_RX_SG;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 48e8e3be70d3..44b3fd18fcbe 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -33,6 +33,7 @@
 #include "bnxt_xdp.h"
 #include "bnxt_ptp.h"
 #include "bnxt_ethtool.h"
+#include "bnxt_gso.h"
 #include "bnxt_nvm_defs.h"	/* NVRAM content constant and structure defs */
 #include "bnxt_fw_hdr.h"	/* Firmware hdr constant and structure defs */
 #include "bnxt_coredump.h"
@@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev,
 	u8 tcp_data_split = kernel_ering->tcp_data_split;
 	struct bnxt *bp = netdev_priv(dev);
 	u8 hds_config_mod;
+	int rc;
 
 	if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) ||
 	    (ering->tx_pending > BNXT_MAX_TX_DESC_CNT) ||
 	    (ering->tx_pending < BNXT_MIN_TX_DESC_CNT))
 		return -EINVAL;
 
+	if ((dev->features & NETIF_F_GSO_UDP_L4) &&
+	    !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+	    ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS)
+		return -EINVAL;
+
 	hds_config_mod = tcp_data_split != dev->cfg->hds_config;
 	if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod)
 		return -EINVAL;
@@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev,
 	bp->tx_ring_size = ering->tx_pending;
 	bnxt_set_ring_params(bp);
 
-	if (netif_running(dev))
-		return bnxt_open_nic(bp, false, false);
+	if (netif_running(dev)) {
+		rc = bnxt_open_nic(bp, false, false);
+		if (rc)
+			return rc;
+	}
 
+	/* ring size changes may affect features (SW USO requires a minimum
+	 * ring size), so recalculate features to ensure the correct features
+	 * are blocked/available.
+	 */
+	netdev_update_features(dev);
 	return 0;
 }
 
-- 
2.52.0
Re: [net-next v5 09/12] net: bnxt: Add SW GSO completion and teardown support
Posted by Paolo Abeni 1 week, 1 day ago
On 3/23/26 7:38 PM, Joe Damato wrote:
> [...]
> @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
>  							    DMA_TO_DEVICE, 0);
>  			}
>  		}
> +
> +		if (unlikely(head_buf->is_sw_gso)) {
> +			txr->tx_inline_cons++;
> +			if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
> +				if (dma_use_iova(&head_buf->iova_state))

I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
initializes head_buf->iova_state only when
`dma_use_iova(&head_buf->iova_state) == true`. I.e., in the fallback
scenario the previous iova_state is retained.

Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.
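
For reference, its signature as I read it in include/linux/dma-mapping.h
(worth double-checking against the current tree):

	void dma_iova_destroy(struct device *dev, struct dma_iova_state *state,
			      size_t mapped_len, enum dma_data_direction dir,
			      unsigned long attrs);

i.e. it unlinks and frees the IOVA range but leaves *state untouched.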

It looks like, if 2 consecutive skbs hitting the same slot use
different DMA mapping strategies (fallback vs iova), bad things will
happen?!? Should the previous patch always initialize
head_buf->iova_state?

/P
Re: [net-next v5 09/12] net: bnxt: Add SW GSO completion and teardown support
Posted by Joe Damato 1 week, 1 day ago
On Thu, Mar 26, 2026 at 01:39:17PM +0100, Paolo Abeni wrote:
> On 3/23/26 7:38 PM, Joe Damato wrote:
> > [...]
> > @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
> >  							    DMA_TO_DEVICE, 0);
> >  			}
> >  		}
> > +
> > +		if (unlikely(head_buf->is_sw_gso)) {
> > +			txr->tx_inline_cons++;
> > +			if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
> > +				if (dma_use_iova(&head_buf->iova_state))
> 
> I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
> initializes head_buf->iova_state only when
> `dma_use_iova(&head_buf->iova_state) == true`. I.e., in the fallback
> scenario the previous iova_state is retained.

Note that calling dma_iova_try_alloc zeroes the state before returning
whether the IOVA DMA API can be used or not, and I call it
unconditionally (see below).

> Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.

That's my understanding, too, that dma_iova_destroy doesn't clear the state.
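
For context, the check itself, as I read include/linux/dma-mapping.h
(worth double-checking), is just:

	static inline bool dma_use_iova(struct dma_iova_state *state)
	{
		return state->__size != 0;
	}

so a stale nonzero __size left behind by a prior occupant of the slot
would read as "the IOVA path was used".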
 
> It looks like, if 2 consecutive skbs hitting the same slot use
> different DMA mapping strategies (fallback vs iova), bad things will
> happen?!? Should the previous patch always initialize
> head_buf->iova_state?

AFAICT, switching the IOMMU domain would require unbinding the device,
changing the IOMMU type, and re-binding the device... which would
destroy all the rings in the process, so this wouldn't happen.

The only way I could potentially imagine this happening would be in extreme
IOVA pressure (maybe?):
  - packet A in slot N, dma_iova_try_alloc succeeds -> head_buf->iova_state
    copied
  - completion of the packet occurs, dma_iova_destroy is called,
    head_buf->iova_state is not cleared
  - packet B in slot N, dma_iova_try_alloc fails due to IOVA pressure...
    head_buf->iova_state is stale

I'm pretty skeptical that this is a realistic case, TBH.

That said, since it seems my v5 got CR, I can send a v6 with this
slight change to address the case you've mentioned above.

I'll send in a couple hours unless I hear otherwise:

diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
index 9c30ee063ef5..7c198847a771 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
@@ -142,8 +142,12 @@ netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,

                tx_buf->is_sw_gso = last ? BNXT_SW_GSO_LAST : BNXT_SW_GSO_MID;

-               /* Store IOVA state on the last segment for completion */
-               if (last && tso_dma_map_use_iova(&map)) {
+               /* Store IOVA state on the last segment for completion.
+                * Always copy so that a stale iova_state from a prior
+                * occupant of this ring slot cannot be misread by
+                * dma_use_iova() in the completion path.
+                */
+               if (last) {
                        tx_buf->iova_state = map.iova_state;
                        tx_buf->iova_total_len = map.total_len;
                }
Re: [net-next v5 09/12] net: bnxt: Add SW GSO completion and teardown support
Posted by Paolo Abeni 1 week, 1 day ago
On 3/26/26 6:02 PM, Joe Damato wrote:
> On Thu, Mar 26, 2026 at 01:39:17PM +0100, Paolo Abeni wrote:
>> On 3/23/26 7:38 PM, Joe Damato wrote:
>>> [...]
>>> @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
>>>  							    DMA_TO_DEVICE, 0);
>>>  			}
>>>  		}
>>> +
>>> +		if (unlikely(head_buf->is_sw_gso)) {
>>> +			txr->tx_inline_cons++;
>>> +			if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
>>> +				if (dma_use_iova(&head_buf->iova_state))
>>
>> I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
>> initializes head_buf->iova_state only when
>> `dma_use_iova(&head_buf->iova_state) == true`. I.e., in the fallback
>> scenario the previous iova_state is retained.
> 
> Note that calling dma_iova_try_alloc zeroes the state before returning
> whether the IOVA DMA API can be used or not, and I call it
> unconditionally (see below).
> 
>> Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.
> 
> That's my understanding, too, that dma_iova_destroy doesn't clear the state.
>  
>> It looks like, if 2 consecutive skbs hitting the same slot use
>> different DMA mapping strategies (fallback vs iova), bad things will
>> happen?!? Should the previous patch always initialize
>> head_buf->iova_state?
> 
> AFAICT, switching the IOMMU domain would require unbinding the device,
> changing the IOMMU type, and re-binding the device... which would
> destroy all the rings in the process, so this wouldn't happen.
> 
> The only way I could potentially imagine this happening would be in extreme
> IOVA pressure (maybe?):
>   - packet A in slot N, dma_iova_try_alloc succeeds -> head_buf->iova_state
>     copied
>   - completion of the packet occurs, dma_iova_destroy is called,
>     head_buf->iova_state is not cleared
>   - packet B in slot N, dma_iova_try_alloc fails due to IOVA pressure...
>     head_buf->iova_state is stale
> 
> I'm pretty skeptical that this is a realistic case, TBH.
> 
> That said, since it seems my v5 got CR, I can send a v6 with this
> slight change to address the case you've mentioned above.
> 
> I'll send in a couple hours unless I hear otherwise:
> 
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> index 9c30ee063ef5..7c198847a771 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> @@ -142,8 +142,12 @@ netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
> 
>                 tx_buf->is_sw_gso = last ? BNXT_SW_GSO_LAST : BNXT_SW_GSO_MID;
> 
> -               /* Store IOVA state on the last segment for completion */
> -               if (last && tso_dma_map_use_iova(&map)) {
> +               /* Store IOVA state on the last segment for completion.
> +                * Always copy so that a stale iova_state from a prior
> +                * occupant of this ring slot cannot be misread by
> +                * dma_use_iova() in the completion path.
> +                */
> +               if (last) {
>                         tx_buf->iova_state = map.iova_state;
>                         tx_buf->iova_total_len = map.total_len;
>                 }
> 

Since tso_dma_map_use_iova(&map) is the likely option, I tend to think
the above change is worthwhile even if the problem I feared is extremely
unlikely, if possible at all: the code is IMHO easier to follow, and
FWIW does not over-optimize an unlikely scenario.

/P