Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO
segments:
- MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free
(the skb is shared across all segments and freed only once)
- LAST segments: if the DMA IOVA path was used, use dma_iova_destroy to
tear down the contiguous mapping. On the fallback path, payload DMA
unmapping is handled by the existing per-BD dma_unmap_len walk.
Both MID and LAST completions advance tx_inline_cons to release the
segment's inline header slot back to the ring.
is_sw_gso is initialized to zero, so the new code paths are not run.
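For context, the per-segment state that the completion path consumes comes
from the previous patch in this series; roughly, bnxt_sw_tx_bd carries the
fields below (illustrative only -- the exact types and layout are assumptions,
not copied from that patch):

/* Illustrative only -- the real definition comes from the previous
 * patch in this series; types and ordering here are assumptions.
 */
struct bnxt_sw_tx_bd {
	struct sk_buff		*skb;
	/* ... existing fields ... */

	/* 0 (default) = not a SW GSO segment;
	 * BNXT_SW_GSO_MID  = middle segment: skb is shared, skip the free;
	 * BNXT_SW_GSO_LAST = final segment: free the skb and, if the DMA
	 *                    IOVA path was used, destroy the mapping.
	 */
	u8			is_sw_gso;

	/* Only valid on the LAST segment when the IOVA path was used. */
	struct dma_iova_state	iova_state;
	size_t			iova_total_len;
};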
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Joe Damato <joe@dama.to>
---
v5:
- Added Pavan's Reviewed-by. No functional changes.
v3:
- completion paths updated to use DMA IOVA APIs to teardown mappings.
rfcv2:
- Update the shared header buffer consumer on TX completion.
drivers/net/ethernet/broadcom/bnxt/bnxt.c | 82 +++++++++++++++++--
.../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++-
2 files changed, 91 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
index 2759a4e2b148..40a16f96feba 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
@@ -74,6 +74,8 @@
#include "bnxt_debugfs.h"
#include "bnxt_coredump.h"
#include "bnxt_hwmon.h"
+#include "bnxt_gso.h"
+#include <net/tso.h>
#define BNXT_TX_TIMEOUT (5 * HZ)
#define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \
@@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
bool rc = false;
while (RING_TX(bp, cons) != hw_cons) {
- struct bnxt_sw_tx_bd *tx_buf;
+ struct bnxt_sw_tx_bd *tx_buf, *head_buf;
struct sk_buff *skb;
bool is_ts_pkt;
int j, last;
tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
+ head_buf = tx_buf;
skb = tx_buf->skb;
if (unlikely(!skb)) {
@@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
DMA_TO_DEVICE, 0);
}
}
+
+ if (unlikely(head_buf->is_sw_gso)) {
+ txr->tx_inline_cons++;
+ if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+ if (dma_use_iova(&head_buf->iova_state))
+ dma_iova_destroy(&pdev->dev,
+ &head_buf->iova_state,
+ head_buf->iova_total_len,
+ DMA_TO_DEVICE, 0);
+ } else {
+ tx_pkts--;
+ tx_bytes -= skb->len;
+ skb = NULL;
+ }
+ head_buf->is_sw_gso = 0;
+ }
+
if (unlikely(is_ts_pkt)) {
if (BNXT_CHIP_P5(bp)) {
/* PTP worker takes ownership of the skb */
@@ -3420,6 +3440,7 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
for (i = 0; i < max_idx;) {
struct bnxt_sw_tx_bd *tx_buf = &txr->tx_buf_ring[i];
+ struct bnxt_sw_tx_bd *head_buf = tx_buf;
struct sk_buff *skb;
int j, last;
@@ -3472,7 +3493,20 @@ static void bnxt_free_one_tx_ring_skbs(struct bnxt *bp,
DMA_TO_DEVICE, 0);
}
}
- dev_kfree_skb(skb);
+ if (head_buf->is_sw_gso) {
+ txr->tx_inline_cons++;
+ if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
+ if (dma_use_iova(&head_buf->iova_state))
+ dma_iova_destroy(&pdev->dev,
+ &head_buf->iova_state,
+ head_buf->iova_total_len,
+ DMA_TO_DEVICE, 0);
+ } else {
+ skb = NULL;
+ }
+ }
+ if (skb)
+ dev_kfree_skb(skb);
}
netdev_tx_reset_queue(netdev_get_tx_queue(bp->dev, idx));
}
@@ -3998,9 +4032,9 @@ static void bnxt_free_tx_inline_buf(struct bnxt_tx_ring_info *txr,
txr->tx_inline_size = 0;
}
-static int __maybe_unused bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr,
- struct pci_dev *pdev,
- unsigned int size)
+static int bnxt_alloc_tx_inline_buf(struct bnxt_tx_ring_info *txr,
+ struct pci_dev *pdev,
+ unsigned int size)
{
txr->tx_inline_buf = kmalloc(size, GFP_KERNEL);
if (!txr->tx_inline_buf)
@@ -4103,6 +4137,14 @@ static int bnxt_alloc_tx_rings(struct bnxt *bp)
sizeof(struct tx_push_bd);
txr->data_mapping = cpu_to_le64(mapping);
}
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ (bp->dev->features & NETIF_F_GSO_UDP_L4)) {
+ rc = bnxt_alloc_tx_inline_buf(txr, pdev,
+ BNXT_SW_USO_MAX_SEGS *
+ TSO_HEADER_SIZE);
+ if (rc)
+ return rc;
+ }
qidx = bp->tc_to_qidx[j];
ring->queue_id = bp->q_info[qidx].queue_id;
spin_lock_init(&txr->xdp_tx_lock);
@@ -4645,6 +4687,10 @@ static int bnxt_init_tx_rings(struct bnxt *bp)
bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
BNXT_MIN_TX_DESC_CNT);
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ (bp->dev->features & NETIF_F_GSO_UDP_L4))
+ bp->tx_wake_thresh = max_t(int, bp->tx_wake_thresh,
+ BNXT_SW_USO_MAX_DESCS);
for (i = 0; i < bp->tx_nr_rings; i++) {
struct bnxt_tx_ring_info *txr = &bp->tx_ring[i];
@@ -13833,6 +13879,11 @@ static netdev_features_t bnxt_fix_features(struct net_device *dev,
if ((features & NETIF_F_NTUPLE) && !bnxt_rfs_capable(bp, false))
features &= ~NETIF_F_NTUPLE;
+ if ((features & NETIF_F_GSO_UDP_L4) &&
+ !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ bp->tx_ring_size < 2 * BNXT_SW_USO_MAX_DESCS)
+ features &= ~NETIF_F_GSO_UDP_L4;
+
if ((bp->flags & BNXT_FLAG_NO_AGG_RINGS) || bp->xdp_prog)
features &= ~(NETIF_F_LRO | NETIF_F_GRO_HW);
@@ -13878,6 +13929,15 @@ static int bnxt_set_features(struct net_device *dev, netdev_features_t features)
int rc = 0;
bool re_init = false;
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+ if (features & NETIF_F_GSO_UDP_L4)
+ bp->tx_wake_thresh = max_t(int, bp->tx_wake_thresh,
+ BNXT_SW_USO_MAX_DESCS);
+ else
+ bp->tx_wake_thresh = max_t(int, bp->tx_ring_size / 2,
+ BNXT_MIN_TX_DESC_CNT);
+ }
+
flags &= ~BNXT_FLAG_ALL_CONFIG_FEATS;
if (features & NETIF_F_GRO_HW)
flags |= BNXT_FLAG_GRO;
@@ -16881,8 +16941,7 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
NETIF_F_GSO_UDP_TUNNEL_CSUM | NETIF_F_GSO_GRE_CSUM |
NETIF_F_GSO_PARTIAL | NETIF_F_RXHASH |
NETIF_F_RXCSUM | NETIF_F_GRO;
- if (bp->flags & BNXT_FLAG_UDP_GSO_CAP)
- dev->hw_features |= NETIF_F_GSO_UDP_L4;
+ dev->hw_features |= NETIF_F_GSO_UDP_L4;
if (BNXT_SUPPORTS_TPA(bp))
dev->hw_features |= NETIF_F_LRO;
@@ -16915,8 +16974,15 @@ static int bnxt_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
dev->priv_flags |= IFF_UNICAST_FLT;
netif_set_tso_max_size(dev, GSO_MAX_SIZE);
- if (bp->tso_max_segs)
+ if (!(bp->flags & BNXT_FLAG_UDP_GSO_CAP)) {
+ u16 max_segs = BNXT_SW_USO_MAX_SEGS;
+
+ if (bp->tso_max_segs)
+ max_segs = min_t(u16, max_segs, bp->tso_max_segs);
+ netif_set_tso_max_segs(dev, max_segs);
+ } else if (bp->tso_max_segs) {
netif_set_tso_max_segs(dev, bp->tso_max_segs);
+ }
dev->xdp_features = NETDEV_XDP_ACT_BASIC | NETDEV_XDP_ACT_REDIRECT |
NETDEV_XDP_ACT_RX_SG;
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
index 48e8e3be70d3..44b3fd18fcbe 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_ethtool.c
@@ -33,6 +33,7 @@
#include "bnxt_xdp.h"
#include "bnxt_ptp.h"
#include "bnxt_ethtool.h"
+#include "bnxt_gso.h"
#include "bnxt_nvm_defs.h" /* NVRAM content constant and structure defs */
#include "bnxt_fw_hdr.h" /* Firmware hdr constant and structure defs */
#include "bnxt_coredump.h"
@@ -852,12 +853,18 @@ static int bnxt_set_ringparam(struct net_device *dev,
u8 tcp_data_split = kernel_ering->tcp_data_split;
struct bnxt *bp = netdev_priv(dev);
u8 hds_config_mod;
+ int rc;
if ((ering->rx_pending > BNXT_MAX_RX_DESC_CNT) ||
(ering->tx_pending > BNXT_MAX_TX_DESC_CNT) ||
(ering->tx_pending < BNXT_MIN_TX_DESC_CNT))
return -EINVAL;
+ if ((dev->features & NETIF_F_GSO_UDP_L4) &&
+ !(bp->flags & BNXT_FLAG_UDP_GSO_CAP) &&
+ ering->tx_pending < 2 * BNXT_SW_USO_MAX_DESCS)
+ return -EINVAL;
+
hds_config_mod = tcp_data_split != dev->cfg->hds_config;
if (tcp_data_split == ETHTOOL_TCP_DATA_SPLIT_DISABLED && hds_config_mod)
return -EINVAL;
@@ -882,9 +889,17 @@ static int bnxt_set_ringparam(struct net_device *dev,
bp->tx_ring_size = ering->tx_pending;
bnxt_set_ring_params(bp);
- if (netif_running(dev))
- return bnxt_open_nic(bp, false, false);
+ if (netif_running(dev)) {
+ rc = bnxt_open_nic(bp, false, false);
+ if (rc)
+ return rc;
+ }
+ /* ring size changes may affect features (SW USO requires a minimum
+ * ring size), so recalculate features to ensure the correct features
+ * are blocked/available.
+ */
+ netdev_update_features(dev);
return 0;
}
--
2.52.0
On 3/23/26 7:38 PM, Joe Damato wrote:
> Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO
> segments:
>
> - MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free
> (the skb is shared across all segments and freed only once)
>
> - LAST segments: if the DMA IOVA path was used, use dma_iova_destroy to
> tear down the contiguous mapping. On the fallback path, payload DMA
> unmapping is handled by the existing per-BD dma_unmap_len walk.
>
> Both MID and LAST completions advance tx_inline_cons to release the
> segment's inline header slot back to the ring.
>
> is_sw_gso is initialized to zero, so the new code paths are not run.
>
> Suggested-by: Jakub Kicinski <kuba@kernel.org>
> Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
> Signed-off-by: Joe Damato <joe@dama.to>
> ---
> v5:
> - Added Pavan's Reviewed-by. No functional changes.
>
> v3:
> - completion paths updated to use DMA IOVA APIs to teardown mappings.
>
> rfcv2:
> - Update the shared header buffer consumer on TX completion.
>
> drivers/net/ethernet/broadcom/bnxt/bnxt.c | 82 +++++++++++++++++--
> .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++-
> 2 files changed, 91 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> index 2759a4e2b148..40a16f96feba 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> @@ -74,6 +74,8 @@
> #include "bnxt_debugfs.h"
> #include "bnxt_coredump.h"
> #include "bnxt_hwmon.h"
> +#include "bnxt_gso.h"
> +#include <net/tso.h>
>
> #define BNXT_TX_TIMEOUT (5 * HZ)
> #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \
> @@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
> bool rc = false;
>
> while (RING_TX(bp, cons) != hw_cons) {
> - struct bnxt_sw_tx_bd *tx_buf;
> + struct bnxt_sw_tx_bd *tx_buf, *head_buf;
> struct sk_buff *skb;
> bool is_ts_pkt;
> int j, last;
>
> tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
> + head_buf = tx_buf;
> skb = tx_buf->skb;
>
> if (unlikely(!skb)) {
> @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
> DMA_TO_DEVICE, 0);
> }
> }
> +
> + if (unlikely(head_buf->is_sw_gso)) {
> + txr->tx_inline_cons++;
> + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
> + if (dma_use_iova(&head_buf->iova_state))
I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
initializes head_buf->iova_state only when
`dma_use_iova(&head_buf->iova_state) == true`. I.e. in the fallback scenario
the previous iova_state is maintained.
Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.
It looks like if 2 consecutive skbs hitting the same slot use
different dma mapping strategies (fallback vs iova), bad things will
happen?!? Should the previous patch always initialize
head_buf->iova_state?
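For reference, dma_use_iova() just checks that the state size is non-zero --
roughly the following, per current <linux/dma-mapping.h>:

static inline bool dma_use_iova(struct dma_iova_state *state)
{
	return state->__size != 0;
}

so a stale iova_state left over from a previous occupant of the slot would be
read as a live IOVA mapping by the completion path.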
/P
On Thu, Mar 26, 2026 at 01:39:17PM +0100, Paolo Abeni wrote:
> On 3/23/26 7:38 PM, Joe Damato wrote:
> > Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO
> > segments:
> >
> > - MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free
> > (the skb is shared across all segments and freed only once)
> >
> > - LAST segments: if the DMA IOVA path was used, use dma_iova_destroy to
> > tear down the contiguous mapping. On the fallback path, payload DMA
> > unmapping is handled by the existing per-BD dma_unmap_len walk.
> >
> > Both MID and LAST completions advance tx_inline_cons to release the
> > segment's inline header slot back to the ring.
> >
> > is_sw_gso is initialized to zero, so the new code paths are not run.
> >
> > Suggested-by: Jakub Kicinski <kuba@kernel.org>
> > Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
> > Signed-off-by: Joe Damato <joe@dama.to>
> > ---
> > v5:
> > - Added Pavan's Reviewed-by. No functional changes.
> >
> > v3:
> > - completion paths updated to use DMA IOVA APIs to teardown mappings.
> >
> > rfcv2:
> > - Update the shared header buffer consumer on TX completion.
> >
> > drivers/net/ethernet/broadcom/bnxt/bnxt.c | 82 +++++++++++++++++--
> > .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++-
> > 2 files changed, 91 insertions(+), 10 deletions(-)
> >
> > diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > index 2759a4e2b148..40a16f96feba 100644
> > --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
> > @@ -74,6 +74,8 @@
> > #include "bnxt_debugfs.h"
> > #include "bnxt_coredump.h"
> > #include "bnxt_hwmon.h"
> > +#include "bnxt_gso.h"
> > +#include <net/tso.h>
> >
> > #define BNXT_TX_TIMEOUT (5 * HZ)
> > #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \
> > @@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
> > bool rc = false;
> >
> > while (RING_TX(bp, cons) != hw_cons) {
> > - struct bnxt_sw_tx_bd *tx_buf;
> > + struct bnxt_sw_tx_bd *tx_buf, *head_buf;
> > struct sk_buff *skb;
> > bool is_ts_pkt;
> > int j, last;
> >
> > tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
> > + head_buf = tx_buf;
> > skb = tx_buf->skb;
> >
> > if (unlikely(!skb)) {
> > @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
> > DMA_TO_DEVICE, 0);
> > }
> > }
> > +
> > + if (unlikely(head_buf->is_sw_gso)) {
> > + txr->tx_inline_cons++;
> > + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
> > + if (dma_use_iova(&head_buf->iova_state))
>
> I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
> initializes head_buf->iova_state only when
> `dma_use_iova(&head_buf->iova_state) == true`. I.e. in the fallback scenario
> the previous iova_state is maintained.
Note that calling dma_iova_try_alloc zeroes the state before returning whether
the IOVA DMA API can be used or not, and I call that unconditionally (see
below).
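For reference, abbreviated from kernel/dma/mapping.c (body elided and quoted
from memory, so details may differ slightly) -- the state is zeroed before
anything else:

bool dma_iova_try_alloc(struct device *dev, struct dma_iova_state *state,
			phys_addr_t phys, size_t size)
{
	memset(state, 0, sizeof(*state));
	if (!use_dma_iommu(dev))
		return false;
	/* ... otherwise try to allocate the IOVA range and fill *state ... */
}

So map.iova_state is well defined on both the IOVA and the fallback paths; the
question is only whether tx_buf->iova_state gets that copy.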
> Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.
That's my understanding, too, that dma_iova_destroy doesn't clear the state.
> It looks like if 2 consecutive skbs hitting the same slot use
> different dma mapping strategies (fallback vs iova), bad things will
> happen?!? Should the previous patch always initialize
> head_buf->iova_state?
AFAICT, switching the IOMMU domain would require unbinding the device, changing
the IOMMU type, and re-binding the device... which would destroy all the rings
in the process and thus this wouldn't happen.
The only way I could potentially imagine this happening would be under extreme
IOVA pressure (maybe?):
- packet A in slot N, dma_iova_try_alloc succeeds -> head_buf->iova_state
copied
- completion of the packet occurs, dma_iova_destroy is called,
head_buf->iova_state is not cleared
- packet B in slot N, dma_iova_try_alloc fails due to IOVA pressure...
head_buf->iova_state is stale
I'm pretty skeptical that this is a realistic case, TBH.
That said and since it seems my v5 got CR, I can send a v6 with this slight
change to address the case you've mentioned above.
I'll send in a couple hours unless I hear otherwise:
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
index 9c30ee063ef5..7c198847a771 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
@@ -142,8 +142,12 @@ netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
tx_buf->is_sw_gso = last ? BNXT_SW_GSO_LAST : BNXT_SW_GSO_MID;
- /* Store IOVA state on the last segment for completion */
- if (last && tso_dma_map_use_iova(&map)) {
+ /* Store IOVA state on the last segment for completion.
+ * Always copy so that a stale iova_state from a prior
+ * occupant of this ring slot cannot be misread by
+ * dma_use_iova() in the completion path.
+ */
+ if (last) {
tx_buf->iova_state = map.iova_state;
tx_buf->iova_total_len = map.total_len;
}
On 3/26/26 6:02 PM, Joe Damato wrote:
> On Thu, Mar 26, 2026 at 01:39:17PM +0100, Paolo Abeni wrote:
>> On 3/23/26 7:38 PM, Joe Damato wrote:
>>> Update __bnxt_tx_int and bnxt_free_one_tx_ring_skbs to handle SW GSO
>>> segments:
>>>
>>> - MID segments: adjust tx_pkts/tx_bytes accounting and skip skb free
>>> (the skb is shared across all segments and freed only once)
>>>
>>> - LAST segments: if the DMA IOVA path was used, use dma_iova_destroy to
>>> tear down the contiguous mapping. On the fallback path, payload DMA
>>> unmapping is handled by the existing per-BD dma_unmap_len walk.
>>>
>>> Both MID and LAST completions advance tx_inline_cons to release the
>>> segment's inline header slot back to the ring.
>>>
>>> is_sw_gso is initialized to zero, so the new code paths are not run.
>>>
>>> Suggested-by: Jakub Kicinski <kuba@kernel.org>
>>> Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
>>> Signed-off-by: Joe Damato <joe@dama.to>
>>> ---
>>> v5:
>>> - Added Pavan's Reviewed-by. No functional changes.
>>>
>>> v3:
>>> - completion paths updated to use DMA IOVA APIs to teardown mappings.
>>>
>>> rfcv2:
>>> - Update the shared header buffer consumer on TX completion.
>>>
>>> drivers/net/ethernet/broadcom/bnxt/bnxt.c | 82 +++++++++++++++++--
>>> .../net/ethernet/broadcom/bnxt/bnxt_ethtool.c | 19 ++++-
>>> 2 files changed, 91 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> index 2759a4e2b148..40a16f96feba 100644
>>> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
>>> @@ -74,6 +74,8 @@
>>> #include "bnxt_debugfs.h"
>>> #include "bnxt_coredump.h"
>>> #include "bnxt_hwmon.h"
>>> +#include "bnxt_gso.h"
>>> +#include <net/tso.h>
>>>
>>> #define BNXT_TX_TIMEOUT (5 * HZ)
>>> #define BNXT_DEF_MSG_ENABLE (NETIF_MSG_DRV | NETIF_MSG_HW | \
>>> @@ -817,12 +819,13 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
>>> bool rc = false;
>>>
>>> while (RING_TX(bp, cons) != hw_cons) {
>>> - struct bnxt_sw_tx_bd *tx_buf;
>>> + struct bnxt_sw_tx_bd *tx_buf, *head_buf;
>>> struct sk_buff *skb;
>>> bool is_ts_pkt;
>>> int j, last;
>>>
>>> tx_buf = &txr->tx_buf_ring[RING_TX(bp, cons)];
>>> + head_buf = tx_buf;
>>> skb = tx_buf->skb;
>>>
>>> if (unlikely(!skb)) {
>>> @@ -869,6 +872,23 @@ static bool __bnxt_tx_int(struct bnxt *bp, struct bnxt_tx_ring_info *txr,
>>> DMA_TO_DEVICE, 0);
>>> }
>>> }
>>> +
>>> + if (unlikely(head_buf->is_sw_gso)) {
>>> + txr->tx_inline_cons++;
>>> + if (head_buf->is_sw_gso == BNXT_SW_GSO_LAST) {
>>> + if (dma_use_iova(&head_buf->iova_state))
>>
>> I'm likely lost, but AFAICS the previous patch/bnxt_sw_udp_gso_xmit()
>> initializes head_buf->iova_state only when
>> `dma_use_iova(&head_buf->iova_state) == true`. I.e. in the fallback scenario
>> the previous iova_state is maintained.
>
> Note that calling dma_iova_try_alloc zeroes the state before returning whether
> the IOVA DMA API can be used or not, and I call that unconditionally (see
> below).
>
>> Additionally AFAICS dma_iova_destroy does not clear `head_buf->iova_state`.
>
> That's my understanding, too, that dma_iova_destroy doesn't clear the state.
>
>> It looks like if 2 consecutive skbs hitting the same slot use
>> different dma mapping strategies (fallback vs iova), bad things will
>> happen?!? Should the previous patch always initialize
>> head_buf->iova_state?
>
> AFAICT, switching the IOMMU domain would require unbinding the device, changing
> the IOMMU type, and re-binding the device... which would destroy all the rings
> in the process and thus this wouldn't happen.
>
> The only way I could potentially imagine this happening would be under extreme
> IOVA pressure (maybe?):
> - packet A in slot N, dma_iova_try_alloc succeeds -> head_buf->iova_state
> copied
> - completion of the packet occurs, dma_iova_destroy is called,
> head_buf->iova_state is not cleared
> - packet B in slot N, dma_iova_try_alloc fails due to IOVA pressure...
> head_buf->iova_state is stale
>
> I'm pretty skeptical that this is a realistic case, TBH.
>
> That said and since it seems my v5 got CR, I can send a v6 with this slight
> change to address the case you've mentioned above.
>
> I'll send in a couple hours unless I hear otherwise:
>
> diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> index 9c30ee063ef5..7c198847a771 100644
> --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
> @@ -142,8 +142,12 @@ netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
>
> tx_buf->is_sw_gso = last ? BNXT_SW_GSO_LAST : BNXT_SW_GSO_MID;
>
> - /* Store IOVA state on the last segment for completion */
> - if (last && tso_dma_map_use_iova(&map)) {
> + /* Store IOVA state on the last segment for completion.
> + * Always copy so that a stale iova_state from a prior
> + * occupant of this ring slot cannot be misread by
> + * dma_use_iova() in the completion path.
> + */
> + if (last) {
> tx_buf->iova_state = map.iova_state;
> tx_buf->iova_total_len = map.total_len;
> }
>
Since tso_dma_map_use_iova(&map) is the likely outcome, I tend to think
that the above change is worthwhile even if the problem I feared is
extremely unlikely, if possible at all: the code is IMHO easier to
follow and, FWIW, it does not over-optimize for an unlikely scenario.
/P