Implement bnxt_sw_udp_gso_xmit() using the core tso_dma_map API and
the pre-allocated TX inline buffer for per-segment headers.
The xmit path:
1. Calls tso_start() to initialize TSO state
2. Stack-allocates a tso_dma_map and calls tso_dma_map_init() to
DMA-map the linear payload and all frags upfront.
3. For each segment:
- Copies and patches headers via tso_build_hdr() into the
pre-allocated tx_inline_buf (DMA-synced per segment)
- Counts payload BDs via tso_dma_map_count()
- Emits long BD (header) + ext BD + payload BDs
- Payload BDs use tso_dma_map_next() which yields (dma_addr,
chunk_len, mapping_len) tuples.
Header BDs set dma_unmap_len=0 since the inline buffer is pre-allocated
and unmapped only at ring teardown.
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
Signed-off-by: Joe Damato <joe@dama.to>
---
v6:
- Addressed Paolo's feedback where the IOVA API could fail transiently,
leaving stale state in iova_state. Fix this by always copying the state,
noting that dma_iova_try_alloc is called unconditionally in the
tso_dma_map_init function (via tso_dma_iova_try), which zeroes the state
even if the API can't be used.
- Since this was a very minor change, I retained Pavan's Reviewed-by.
v5:
- Added __maybe_unused to last_unmap_len and last_unmap_addr to silence a
build warning when CONFIG_NEED_DMA_MAP_STATE is disabled. No functional
changes.
- Added Pavan's Reviewed-by.
v4:
- Fixed the early return issue Pavan pointed out when num_segs <= 1; use the
drop label instead of returning.
v3:
- Added iova_state and iova_total_len to struct bnxt_sw_tx_bd.
- Stores iova_state on the last segment's tx_buf during xmit.
rfcv2:
- set the unmap len on the last descriptor, so that when completions fire
only the last completion unmaps the region.
drivers/net/ethernet/broadcom/bnxt/bnxt.h | 4 +
drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c | 210 ++++++++++++++++++
2 files changed, 214 insertions(+)
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
index 18b08789b3a4..865546f3bfce 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h
@@ -11,6 +11,8 @@
#ifndef BNXT_H
#define BNXT_H
+#include <linux/dma-mapping.h>
+
#define DRV_MODULE_NAME "bnxt_en"
/* DO NOT CHANGE DRV_VER_* defines
@@ -897,6 +899,8 @@ struct bnxt_sw_tx_bd {
u16 rx_prod;
u16 txts_prod;
};
+ struct dma_iova_state iova_state;
+ size_t iova_total_len;
};
#define BNXT_SW_GSO_MID 1
diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
index b296769ee4fe..7c198847a771 100644
--- a/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
+++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_gso.c
@@ -19,11 +19,221 @@
#include "bnxt.h"
#include "bnxt_gso.h"
+static u32 bnxt_sw_gso_lhint(unsigned int len)
+{ /* Select the TX_BD_FLAGS_LHINT_* length-hint flag for a packet of
+   * len bytes. The caller in this patch passes the per-segment total,
+   * i.e. header length plus segment payload length.
+   */
+ if (len <= 512) /* len == 512 is classified in the smallest bucket */
+ return TX_BD_FLAGS_LHINT_512_AND_SMALLER;
+ else if (len <= 1023) /* 513..1023 bytes */
+ return TX_BD_FLAGS_LHINT_512_TO_1023;
+ else if (len <= 2047) /* 1024..2047 bytes */
+ return TX_BD_FLAGS_LHINT_1024_TO_2047;
+ else /* 2048 bytes and above */
+ return TX_BD_FLAGS_LHINT_2048_AND_LARGER;
+}
+
netdev_tx_t bnxt_sw_udp_gso_xmit(struct bnxt *bp,
struct bnxt_tx_ring_info *txr,
struct netdev_queue *txq,
struct sk_buff *skb)
{
+ unsigned int last_unmap_len __maybe_unused = 0;
+ dma_addr_t last_unmap_addr __maybe_unused = 0;
+ struct bnxt_sw_tx_bd *last_unmap_buf = NULL;
+ unsigned int hdr_len, mss, num_segs;
+ struct pci_dev *pdev = bp->pdev;
+ unsigned int total_payload;
+ int i, bds_needed, slots;
+ struct tso_dma_map map;
+ u32 vlan_tag_flags = 0;
+ struct tso_t tso;
+ u16 cfa_action;
+ u16 prod;
+
+ hdr_len = tso_start(skb, &tso);
+ mss = skb_shinfo(skb)->gso_size;
+ total_payload = skb->len - hdr_len;
+ num_segs = DIV_ROUND_UP(total_payload, mss);
+
+ /* Zero the csum fields so tso_build_hdr will propagate zeroes into
+ * every segment header. HW csum offload will recompute from scratch.
+ */
+ udp_hdr(skb)->check = 0;
+ if (!tso.ipv6)
+ ip_hdr(skb)->check = 0;
+
+ if (unlikely(num_segs <= 1))
+ goto drop;
+
+ /* Upper bound on the number of descriptors needed.
+ *
+ * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is
+ * at most num_segs + nr_frags (each frag boundary crossing adds at
+ * most 1 extra BD).
+ */
+ bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1;
+
+ if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) {
+ netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+ bp->tx_wake_thresh);
+ return NETDEV_TX_BUSY;
+ }
+
+ slots = BNXT_SW_USO_MAX_SEGS - (txr->tx_inline_prod - txr->tx_inline_cons);
+
+ if (unlikely(slots < num_segs)) {
+ netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+ bp->tx_wake_thresh);
+ return NETDEV_TX_BUSY;
+ }
+
+ if (unlikely(tso_dma_map_init(&map, &pdev->dev, skb, hdr_len)))
+ goto drop;
+
+ cfa_action = bnxt_xmit_get_cfa_action(skb);
+ if (skb_vlan_tag_present(skb)) {
+ vlan_tag_flags = TX_BD_CFA_META_KEY_VLAN |
+ skb_vlan_tag_get(skb);
+ if (skb->vlan_proto == htons(ETH_P_8021Q))
+ vlan_tag_flags |= 1 << TX_BD_CFA_META_TPID_SHIFT;
+ }
+
+ prod = txr->tx_prod;
+
+ for (i = 0; i < num_segs; i++) {
+ unsigned int seg_payload = min_t(unsigned int, mss,
+ total_payload - i * mss);
+ u16 slot = (txr->tx_inline_prod + i) &
+ (BNXT_SW_USO_MAX_SEGS - 1);
+ struct bnxt_sw_tx_bd *tx_buf;
+ unsigned int mapping_len;
+ dma_addr_t this_hdr_dma;
+ unsigned int chunk_len;
+ unsigned int offset;
+ dma_addr_t dma_addr;
+ struct tx_bd *txbd;
+ void *this_hdr;
+ int bd_count;
+ __le32 csum;
+ bool last;
+ u32 flags;
+
+ last = (i == num_segs - 1);
+ offset = slot * TSO_HEADER_SIZE;
+ this_hdr = txr->tx_inline_buf + offset;
+ this_hdr_dma = txr->tx_inline_dma + offset;
+
+ tso_build_hdr(skb, this_hdr, &tso, seg_payload, last);
+
+ dma_sync_single_for_device(&pdev->dev, this_hdr_dma,
+ hdr_len, DMA_TO_DEVICE);
+
+ bd_count = tso_dma_map_count(&map, seg_payload);
+
+ tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
+ txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+
+ tx_buf->skb = skb;
+ tx_buf->nr_frags = bd_count;
+ tx_buf->is_push = 0;
+ tx_buf->is_ts_pkt = 0;
+
+ dma_unmap_addr_set(tx_buf, mapping, this_hdr_dma);
+ dma_unmap_len_set(tx_buf, len, 0);
+
+ tx_buf->is_sw_gso = last ? BNXT_SW_GSO_LAST : BNXT_SW_GSO_MID;
+
+ /* Store IOVA state on the last segment for completion.
+ * Always copy so that a stale iova_state from a prior
+ * occupant of this ring slot cannot be misread by
+ * dma_use_iova() in the completion path.
+ */
+ if (last) {
+ tx_buf->iova_state = map.iova_state;
+ tx_buf->iova_total_len = map.total_len;
+ }
+
+ flags = (hdr_len << TX_BD_LEN_SHIFT) |
+ TX_BD_TYPE_LONG_TX_BD |
+ TX_BD_CNT(2 + bd_count);
+
+ flags |= bnxt_sw_gso_lhint(hdr_len + seg_payload);
+
+ txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
+ txbd->tx_bd_haddr = cpu_to_le64(this_hdr_dma);
+ txbd->tx_bd_opaque = SET_TX_OPAQUE(bp, txr, prod,
+ 2 + bd_count);
+
+ csum = cpu_to_le32(TX_BD_FLAGS_TCP_UDP_CHKSUM |
+ TX_BD_FLAGS_IP_CKSUM);
+
+ prod = NEXT_TX(prod);
+ bnxt_init_ext_bd(bp, txr, prod, csum,
+ vlan_tag_flags, cfa_action);
+
+ /* set dma_unmap_len on the LAST BD touching each
+ * region. Since completions are in-order, the last segment
+ * completes after all earlier ones, so the unmap is safe.
+ */
+ while (tso_dma_map_next(&map, &dma_addr, &chunk_len,
+ &mapping_len, seg_payload)) {
+ prod = NEXT_TX(prod);
+ txbd = &txr->tx_desc_ring[TX_RING(bp, prod)][TX_IDX(prod)];
+ tx_buf = &txr->tx_buf_ring[RING_TX(bp, prod)];
+
+ txbd->tx_bd_haddr = cpu_to_le64(dma_addr);
+ dma_unmap_addr_set(tx_buf, mapping, dma_addr);
+ dma_unmap_len_set(tx_buf, len, 0);
+ tx_buf->skb = NULL;
+ tx_buf->is_sw_gso = 0;
+
+ if (mapping_len) {
+ if (last_unmap_buf) {
+ dma_unmap_addr_set(last_unmap_buf,
+ mapping,
+ last_unmap_addr);
+ dma_unmap_len_set(last_unmap_buf,
+ len,
+ last_unmap_len);
+ }
+ last_unmap_addr = dma_addr;
+ last_unmap_len = mapping_len;
+ }
+ last_unmap_buf = tx_buf;
+
+ flags = chunk_len << TX_BD_LEN_SHIFT;
+ txbd->tx_bd_len_flags_type = cpu_to_le32(flags);
+ txbd->tx_bd_opaque = 0;
+
+ seg_payload -= chunk_len;
+ }
+
+ txbd->tx_bd_len_flags_type |=
+ cpu_to_le32(TX_BD_FLAGS_PACKET_END);
+
+ prod = NEXT_TX(prod);
+ }
+
+ if (last_unmap_buf) {
+ dma_unmap_addr_set(last_unmap_buf, mapping, last_unmap_addr);
+ dma_unmap_len_set(last_unmap_buf, len, last_unmap_len);
+ }
+
+ txr->tx_inline_prod += num_segs;
+
+ netdev_tx_sent_queue(txq, skb->len);
+
+ WRITE_ONCE(txr->tx_prod, prod);
+ /* Sync BDs before doorbell */
+ wmb();
+ bnxt_db_write(bp, &txr->tx_db, prod);
+
+ if (unlikely(bnxt_tx_avail(bp, txr) <= bp->tx_wake_thresh))
+ netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
+ bp->tx_wake_thresh);
+
+ return NETDEV_TX_OK;
+
+drop:
dev_kfree_skb_any(skb);
dev_core_stats_tx_dropped_inc(bp->dev);
return NETDEV_TX_OK;
--
2.52.0
On Thu, 26 Mar 2026 16:52:27 -0700 Joe Damato wrote:
> + /* Upper bound on the number of descriptors needed.
> + *
> + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is
> + * at most num_segs + nr_frags (each frag boundary crossing adds at
> + * most 1 extra BD).
> + */
> + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1;
> +
> + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) {
> + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
> + bp->tx_wake_thresh);
> + return NETDEV_TX_BUSY;
> + }
> +
> + slots = BNXT_SW_USO_MAX_SEGS - (txr->tx_inline_prod - txr->tx_inline_cons);
> +
> + if (unlikely(slots < num_segs)) {
> + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
This looks sus, try_stop() will evaluate the bnxt_tx_avail(bp, txr)
and leave the ring running.
> + bp->tx_wake_thresh);
Is tx_wake_thresh larger than the max USO even for smallest ring size?
On Sun, Mar 29, 2026 at 03:20:16PM -0700, Jakub Kicinski wrote:
> On Thu, 26 Mar 2026 16:52:27 -0700 Joe Damato wrote:
> > + /* Upper bound on the number of descriptors needed.
> > + *
> > + * Each segment uses 1 long BD + 1 ext BD + payload BDs, which is
> > + * at most num_segs + nr_frags (each frag boundary crossing adds at
> > + * most 1 extra BD).
> > + */
> > + bds_needed = 3 * num_segs + skb_shinfo(skb)->nr_frags + 1;
> > +
> > + if (unlikely(bnxt_tx_avail(bp, txr) < bds_needed)) {
> > + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
> > + bp->tx_wake_thresh);
> > + return NETDEV_TX_BUSY;
> > + }
> > +
> > + slots = BNXT_SW_USO_MAX_SEGS - (txr->tx_inline_prod - txr->tx_inline_cons);
> > +
> > + if (unlikely(slots < num_segs)) {
> > + netif_txq_try_stop(txq, bnxt_tx_avail(bp, txr),
>
> This looks sus, try_stop() will evaluate the bnxt_tx_avail(bp, txr)
> and leave the ring running.
Yea, I think the slot check can actually be removed entirely. Each segment
consumes 1 inline slot and at least 3 BDs. BNXT_SW_USO_MAX_SEGS is 64 and the
ring's minimum size is 2 * BNXT_SW_USO_MAX_DESCS (420).
Using 64 slots would consume at least 192 BDs, so the check above would fire
first.
I think as long as the ring size is constrained by the code in fix_features
and set_ringparam then this if block can be removed.
> > + bp->tx_wake_thresh);
>
> Is tx_wake_thresh larger than the max USO even for smallest ring size?
Yes, it is.
Maybe it's worth adding a comment in the code somewhere to make
this more clear? Not sure where would be an appropriate place, but maybe
bnxt_init_tx_rings?
On Mon, 30 Mar 2026 09:53:33 -0700 Joe Damato wrote:
> > > + bp->tx_wake_thresh);
> >
> > Is tx_wake_thresh larger than the max USO even for smallest ring size?
>
> Yes, it is.
>
> Maybe its worth adding a comment in the code somewhere to make
> this more clear? Not sure where would be an appropriate place, but maybe
> bnxt_init_tx_rings?
Hm, as long as BNXT_MIN_TX_DESC_CNT is updated I don't think we need
any bespoke comments
On Mon, Mar 30, 2026 at 04:53:57PM -0700, Jakub Kicinski wrote:
> On Mon, 30 Mar 2026 09:53:33 -0700 Joe Damato wrote:
> > > > + bp->tx_wake_thresh);
> > >
> > > Is tx_wake_thresh larger than the max USO even for smallest ring size?
> >
> > Yes, it is.
> >
> > Maybe its worth adding a comment in the code somewhere to make
> > this more clear? Not sure where would be an appropriate place, but maybe
> > bnxt_init_tx_rings?
>
> Hm, as long as BNXT_MIN_TX_DESC_CNT is updated I don't think we need
> any bespoke comments
I am hesitant to update BNXT_MIN_TX_DESC_CNT because it affects all hardware,
including devices that do USO in hardware.
It may not matter in practice since I would be bumping it from 19 to 210, but
we could just leave it as is and let fix_features and init_tx_rings deal with
ring size (as they do now).
IDK. Feels "safer" (but maybe less clear) to leave it as is. WDYT?
On Wed, Apr 1, 2026 at 2:41 AM Joe Damato <joe@dama.to> wrote:
>
> On Mon, Mar 30, 2026 at 04:53:57PM -0700, Jakub Kicinski wrote:
> > On Mon, 30 Mar 2026 09:53:33 -0700 Joe Damato wrote:
> > > > > + bp->tx_wake_thresh);
> > > >
> > > > Is tx_wake_thresh larger than the max USO even for smallest ring size?
> > >
> > > Yes, it is.
> > >
> > > Maybe its worth adding a comment in the code somewhere to make
> > > this more clear? Not sure where would be an appropriate place, but maybe
> > > bnxt_init_tx_rings?
> >
> > Hm, as long as BNXT_MIN_TX_DESC_CNT is updated I don't think we need
> > any bespoke comments
>
> I am hesitant to update BNXT_MIN_TX_DESC_CNT because it affects all hardware,
> including devices that do USO in hardware.
>
> It may not matter in practice since I would be bumping it from 19 to 210, but
> we could just leave it as is and let fix_features and init_tx_rings deal with
> ring size (as they do now).
>
> IDK. Feels "safer" (but maybe less clear) to leave it as is. WDYT?
To me also, having a helper (like you showed in the next patch) determine the
min tx bds for use of the feature is better than altering the entire default
behaviour.
© 2016 - 2026 Red Hat, Inc.