Use the libeth XDP infra to support running an XDP program on Rx polling.
This includes all of the possible verdicts/actions.
XDP Tx queues are cleaned only in "lazy" mode, when fewer than 1/4 of the
descriptors are left free on the ring. The libeth helper macros used to
define driver-specific XDP functions make sure the compiler can uninline
them when needed.
Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
applicable. It gives a noticeable performance boost and code size
reduction on x86_64.
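For illustration, the "lazy" completion policy boils down to the check
below, taken from idpf_xdp_tx_prep() in this patch (simplified sketch; it
assumes xdpsq->thresh is initialized elsewhere in the series to roughly one
quarter of the ring size):

	/* Reclaim finished descriptors only when the ring is getting
	 * full, so completions are processed lazily, in batches.
	 */
	u32 free = xdpsq->desc_count - xdpsq->pending;

	if (free < xdpsq->thresh)	/* thresh ~= desc_count / 4 */
		free += idpf_xdpsq_complete(xdpsq, xdpsq->thresh);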
Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf_txrx.h |   4 +-
 drivers/net/ethernet/intel/idpf/xdp.h       |  92 +++++++++++-
 drivers/net/ethernet/intel/idpf/idpf_lib.c  |   2 +
 drivers/net/ethernet/intel/idpf/idpf_txrx.c |  23 +--
 drivers/net/ethernet/intel/idpf/xdp.c       | 147 +++++++++++++++++++-
 5 files changed, 248 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index 5039feafdee9..39a9c6bd6055 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -646,8 +646,8 @@ struct idpf_tx_queue {
__cacheline_group_end_aligned(read_mostly);
__cacheline_group_begin_aligned(read_write);
- u16 next_to_use;
- u16 next_to_clean;
+ u32 next_to_use;
+ u32 next_to_clean;
union {
struct {
diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h
index 47553ce5f81a..986156162e2d 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.h
+++ b/drivers/net/ethernet/intel/idpf/xdp.h
@@ -4,12 +4,9 @@
#ifndef _IDPF_XDP_H_
#define _IDPF_XDP_H_
-#include <linux/types.h>
+#include <net/libeth/xdp.h>
-struct bpf_prog;
-struct idpf_vport;
-struct net_device;
-struct netdev_bpf;
+#include "idpf_txrx.h"
int idpf_xdp_rxq_info_init_all(const struct idpf_vport *vport);
void idpf_xdp_rxq_info_deinit_all(const struct idpf_vport *vport);
@@ -19,6 +16,91 @@ void idpf_xdp_copy_prog_to_rqs(const struct idpf_vport *vport,
int idpf_xdpsqs_get(const struct idpf_vport *vport);
void idpf_xdpsqs_put(const struct idpf_vport *vport);
+bool idpf_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags);
+
+/**
+ * idpf_xdp_tx_xmit - produce a single HW Tx descriptor out of XDP desc
+ * @desc: XDP descriptor to pull the DMA address and length from
+ * @i: descriptor index on the queue to fill
+ * @sq: XDP queue to produce the HW Tx descriptor on
+ * @priv: &xsk_tx_metadata_ops on XSk xmit or %NULL
+ */
+static inline void idpf_xdp_tx_xmit(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct idpf_flex_tx_desc *tx_desc = sq->descs;
+ u32 cmd;
+
+ cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_DTYPE_M,
+ IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2);
+ if (desc.flags & LIBETH_XDP_TX_LAST)
+ cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M,
+ IDPF_TX_DESC_CMD_EOP);
+ if (priv && (desc.flags & LIBETH_XDP_TX_CSUM))
+ cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M,
+ IDPF_TX_FLEX_DESC_CMD_CS_EN);
+
+ tx_desc = &tx_desc[i];
+ tx_desc->buf_addr = cpu_to_le64(desc.addr);
+#ifdef __LIBETH_WORD_ACCESS
+ *(u64 *)&tx_desc->qw1 = ((u64)desc.len << 48) | cmd;
+#else
+ tx_desc->qw1.buf_size = cpu_to_le16(desc.len);
+ tx_desc->qw1.cmd_dtype = cpu_to_le16(cmd);
+#endif
+}
+
+static inline void idpf_xdpsq_set_rs(const struct idpf_tx_queue *xdpsq)
+{
+ u32 ntu, cmd;
+
+ ntu = xdpsq->next_to_use;
+ if (unlikely(!ntu))
+ ntu = xdpsq->desc_count;
+
+ cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M, IDPF_TX_DESC_CMD_RS);
+#ifdef __LIBETH_WORD_ACCESS
+ *(u64 *)&xdpsq->flex_tx[ntu - 1].q.qw1 |= cmd;
+#else
+ xdpsq->flex_tx[ntu - 1].q.qw1.cmd_dtype |= cpu_to_le16(cmd);
+#endif
+}
+
+static inline void idpf_xdpsq_update_tail(const struct idpf_tx_queue *xdpsq)
+{
+ dma_wmb();
+ writel_relaxed(xdpsq->next_to_use, xdpsq->tail);
+}
+
+/**
+ * idpf_xdp_tx_finalize - finalize sending over XDPSQ
+ * @_xdpsq: XDP Tx queue
+ * @sent: whether any frames were sent
+ * @flush: whether to update RS bit and the tail register
+ *
+ * Set the RS bit ("end of batch"), bump the tail, and queue the cleanup timer.
+ * To be called after a NAPI polling loop, at the end of .ndo_xdp_xmit() etc.
+ */
+static inline void idpf_xdp_tx_finalize(void *_xdpsq, bool sent, bool flush)
+{
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+
+ if ((!flush || unlikely(!sent)) &&
+ likely(xdpsq->desc_count - 1 != xdpsq->pending))
+ return;
+
+ libeth_xdpsq_lock(&xdpsq->xdp_lock);
+
+ idpf_xdpsq_set_rs(xdpsq);
+ idpf_xdpsq_update_tail(xdpsq);
+
+ libeth_xdpsq_queue_timer(xdpsq->timer);
+
+ libeth_xdpsq_unlock(&xdpsq->xdp_lock);
+}
+
+void idpf_xdp_set_features(const struct idpf_vport *vport);
+
int idpf_xdp(struct net_device *dev, struct netdev_bpf *xdp);
#endif /* _IDPF_XDP_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 8f2ea790fdf3..30c8899ea0f4 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -835,6 +835,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
netdev->hw_features |= netdev->features | other_offloads;
netdev->vlan_features |= netdev->features | other_offloads;
netdev->hw_enc_features |= dflt_features | other_offloads;
+ idpf_xdp_set_features(vport);
+
idpf_set_ethtool_ops(netdev);
netif_set_affinity_auto(netdev);
SET_NETDEV_DEV(netdev, &adapter->pdev->dev);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 7a2914e4ec95..3beacd215b39 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2023 Intel Corporation */
-#include <net/libeth/xdp.h>
-
#include "idpf.h"
#include "idpf_ptp.h"
#include "idpf_virtchnl.h"
@@ -3127,14 +3125,12 @@ static bool idpf_rx_process_skb_fields(struct sk_buff *skb,
return !__idpf_rx_process_skb_fields(rxq, skb, xdp->desc);
}
-static void
-idpf_xdp_run_pass(struct libeth_xdp_buff *xdp, struct napi_struct *napi,
- struct libeth_rq_napi_stats *ss,
- const struct virtchnl2_rx_flex_desc_adv_nic_3 *desc)
-{
- libeth_xdp_run_pass(xdp, NULL, napi, ss, desc, NULL,
- idpf_rx_process_skb_fields);
-}
+LIBETH_XDP_DEFINE_START();
+LIBETH_XDP_DEFINE_RUN(static idpf_xdp_run_pass, idpf_xdp_run_prog,
+ idpf_xdp_tx_flush_bulk, idpf_rx_process_skb_fields);
+LIBETH_XDP_DEFINE_FINALIZE(static idpf_xdp_finalize_rx, idpf_xdp_tx_flush_bulk,
+ idpf_xdp_tx_finalize);
+LIBETH_XDP_DEFINE_END();
/**
* idpf_rx_hsplit_wa - handle header buffer overflows and split errors
@@ -3222,7 +3218,10 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
struct libeth_rq_napi_stats rs = { };
u16 ntc = rxq->next_to_clean;
LIBETH_XDP_ONSTACK_BUFF(xdp);
+ LIBETH_XDP_ONSTACK_BULK(bq);
+ libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
+ rxq->xdpsqs, rxq->num_xdp_txq);
libeth_xdp_init_buff(xdp, &rxq->xdp, &rxq->xdp_rxq);
/* Process Rx packets bounded by budget */
@@ -3318,11 +3317,13 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
if (!idpf_rx_splitq_is_eop(rx_desc) || unlikely(!xdp->data))
continue;
- idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
+ idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
}
rxq->next_to_clean = ntc;
+
libeth_xdp_save_buff(&rxq->xdp, xdp);
+ idpf_xdp_finalize_rx(&bq);
u64_stats_update_begin(&rxq->stats_sync);
u64_stats_add(&rxq->q_stats.packets, rs.packets);
diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
index 09e84fe80d4e..d7ba2848148e 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.c
+++ b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2025 Intel Corporation */
-#include <net/libeth/xdp.h>
-
#include "idpf.h"
#include "idpf_virtchnl.h"
#include "xdp.h"
@@ -114,6 +112,8 @@ void idpf_xdp_copy_prog_to_rqs(const struct idpf_vport *vport,
idpf_rxq_for_each(vport, idpf_xdp_rxq_assign_prog, xdp_prog);
}
+static void idpf_xdp_tx_timer(struct work_struct *work);
+
int idpf_xdpsqs_get(const struct idpf_vport *vport)
{
struct libeth_xdpsq_timer **timers __free(kvfree) = NULL;
@@ -154,6 +154,8 @@ int idpf_xdpsqs_get(const struct idpf_vport *vport)
xdpsq->timer = timers[i - sqs];
libeth_xdpsq_get(&xdpsq->xdp_lock, dev, vport->xdpsq_share);
+ libeth_xdpsq_init_timer(xdpsq->timer, xdpsq, &xdpsq->xdp_lock,
+ idpf_xdp_tx_timer);
xdpsq->pending = 0;
xdpsq->xdp_tx = 0;
@@ -180,6 +182,7 @@ void idpf_xdpsqs_put(const struct idpf_vport *vport)
if (!idpf_queue_has_clear(XDP, xdpsq))
continue;
+ libeth_xdpsq_deinit_timer(xdpsq->timer);
libeth_xdpsq_put(&xdpsq->xdp_lock, dev);
kfree(xdpsq->timer);
@@ -187,6 +190,146 @@ void idpf_xdpsqs_put(const struct idpf_vport *vport)
}
}
+static int idpf_xdp_parse_cqe(const struct idpf_splitq_4b_tx_compl_desc *desc,
+ bool gen)
+{
+ u32 val;
+
+#ifdef __LIBETH_WORD_ACCESS
+ val = *(const u32 *)desc;
+#else
+ val = ((u32)le16_to_cpu(desc->q_head_compl_tag.q_head) << 16) |
+ le16_to_cpu(desc->qid_comptype_gen);
+#endif
+ if (!!(val & IDPF_TXD_COMPLQ_GEN_M) != gen)
+ return -ENODATA;
+
+ if (unlikely((val & GENMASK(IDPF_TXD_COMPLQ_GEN_S - 1, 0)) !=
+ FIELD_PREP(IDPF_TXD_COMPLQ_COMPL_TYPE_M,
+ IDPF_TXD_COMPLT_RS)))
+ return -EINVAL;
+
+ return upper_16_bits(val);
+}
+
+static u32 idpf_xdpsq_poll(struct idpf_tx_queue *xdpsq, u32 budget)
+{
+ struct idpf_compl_queue *cq = xdpsq->complq;
+ u32 tx_ntc = xdpsq->next_to_clean;
+ u32 tx_cnt = xdpsq->desc_count;
+ u32 ntc = cq->next_to_clean;
+ u32 cnt = cq->desc_count;
+ u32 done_frames;
+ bool gen;
+
+ gen = idpf_queue_has(GEN_CHK, cq);
+
+ for (done_frames = 0; done_frames < budget; ) {
+ int ret;
+
+ ret = idpf_xdp_parse_cqe(&cq->comp_4b[ntc], gen);
+ if (ret >= 0) {
+ done_frames = ret > tx_ntc ? ret - tx_ntc :
+ ret + tx_cnt - tx_ntc;
+ goto next;
+ }
+
+ switch (ret) {
+ case -ENODATA:
+ goto out;
+ case -EINVAL:
+ break;
+ }
+
+next:
+ if (unlikely(++ntc == cnt)) {
+ ntc = 0;
+ gen = !gen;
+ idpf_queue_change(GEN_CHK, cq);
+ }
+ }
+
+out:
+ cq->next_to_clean = ntc;
+
+ return done_frames;
+}
+
+static u32 idpf_xdpsq_complete(void *_xdpsq, u32 budget)
+{
+ struct libeth_xdpsq_napi_stats ss = { };
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+ u32 tx_ntc = xdpsq->next_to_clean;
+ u32 tx_cnt = xdpsq->desc_count;
+ struct xdp_frame_bulk bq;
+ struct libeth_cq_pp cp = {
+ .dev = xdpsq->dev,
+ .bq = &bq,
+ .xss = &ss,
+ .napi = true,
+ };
+ u32 done_frames;
+
+ done_frames = idpf_xdpsq_poll(xdpsq, budget);
+ if (unlikely(!done_frames))
+ return 0;
+
+ xdp_frame_bulk_init(&bq);
+
+ for (u32 i = 0; likely(i < done_frames); i++) {
+ libeth_xdp_complete_tx(&xdpsq->tx_buf[tx_ntc], &cp);
+
+ if (unlikely(++tx_ntc == tx_cnt))
+ tx_ntc = 0;
+ }
+
+ xdp_flush_frame_bulk(&bq);
+
+ xdpsq->next_to_clean = tx_ntc;
+ xdpsq->pending -= done_frames;
+ xdpsq->xdp_tx -= cp.xdp_tx;
+
+ return done_frames;
+}
+
+static u32 idpf_xdp_tx_prep(void *_xdpsq, struct libeth_xdpsq *sq)
+{
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+ u32 free;
+
+ libeth_xdpsq_lock(&xdpsq->xdp_lock);
+
+ free = xdpsq->desc_count - xdpsq->pending;
+ if (free < xdpsq->thresh)
+ free += idpf_xdpsq_complete(xdpsq, xdpsq->thresh);
+
+ *sq = (struct libeth_xdpsq){
+ .sqes = xdpsq->tx_buf,
+ .descs = xdpsq->desc_ring,
+ .count = xdpsq->desc_count,
+ .lock = &xdpsq->xdp_lock,
+ .ntu = &xdpsq->next_to_use,
+ .pending = &xdpsq->pending,
+ .xdp_tx = &xdpsq->xdp_tx,
+ };
+
+ return free;
+}
+
+LIBETH_XDP_DEFINE_START();
+LIBETH_XDP_DEFINE_TIMER(static idpf_xdp_tx_timer, idpf_xdpsq_complete);
+LIBETH_XDP_DEFINE_FLUSH_TX(idpf_xdp_tx_flush_bulk, idpf_xdp_tx_prep,
+ idpf_xdp_tx_xmit);
+LIBETH_XDP_DEFINE_END();
+
+void idpf_xdp_set_features(const struct idpf_vport *vport)
+{
+ if (!idpf_is_queue_model_split(vport->rxq_model))
+ return;
+
+ libeth_xdp_set_features_noredir(vport->netdev);
+}
+
static int idpf_xdp_setup_prog(struct idpf_vport *vport,
const struct netdev_bpf *xdp)
{
--
2.50.1
On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.

Could you perhaps quantify the goodness of the boost with a number? :)
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 1 Aug 2025 15:33:43 -0700

> On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
>> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
>> applicable. It really gives some good boosts and code size reduction
>> on x86_64.
>
> Could you perhaps quantify the goodness of the boost with a number? :)

Sure, only a matter of switching this definition and running the tests
(and bloat-o-meter).
Intel doesn't allow us to publish raw numbers (Gbps/Mpps), I hope the
diff in percents (+ bloat-o-meter output) would be enough?

Thanks,
Olek
On Tue, 5 Aug 2025 18:09:40 +0200 Alexander Lobakin wrote:
> From: Jakub Kicinski <kuba@kernel.org>
> Date: Fri, 1 Aug 2025 15:33:43 -0700
>
> > On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
> >> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> >> applicable. It really gives some good boosts and code size reduction
> >> on x86_64.
> >
> > Could you perhaps quantify the goodness of the boost with a number? :)
>
> Sure, only a matter of switching this definition and running the tests
> (and bloat-o-meter).
> Intel doesn't allow us to publish raw numbers (Gbps/Mpps), I hope the
> diff in percents (+ bloat-o-meter output) would be enough?

Yes, delta is perfect. Absolute numbers aren't very meaningful if you
don't specify all HW components and FW versions, and direction of wind
on the day, anyway :$
On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
> Use libeth XDP infra to support running XDP program on Rx polling.
> This includes all of the possible verdicts/actions.
> XDP Tx queues are cleaned only in "lazy" mode when there are less than
> 1/4 free descriptors left on the ring. libeth helper macros to define
> driver-specific XDP functions make sure the compiler could uninline
> them when needed.
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.
>
> Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

...

> diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c

...

> @@ -3127,14 +3125,12 @@ static bool idpf_rx_process_skb_fields(struct sk_buff *skb,
>  	return !__idpf_rx_process_skb_fields(rxq, skb, xdp->desc);
>  }
>
> -static void
> -idpf_xdp_run_pass(struct libeth_xdp_buff *xdp, struct napi_struct *napi,
> -		  struct libeth_rq_napi_stats *ss,
> -		  const struct virtchnl2_rx_flex_desc_adv_nic_3 *desc)
> -{
> -	libeth_xdp_run_pass(xdp, NULL, napi, ss, desc, NULL,
> -			    idpf_rx_process_skb_fields);
> -}
> +LIBETH_XDP_DEFINE_START();
> +LIBETH_XDP_DEFINE_RUN(static idpf_xdp_run_pass, idpf_xdp_run_prog,
> +		      idpf_xdp_tx_flush_bulk, idpf_rx_process_skb_fields);
> +LIBETH_XDP_DEFINE_FINALIZE(static idpf_xdp_finalize_rx, idpf_xdp_tx_flush_bulk,
> +			   idpf_xdp_tx_finalize);
> +LIBETH_XDP_DEFINE_END();
>
>  /**
>   * idpf_rx_hsplit_wa - handle header buffer overflows and split errors
> @@ -3222,7 +3218,10 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
>  	struct libeth_rq_napi_stats rs = { };
>  	u16 ntc = rxq->next_to_clean;
>  	LIBETH_XDP_ONSTACK_BUFF(xdp);
> +	LIBETH_XDP_ONSTACK_BULK(bq);
>
> +	libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
> +				rxq->xdpsqs, rxq->num_xdp_txq);
>  	libeth_xdp_init_buff(xdp, &rxq->xdp, &rxq->xdp_rxq);
>
>  	/* Process Rx packets bounded by budget */
> @@ -3318,11 +3317,13 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
>  		if (!idpf_rx_splitq_is_eop(rx_desc) || unlikely(!xdp->data))
>  			continue;
>
> -		idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
> +		idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
>  	}
>
>  	rxq->next_to_clean = ntc;
> +
>  	libeth_xdp_save_buff(&rxq->xdp, xdp);
> +	idpf_xdp_finalize_rx(&bq);

This will call __libeth_xdp_finalize_rx(), which calls rcu_read_unlock().
But there doesn't seem to be a corresponding call to rcu_read_lock()

Flagged by Sparse.

>
>  	u64_stats_update_begin(&rxq->stats_sync);
>  	u64_stats_add(&rxq->q_stats.packets, rs.packets);

...
From: Simon Horman <horms@kernel.org>
Date: Thu, 31 Jul 2025 14:35:57 +0100

> On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
>> Use libeth XDP infra to support running XDP program on Rx polling.

...

>> -		idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
>> +		idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
>>  	}
>>
>>  	rxq->next_to_clean = ntc;
>> +
>>  	libeth_xdp_save_buff(&rxq->xdp, xdp);
>> +	idpf_xdp_finalize_rx(&bq);
>
> This will call __libeth_xdp_finalize_rx(), which calls rcu_read_unlock().
> But there doesn't seem to be a corresponding call to rcu_read_lock()
>
> Flagged by Sparse.

It's a false positive, rcu_read_lock() is called in tx_init_bulk().

>
>>  	u64_stats_update_begin(&rxq->stats_sync);
>>  	u64_stats_add(&rxq->q_stats.packets, rs.packets);

Thanks,
Olek
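For readers hitting the same sparse warning, the pairing described above can
be sketched roughly as follows (simplified view of the Rx poll path; the
comments about where the RCU read lock is taken and released reflect the
explanation in this thread, not code shown in the patch itself):

	LIBETH_XDP_ONSTACK_BULK(bq);

	/* rcu_read_lock() is taken inside the bulk init helper, so the
	 * XDP prog pointer stays valid for the whole polling cycle ...
	 */
	libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
				rxq->xdpsqs, rxq->num_xdp_txq);

	/* ... per-descriptor processing, idpf_xdp_run_pass() ... */

	/* ... and the matching rcu_read_unlock() happens here, in a
	 * different helper, which is why sparse sees it as unbalanced.
	 */
	idpf_xdp_finalize_rx(&bq);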
+ Kees, linux-hardening

On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
> Use libeth XDP infra to support running XDP program on Rx polling.
> This includes all of the possible verdicts/actions.
> XDP Tx queues are cleaned only in "lazy" mode when there are less than
> 1/4 free descriptors left on the ring. libeth helper macros to define
> driver-specific XDP functions make sure the compiler could uninline
> them when needed.
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.
>
> Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Hi Alexander, all,

Sorry for providing review of __LIBETH_WORD_ACCESS[1] after the fact.
I had missed it earlier.

While I appreciate the desire for improved performance and nicer code
generation. I think the idea of writing 64 bits of data to the
address of a 32 bit member of a structure goes against the direction
of hardening work by Kees and others.

Indeed, it seems to me this is the kind of thing that struct_group()
aims to avoid.

In this case struct group() doesn't seem like the best option,
because it would provide a 64-bit buffer that we can memcpy into.
But it seems altogether better to simply assign u64 value to a u64 member.

So I'm wondering if an approach along the following lines is appropriate
(Very lightly compile tested only!).

And yes, there is room for improvement of the wording of the comment
I included below.

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index f4880b50e804..a7d3d8e44aa6 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -1283,11 +1283,7 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 	const struct page *page = __netmem_to_page(fqe->netmem);

 #ifdef __LIBETH_WORD_ACCESS
-	static_assert(offsetofend(typeof(xdp->base), flags) -
-		      offsetof(typeof(xdp->base), frame_sz) ==
-		      sizeof(u64));
-
-	*(u64 *)&xdp->base.frame_sz = fqe->truesize;
+	xdp->base.frame_sz_le_qword = fqe->truesize;
 #else
 	xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
 #endif
diff --git a/include/net/xdp.h b/include/net/xdp.h
index b40f1f96cb11..b5eedeb82c9b 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -85,8 +85,19 @@ struct xdp_buff {
 	void *data_hard_start;
 	struct xdp_rxq_info *rxq;
 	struct xdp_txq_info *txq;
-	u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
-	u32 flags; /* supported values defined in xdp_buff_flags */
+	union {
+		/* Allow setting frame_sz and flags as a single u64 on
+		 * little endian systems. This may may give optimal
+		 * performance. */
+		u64 frame_sz_le_qword;
+		struct {
+			/* Frame size to deduce data_hard_end/reserved
+			 * tailroom. */
+			u32 frame_sz;
+			/* Supported values defined in xdp_buff_flags. */
+			u32 flags;
+		};
+	};
 };

[1] https://git.kernel.org/torvalds/c/80bae9df2108

...
On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:
> While I appreciate the desire for improved performance and nicer code
> generation. I think the idea of writing 64 bits of data to the
> address of a 32 bit member of a structure goes against the direction
> of hardening work by Kees and others.

Agreed: it's better to avoid obscuring these details from the compiler
so it can have an "actual" view of the object sizes involved.

> Indeed, it seems to me this is the kind of thing that struct_group()
> aims to avoid.
>
> In this case struct group() doesn't seem like the best option,
> because it would provide a 64-bit buffer that we can memcpy into.
> But it seems altogether better to simply assign u64 value to a u64 member.

Agreed: with struct_group you get a sized pointer, and while you can
provide a struct tag to make it an assignable object, it doesn't make
too much sense here.

> So I'm wondering if an approach along the following lines is appropriate
> (Very lightly compile tested only!).
>
> And yes, there is room for improvement of the wording of the comment
> I included below.
>
> diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h

...

Yeah, this looks like a nice way to express this, and is way more
descriptive than "(u64 *)&xdp->base.frame_sz" :)

-- 
Kees Cook
From: Kees Cook <kees@kernel.org>
Date: Thu, 31 Jul 2025 10:05:47 -0700

> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:
>> While I appreciate the desire for improved performance and nicer code
>> generation. I think the idea of writing 64 bits of data to the
>> address of a 32 bit member of a structure goes against the direction
>> of hardening work by Kees and others.
>
> Agreed: it's better to avoid obscuring these details from the compiler
> so it can have an "actual" view of the object sizes involved.

...

> Yeah, this looks like a nice way to express this, and is way more
> descriptive than "(u64 *)&xdp->base.frame_sz" :)

Sounds good to me!

Let me send v4 where I'll fix this.

Thanks,
Olek
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 1 Aug 2025 15:12:43 +0200

> From: Kees Cook <kees@kernel.org>
> Date: Thu, 31 Jul 2025 10:05:47 -0700
>
>> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

>> Yeah, this looks like a nice way to express this, and is way more
>> descriptive than "(u64 *)&xdp->base.frame_sz" :)
>
> Sounds good to me!
>
> Let me send v4 where I'll fix this.

Note: would it be okay if I send v4 with this fix when the window opens,
while our validation will retest v3 from Tony's tree in the meantime? It's a
cosmetic change anyway and does not involve any functional changes.

Thanks,
Olek
On Fri, Aug 01, 2025 at 03:17:42PM +0200, Alexander Lobakin wrote:
> From: Alexander Lobakin <aleksander.lobakin@intel.com>
> Date: Fri, 1 Aug 2025 15:12:43 +0200
>
> > From: Kees Cook <kees@kernel.org>
> > Date: Thu, 31 Jul 2025 10:05:47 -0700
> >
> >> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

> > Sounds good to me!
> >
> > Let me send v4 where I'll fix this.
>
> Note: would it be okay if I send v4 with this fix when the window opens,
> while our validation will retest v3 from Tony's tree in the meantime? It's a
> cosmetic change anyway and does not involve any functional changes.

If this is directed at me, yeah, I don't see any high urgency here.

-- 
Kees Cook
On Sat, Aug 02, 2025 at 11:52:44AM -0700, Kees Cook wrote:
> On Fri, Aug 01, 2025 at 03:17:42PM +0200, Alexander Lobakin wrote:
> > From: Alexander Lobakin <aleksander.lobakin@intel.com>
> > Date: Fri, 1 Aug 2025 15:12:43 +0200
> >
> > > From: Kees Cook <kees@kernel.org>
> > > Date: Thu, 31 Jul 2025 10:05:47 -0700
> > >
> > >> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

> > >> Yeah, this looks like a nice way to express this, and is way more
> > >> descriptive than "(u64 *)&xdp->base.frame_sz" :)
> > >
> > > Sounds good to me!
> > >
> > > Let me send v4 where I'll fix this.
> >
> > Note: would it be okay if I send v4 with this fix when the window opens,
> > while our validation will retest v3 from Tony's tree in the meantime? It's a
> > cosmetic change anyway and does not involve any functional changes.
>
> If this is directed at me, yeah, I don't see any high urgency here.

Likewise, I see no urgency.