Use the libeth XDP infra to support running an XDP program on Rx polling.
This includes all of the possible verdicts/actions.
XDP Tx queues are cleaned only in "lazy" mode, when fewer than 1/4 of the
descriptors are left free on the ring. The libeth helper macros used to
define driver-specific XDP functions make sure the compiler can uninline
them when needed.
Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
applicable. It gives a noticeable performance boost and code size
reduction on x86_64.
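For illustration, the "lazy" completion policy boils down to the check
below, taken from idpf_xdp_tx_prep() in this patch (simplified sketch; it
assumes xdpsq->thresh is initialized elsewhere in the series to roughly one
quarter of the ring size):

	/* Reclaim finished descriptors only when the ring is getting
	 * full, so completions are processed lazily, in batches.
	 */
	u32 free = xdpsq->desc_count - xdpsq->pending;

	if (free < xdpsq->thresh)	/* thresh ~= desc_count / 4 */
		free += idpf_xdpsq_complete(xdpsq, xdpsq->thresh);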
Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 drivers/net/ethernet/intel/idpf/idpf_txrx.h |   4 +-
 drivers/net/ethernet/intel/idpf/xdp.h       |  92 +++++++++++-
 drivers/net/ethernet/intel/idpf/idpf_lib.c  |   2 +
 drivers/net/ethernet/intel/idpf/idpf_txrx.c |  23 +--
 drivers/net/ethernet/intel/idpf/xdp.c       | 147 +++++++++++++++++++-
 5 files changed, 248 insertions(+), 20 deletions(-)
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.h b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
index 5039feafdee9..39a9c6bd6055 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.h
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.h
@@ -646,8 +646,8 @@ struct idpf_tx_queue {
__cacheline_group_end_aligned(read_mostly);
__cacheline_group_begin_aligned(read_write);
- u16 next_to_use;
- u16 next_to_clean;
+ u32 next_to_use;
+ u32 next_to_clean;
union {
struct {
diff --git a/drivers/net/ethernet/intel/idpf/xdp.h b/drivers/net/ethernet/intel/idpf/xdp.h
index 47553ce5f81a..986156162e2d 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.h
+++ b/drivers/net/ethernet/intel/idpf/xdp.h
@@ -4,12 +4,9 @@
#ifndef _IDPF_XDP_H_
#define _IDPF_XDP_H_
-#include <linux/types.h>
+#include <net/libeth/xdp.h>
-struct bpf_prog;
-struct idpf_vport;
-struct net_device;
-struct netdev_bpf;
+#include "idpf_txrx.h"
int idpf_xdp_rxq_info_init_all(const struct idpf_vport *vport);
void idpf_xdp_rxq_info_deinit_all(const struct idpf_vport *vport);
@@ -19,6 +16,91 @@ void idpf_xdp_copy_prog_to_rqs(const struct idpf_vport *vport,
int idpf_xdpsqs_get(const struct idpf_vport *vport);
void idpf_xdpsqs_put(const struct idpf_vport *vport);
+bool idpf_xdp_tx_flush_bulk(struct libeth_xdp_tx_bulk *bq, u32 flags);
+
+/**
+ * idpf_xdp_tx_xmit - produce a single HW Tx descriptor out of XDP desc
+ * @desc: XDP descriptor to pull the DMA address and length from
+ * @i: descriptor index on the queue to fill
+ * @sq: XDP queue to produce the HW Tx descriptor on
+ * @priv: &xsk_tx_metadata_ops on XSk xmit or %NULL
+ */
+static inline void idpf_xdp_tx_xmit(struct libeth_xdp_tx_desc desc, u32 i,
+ const struct libeth_xdpsq *sq, u64 priv)
+{
+ struct idpf_flex_tx_desc *tx_desc = sq->descs;
+ u32 cmd;
+
+ cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_DTYPE_M,
+ IDPF_TX_DESC_DTYPE_FLEX_L2TAG1_L2TAG2);
+ if (desc.flags & LIBETH_XDP_TX_LAST)
+ cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M,
+ IDPF_TX_DESC_CMD_EOP);
+ if (priv && (desc.flags & LIBETH_XDP_TX_CSUM))
+ cmd |= FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M,
+ IDPF_TX_FLEX_DESC_CMD_CS_EN);
+
+ tx_desc = &tx_desc[i];
+ tx_desc->buf_addr = cpu_to_le64(desc.addr);
+#ifdef __LIBETH_WORD_ACCESS
+ *(u64 *)&tx_desc->qw1 = ((u64)desc.len << 48) | cmd;
+#else
+ tx_desc->qw1.buf_size = cpu_to_le16(desc.len);
+ tx_desc->qw1.cmd_dtype = cpu_to_le16(cmd);
+#endif
+}
+
+static inline void idpf_xdpsq_set_rs(const struct idpf_tx_queue *xdpsq)
+{
+ u32 ntu, cmd;
+
+ ntu = xdpsq->next_to_use;
+ if (unlikely(!ntu))
+ ntu = xdpsq->desc_count;
+
+ cmd = FIELD_PREP(IDPF_FLEX_TXD_QW1_CMD_M, IDPF_TX_DESC_CMD_RS);
+#ifdef __LIBETH_WORD_ACCESS
+ *(u64 *)&xdpsq->flex_tx[ntu - 1].q.qw1 |= cmd;
+#else
+ xdpsq->flex_tx[ntu - 1].q.qw1.cmd_dtype |= cpu_to_le16(cmd);
+#endif
+}
+
+static inline void idpf_xdpsq_update_tail(const struct idpf_tx_queue *xdpsq)
+{
+ dma_wmb();
+ writel_relaxed(xdpsq->next_to_use, xdpsq->tail);
+}
+
+/**
+ * idpf_xdp_tx_finalize - finalize sending over XDPSQ
+ * @_xdpsq: XDP Tx queue
+ * @sent: whether any frames were sent
+ * @flush: whether to update RS bit and the tail register
+ *
+ * Set the RS bit ("end of batch"), bump the tail, and queue the cleanup timer.
+ * To be called after a NAPI polling loop, at the end of .ndo_xdp_xmit() etc.
+ */
+static inline void idpf_xdp_tx_finalize(void *_xdpsq, bool sent, bool flush)
+{
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+
+ if ((!flush || unlikely(!sent)) &&
+ likely(xdpsq->desc_count - 1 != xdpsq->pending))
+ return;
+
+ libeth_xdpsq_lock(&xdpsq->xdp_lock);
+
+ idpf_xdpsq_set_rs(xdpsq);
+ idpf_xdpsq_update_tail(xdpsq);
+
+ libeth_xdpsq_queue_timer(xdpsq->timer);
+
+ libeth_xdpsq_unlock(&xdpsq->xdp_lock);
+}
+
+void idpf_xdp_set_features(const struct idpf_vport *vport);
+
int idpf_xdp(struct net_device *dev, struct netdev_bpf *xdp);
#endif /* _IDPF_XDP_H_ */
diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
index 8f2ea790fdf3..30c8899ea0f4 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
@@ -835,6 +835,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
netdev->hw_features |= netdev->features | other_offloads;
netdev->vlan_features |= netdev->features | other_offloads;
netdev->hw_enc_features |= dflt_features | other_offloads;
+ idpf_xdp_set_features(vport);
+
idpf_set_ethtool_ops(netdev);
netif_set_affinity_auto(netdev);
SET_NETDEV_DEV(netdev, &adapter->pdev->dev);
diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
index 7a2914e4ec95..3beacd215b39 100644
--- a/drivers/net/ethernet/intel/idpf/idpf_txrx.c
+++ b/drivers/net/ethernet/intel/idpf/idpf_txrx.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2023 Intel Corporation */
-#include <net/libeth/xdp.h>
-
#include "idpf.h"
#include "idpf_ptp.h"
#include "idpf_virtchnl.h"
@@ -3127,14 +3125,12 @@ static bool idpf_rx_process_skb_fields(struct sk_buff *skb,
return !__idpf_rx_process_skb_fields(rxq, skb, xdp->desc);
}
-static void
-idpf_xdp_run_pass(struct libeth_xdp_buff *xdp, struct napi_struct *napi,
- struct libeth_rq_napi_stats *ss,
- const struct virtchnl2_rx_flex_desc_adv_nic_3 *desc)
-{
- libeth_xdp_run_pass(xdp, NULL, napi, ss, desc, NULL,
- idpf_rx_process_skb_fields);
-}
+LIBETH_XDP_DEFINE_START();
+LIBETH_XDP_DEFINE_RUN(static idpf_xdp_run_pass, idpf_xdp_run_prog,
+ idpf_xdp_tx_flush_bulk, idpf_rx_process_skb_fields);
+LIBETH_XDP_DEFINE_FINALIZE(static idpf_xdp_finalize_rx, idpf_xdp_tx_flush_bulk,
+ idpf_xdp_tx_finalize);
+LIBETH_XDP_DEFINE_END();
/**
* idpf_rx_hsplit_wa - handle header buffer overflows and split errors
@@ -3222,7 +3218,10 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
struct libeth_rq_napi_stats rs = { };
u16 ntc = rxq->next_to_clean;
LIBETH_XDP_ONSTACK_BUFF(xdp);
+ LIBETH_XDP_ONSTACK_BULK(bq);
+ libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
+ rxq->xdpsqs, rxq->num_xdp_txq);
libeth_xdp_init_buff(xdp, &rxq->xdp, &rxq->xdp_rxq);
/* Process Rx packets bounded by budget */
@@ -3318,11 +3317,13 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
if (!idpf_rx_splitq_is_eop(rx_desc) || unlikely(!xdp->data))
continue;
- idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
+ idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
}
rxq->next_to_clean = ntc;
+
libeth_xdp_save_buff(&rxq->xdp, xdp);
+ idpf_xdp_finalize_rx(&bq);
u64_stats_update_begin(&rxq->stats_sync);
u64_stats_add(&rxq->q_stats.packets, rs.packets);
diff --git a/drivers/net/ethernet/intel/idpf/xdp.c b/drivers/net/ethernet/intel/idpf/xdp.c
index 09e84fe80d4e..d7ba2848148e 100644
--- a/drivers/net/ethernet/intel/idpf/xdp.c
+++ b/drivers/net/ethernet/intel/idpf/xdp.c
@@ -1,8 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2025 Intel Corporation */
-#include <net/libeth/xdp.h>
-
#include "idpf.h"
#include "idpf_virtchnl.h"
#include "xdp.h"
@@ -114,6 +112,8 @@ void idpf_xdp_copy_prog_to_rqs(const struct idpf_vport *vport,
idpf_rxq_for_each(vport, idpf_xdp_rxq_assign_prog, xdp_prog);
}
+static void idpf_xdp_tx_timer(struct work_struct *work);
+
int idpf_xdpsqs_get(const struct idpf_vport *vport)
{
struct libeth_xdpsq_timer **timers __free(kvfree) = NULL;
@@ -154,6 +154,8 @@ int idpf_xdpsqs_get(const struct idpf_vport *vport)
xdpsq->timer = timers[i - sqs];
libeth_xdpsq_get(&xdpsq->xdp_lock, dev, vport->xdpsq_share);
+ libeth_xdpsq_init_timer(xdpsq->timer, xdpsq, &xdpsq->xdp_lock,
+ idpf_xdp_tx_timer);
xdpsq->pending = 0;
xdpsq->xdp_tx = 0;
@@ -180,6 +182,7 @@ void idpf_xdpsqs_put(const struct idpf_vport *vport)
if (!idpf_queue_has_clear(XDP, xdpsq))
continue;
+ libeth_xdpsq_deinit_timer(xdpsq->timer);
libeth_xdpsq_put(&xdpsq->xdp_lock, dev);
kfree(xdpsq->timer);
@@ -187,6 +190,146 @@ void idpf_xdpsqs_put(const struct idpf_vport *vport)
}
}
+static int idpf_xdp_parse_cqe(const struct idpf_splitq_4b_tx_compl_desc *desc,
+ bool gen)
+{
+ u32 val;
+
+#ifdef __LIBETH_WORD_ACCESS
+ val = *(const u32 *)desc;
+#else
+ val = ((u32)le16_to_cpu(desc->q_head_compl_tag.q_head) << 16) |
+ le16_to_cpu(desc->qid_comptype_gen);
+#endif
+ if (!!(val & IDPF_TXD_COMPLQ_GEN_M) != gen)
+ return -ENODATA;
+
+ if (unlikely((val & GENMASK(IDPF_TXD_COMPLQ_GEN_S - 1, 0)) !=
+ FIELD_PREP(IDPF_TXD_COMPLQ_COMPL_TYPE_M,
+ IDPF_TXD_COMPLT_RS)))
+ return -EINVAL;
+
+ return upper_16_bits(val);
+}
+
+static u32 idpf_xdpsq_poll(struct idpf_tx_queue *xdpsq, u32 budget)
+{
+ struct idpf_compl_queue *cq = xdpsq->complq;
+ u32 tx_ntc = xdpsq->next_to_clean;
+ u32 tx_cnt = xdpsq->desc_count;
+ u32 ntc = cq->next_to_clean;
+ u32 cnt = cq->desc_count;
+ u32 done_frames;
+ bool gen;
+
+ gen = idpf_queue_has(GEN_CHK, cq);
+
+ for (done_frames = 0; done_frames < budget; ) {
+ int ret;
+
+ ret = idpf_xdp_parse_cqe(&cq->comp_4b[ntc], gen);
+ if (ret >= 0) {
+ done_frames = ret > tx_ntc ? ret - tx_ntc :
+ ret + tx_cnt - tx_ntc;
+ goto next;
+ }
+
+ switch (ret) {
+ case -ENODATA:
+ goto out;
+ case -EINVAL:
+ break;
+ }
+
+next:
+ if (unlikely(++ntc == cnt)) {
+ ntc = 0;
+ gen = !gen;
+ idpf_queue_change(GEN_CHK, cq);
+ }
+ }
+
+out:
+ cq->next_to_clean = ntc;
+
+ return done_frames;
+}
+
+static u32 idpf_xdpsq_complete(void *_xdpsq, u32 budget)
+{
+ struct libeth_xdpsq_napi_stats ss = { };
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+ u32 tx_ntc = xdpsq->next_to_clean;
+ u32 tx_cnt = xdpsq->desc_count;
+ struct xdp_frame_bulk bq;
+ struct libeth_cq_pp cp = {
+ .dev = xdpsq->dev,
+ .bq = &bq,
+ .xss = &ss,
+ .napi = true,
+ };
+ u32 done_frames;
+
+ done_frames = idpf_xdpsq_poll(xdpsq, budget);
+ if (unlikely(!done_frames))
+ return 0;
+
+ xdp_frame_bulk_init(&bq);
+
+ for (u32 i = 0; likely(i < done_frames); i++) {
+ libeth_xdp_complete_tx(&xdpsq->tx_buf[tx_ntc], &cp);
+
+ if (unlikely(++tx_ntc == tx_cnt))
+ tx_ntc = 0;
+ }
+
+ xdp_flush_frame_bulk(&bq);
+
+ xdpsq->next_to_clean = tx_ntc;
+ xdpsq->pending -= done_frames;
+ xdpsq->xdp_tx -= cp.xdp_tx;
+
+ return done_frames;
+}
+
+static u32 idpf_xdp_tx_prep(void *_xdpsq, struct libeth_xdpsq *sq)
+{
+ struct idpf_tx_queue *xdpsq = _xdpsq;
+ u32 free;
+
+ libeth_xdpsq_lock(&xdpsq->xdp_lock);
+
+ free = xdpsq->desc_count - xdpsq->pending;
+ if (free < xdpsq->thresh)
+ free += idpf_xdpsq_complete(xdpsq, xdpsq->thresh);
+
+ *sq = (struct libeth_xdpsq){
+ .sqes = xdpsq->tx_buf,
+ .descs = xdpsq->desc_ring,
+ .count = xdpsq->desc_count,
+ .lock = &xdpsq->xdp_lock,
+ .ntu = &xdpsq->next_to_use,
+ .pending = &xdpsq->pending,
+ .xdp_tx = &xdpsq->xdp_tx,
+ };
+
+ return free;
+}
+
+LIBETH_XDP_DEFINE_START();
+LIBETH_XDP_DEFINE_TIMER(static idpf_xdp_tx_timer, idpf_xdpsq_complete);
+LIBETH_XDP_DEFINE_FLUSH_TX(idpf_xdp_tx_flush_bulk, idpf_xdp_tx_prep,
+ idpf_xdp_tx_xmit);
+LIBETH_XDP_DEFINE_END();
+
+void idpf_xdp_set_features(const struct idpf_vport *vport)
+{
+ if (!idpf_is_queue_model_split(vport->rxq_model))
+ return;
+
+ libeth_xdp_set_features_noredir(vport->netdev);
+}
+
static int idpf_xdp_setup_prog(struct idpf_vport *vport,
const struct netdev_bpf *xdp)
{
--
2.50.1
On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.

Could you perhaps quantify the goodness of the boost with a number? :)
From: Jakub Kicinski <kuba@kernel.org>
Date: Fri, 1 Aug 2025 15:33:43 -0700

> On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
>> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
>> applicable. It really gives some good boosts and code size reduction
>> on x86_64.
>
> Could you perhaps quantify the goodness of the boost with a number? :)

Sure, only a matter of switching this definition and running the tests
(and bloat-o-meter).
Intel doesn't allow us to publish raw numbers (Gbps/Mpps), I hope the
diff in percents (+ bloat-o-meter output) would be enough?

Thanks,
Olek
On Tue, 5 Aug 2025 18:09:40 +0200 Alexander Lobakin wrote:
> From: Jakub Kicinski <kuba@kernel.org>
> Date: Fri, 1 Aug 2025 15:33:43 -0700
>
> > On Wed, 30 Jul 2025 18:07:15 +0200 Alexander Lobakin wrote:
> >> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> >> applicable. It really gives some good boosts and code size reduction
> >> on x86_64.
> >
> > Could you perhaps quantify the goodness of the boost with a number? :)
>
> Sure, only a matter of switching this definition and running the tests
> (and bloat-o-meter).
> Intel doesn't allow us to publish raw numbers (Gbps/Mpps), I hope the
> diff in percents (+ bloat-o-meter output) would be enough?

Yes, delta is perfect. Absolute numbers aren't very meaningful if you
don't specify all HW components and FW versions, and direction of wind
on the day, anyway :$
On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
> Use libeth XDP infra to support running XDP program on Rx polling.
> This includes all of the possible verdicts/actions.
> XDP Tx queues are cleaned only in "lazy" mode when there are less than
> 1/4 free descriptors left on the ring. libeth helper macros to define
> driver-specific XDP functions make sure the compiler could uninline
> them when needed.
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.
>
> Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

...

> diff --git a/drivers/net/ethernet/intel/idpf/idpf_txrx.c b/drivers/net/ethernet/intel/idpf/idpf_txrx.c

...

> @@ -3127,14 +3125,12 @@ static bool idpf_rx_process_skb_fields(struct sk_buff *skb,
>  	return !__idpf_rx_process_skb_fields(rxq, skb, xdp->desc);
>  }
>
> -static void
> -idpf_xdp_run_pass(struct libeth_xdp_buff *xdp, struct napi_struct *napi,
> -		  struct libeth_rq_napi_stats *ss,
> -		  const struct virtchnl2_rx_flex_desc_adv_nic_3 *desc)
> -{
> -	libeth_xdp_run_pass(xdp, NULL, napi, ss, desc, NULL,
> -			    idpf_rx_process_skb_fields);
> -}
> +LIBETH_XDP_DEFINE_START();
> +LIBETH_XDP_DEFINE_RUN(static idpf_xdp_run_pass, idpf_xdp_run_prog,
> +		      idpf_xdp_tx_flush_bulk, idpf_rx_process_skb_fields);
> +LIBETH_XDP_DEFINE_FINALIZE(static idpf_xdp_finalize_rx, idpf_xdp_tx_flush_bulk,
> +			   idpf_xdp_tx_finalize);
> +LIBETH_XDP_DEFINE_END();
>
>  /**
>   * idpf_rx_hsplit_wa - handle header buffer overflows and split errors
> @@ -3222,7 +3218,10 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
>  	struct libeth_rq_napi_stats rs = { };
>  	u16 ntc = rxq->next_to_clean;
>  	LIBETH_XDP_ONSTACK_BUFF(xdp);
> +	LIBETH_XDP_ONSTACK_BULK(bq);
>
> +	libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
> +				rxq->xdpsqs, rxq->num_xdp_txq);
>  	libeth_xdp_init_buff(xdp, &rxq->xdp, &rxq->xdp_rxq);
>
>  	/* Process Rx packets bounded by budget */
> @@ -3318,11 +3317,13 @@ static int idpf_rx_splitq_clean(struct idpf_rx_queue *rxq, int budget)
>  		if (!idpf_rx_splitq_is_eop(rx_desc) || unlikely(!xdp->data))
>  			continue;
>
> -		idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
> +		idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
>  	}
>
>  	rxq->next_to_clean = ntc;
> +
>  	libeth_xdp_save_buff(&rxq->xdp, xdp);
> +	idpf_xdp_finalize_rx(&bq);

This will call __libeth_xdp_finalize_rx(), which calls rcu_read_unlock().
But there doesn't seem to be a corresponding call to rcu_read_lock()

Flagged by Sparse.

>
>  	u64_stats_update_begin(&rxq->stats_sync);
>  	u64_stats_add(&rxq->q_stats.packets, rs.packets);

...
From: Simon Horman <horms@kernel.org>
Date: Thu, 31 Jul 2025 14:35:57 +0100

> On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
>> Use libeth XDP infra to support running XDP program on Rx polling.

...

>> -		idpf_xdp_run_pass(xdp, rxq->napi, &rs, rx_desc);
>> +		idpf_xdp_run_pass(xdp, &bq, rxq->napi, &rs, rx_desc);
>>  	}
>>
>>  	rxq->next_to_clean = ntc;
>> +
>>  	libeth_xdp_save_buff(&rxq->xdp, xdp);
>> +	idpf_xdp_finalize_rx(&bq);
>
> This will call __libeth_xdp_finalize_rx(), which calls rcu_read_unlock().
> But there doesn't seem to be a corresponding call to rcu_read_lock()
>
> Flagged by Sparse.

It's a false positive, rcu_read_lock() is called in tx_init_bulk().

>
>>  	u64_stats_update_begin(&rxq->stats_sync);
>>  	u64_stats_add(&rxq->q_stats.packets, rs.packets);

Thanks,
Olek
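For readers hitting the same sparse warning, the pairing described above can
be sketched roughly as follows (simplified view of the Rx poll path; the
comments about where the RCU read lock is taken and released reflect the
explanation in this thread, not code shown in the patch itself):

	LIBETH_XDP_ONSTACK_BULK(bq);

	/* rcu_read_lock() is taken inside the bulk init helper, so the
	 * XDP prog pointer stays valid for the whole polling cycle ...
	 */
	libeth_xdp_tx_init_bulk(&bq, rxq->xdp_prog, rxq->xdp_rxq.dev,
				rxq->xdpsqs, rxq->num_xdp_txq);

	/* ... per-descriptor processing, idpf_xdp_run_pass() ... */

	/* ... and the matching rcu_read_unlock() happens here, in a
	 * different helper, which is why sparse sees it as unbalanced.
	 */
	idpf_xdp_finalize_rx(&bq);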
+ Kees, linux-hardening

On Wed, Jul 30, 2025 at 06:07:15PM +0200, Alexander Lobakin wrote:
> Use libeth XDP infra to support running XDP program on Rx polling.
> This includes all of the possible verdicts/actions.
> XDP Tx queues are cleaned only in "lazy" mode when there are less than
> 1/4 free descriptors left on the ring. libeth helper macros to define
> driver-specific XDP functions make sure the compiler could uninline
> them when needed.
> Use __LIBETH_WORD_ACCESS to parse descriptors more efficiently when
> applicable. It really gives some good boosts and code size reduction
> on x86_64.
>
> Co-developed-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Michal Kubiak <michal.kubiak@intel.com>
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>

Hi Alexander, all,

Sorry for providing review of __LIBETH_WORD_ACCESS[1] after the fact.
I had missed it earlier.

While I appreciate the desire for improved performance and nicer code
generation. I think the idea of writing 64 bits of data to the
address of a 32 bit member of a structure goes against the direction
of hardening work by Kees and others.

Indeed, it seems to me this is the kind of thing that struct_group()
aims to avoid.

In this case struct group() doesn't seem like the best option,
because it would provide a 64-bit buffer that we can memcpy into.
But it seems altogether better to simply assign u64 value to a u64 member.

So I'm wondering if an approach along the following lines is appropriate
(Very lightly compile tested only!).

And yes, there is room for improvement of the wording of the comment
I included below.

diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h
index f4880b50e804..a7d3d8e44aa6 100644
--- a/include/net/libeth/xdp.h
+++ b/include/net/libeth/xdp.h
@@ -1283,11 +1283,7 @@ static inline void libeth_xdp_prepare_buff(struct libeth_xdp_buff *xdp,
 	const struct page *page = __netmem_to_page(fqe->netmem);

 #ifdef __LIBETH_WORD_ACCESS
-	static_assert(offsetofend(typeof(xdp->base), flags) -
-		      offsetof(typeof(xdp->base), frame_sz) ==
-		      sizeof(u64));
-
-	*(u64 *)&xdp->base.frame_sz = fqe->truesize;
+	xdp->base.frame_sz_le_qword = fqe->truesize;
 #else
 	xdp_init_buff(&xdp->base, fqe->truesize, xdp->base.rxq);
 #endif
diff --git a/include/net/xdp.h b/include/net/xdp.h
index b40f1f96cb11..b5eedeb82c9b 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -85,8 +85,19 @@ struct xdp_buff {
 	void *data_hard_start;
 	struct xdp_rxq_info *rxq;
 	struct xdp_txq_info *txq;
-	u32 frame_sz; /* frame size to deduce data_hard_end/reserved tailroom*/
-	u32 flags; /* supported values defined in xdp_buff_flags */
+	union {
+		/* Allow setting frame_sz and flags as a single u64 on
+		 * little endian systems. This may may give optimal
+		 * performance. */
+		u64 frame_sz_le_qword;
+		struct {
+			/* Frame size to deduce data_hard_end/reserved
+			 * tailroom. */
+			u32 frame_sz;
+			/* Supported values defined in xdp_buff_flags. */
+			u32 flags;
+		};
+	};
 };

[1] https://git.kernel.org/torvalds/c/80bae9df2108

...
On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:
> While I appreciate the desire for improved performance and nicer code
> generation. I think the idea of writing 64 bits of data to the
> address of a 32 bit member of a structure goes against the direction
> of hardening work by Kees and others.

Agreed: it's better to avoid obscuring these details from the compiler
so it can have an "actual" view of the object sizes involved.

> Indeed, it seems to me this is the kind of thing that struct_group()
> aims to avoid.
>
> In this case struct group() doesn't seem like the best option,
> because it would provide a 64-bit buffer that we can memcpy into.
> But it seems altogether better to simply assign u64 value to a u64 member.

Agreed: with struct_group you get a sized pointer, and while you can
provide a struct tag to make it an assignable object, it doesn't make
too much sense here.

> So I'm wondering if an approach along the following lines is appropriate
> (Very lightly compile tested only!).
>
> And yes, there is room for improvement of the wording of the comment
> I included below.
>
> diff --git a/include/net/libeth/xdp.h b/include/net/libeth/xdp.h

...

Yeah, this looks like a nice way to express this, and is way more
descriptive than "(u64 *)&xdp->base.frame_sz" :)

-- 
Kees Cook
From: Kees Cook <kees@kernel.org>
Date: Thu, 31 Jul 2025 10:05:47 -0700

> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:
>> While I appreciate the desire for improved performance and nicer code
>> generation. I think the idea of writing 64 bits of data to the
>> address of a 32 bit member of a structure goes against the direction
>> of hardening work by Kees and others.
>
> Agreed: it's better to avoid obscuring these details from the compiler
> so it can have an "actual" view of the object sizes involved.

...

> Yeah, this looks like a nice way to express this, and is way more
> descriptive than "(u64 *)&xdp->base.frame_sz" :)

Sounds good to me!

Let me send v4 where I'll fix this.

Thanks,
Olek
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Fri, 1 Aug 2025 15:12:43 +0200

> From: Kees Cook <kees@kernel.org>
> Date: Thu, 31 Jul 2025 10:05:47 -0700
>
>> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

>> Yeah, this looks like a nice way to express this, and is way more
>> descriptive than "(u64 *)&xdp->base.frame_sz" :)
>
> Sounds good to me!
>
> Let me send v4 where I'll fix this.

Note: would it be okay if I send v4 with this fix when the window opens,
while our validation will retest v3 from Tony's tree in the meantime? It's a
cosmetic change anyway and does not involve any functional changes.

Thanks,
Olek
On Fri, Aug 01, 2025 at 03:17:42PM +0200, Alexander Lobakin wrote:
> From: Alexander Lobakin <aleksander.lobakin@intel.com>
> Date: Fri, 1 Aug 2025 15:12:43 +0200
>
> > From: Kees Cook <kees@kernel.org>
> > Date: Thu, 31 Jul 2025 10:05:47 -0700
> >
> >> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

> > Sounds good to me!
> >
> > Let me send v4 where I'll fix this.
>
> Note: would it be okay if I send v4 with this fix when the window opens,
> while our validation will retest v3 from Tony's tree in the meantime? It's a
> cosmetic change anyway and does not involve any functional changes.

If this is directed at me, yeah, I don't see any high urgency here.

-- 
Kees Cook
On Sat, Aug 02, 2025 at 11:52:44AM -0700, Kees Cook wrote:
> On Fri, Aug 01, 2025 at 03:17:42PM +0200, Alexander Lobakin wrote:
> > From: Alexander Lobakin <aleksander.lobakin@intel.com>
> > Date: Fri, 1 Aug 2025 15:12:43 +0200
> >
> > > From: Kees Cook <kees@kernel.org>
> > > Date: Thu, 31 Jul 2025 10:05:47 -0700
> > >
> > >> On Thu, Jul 31, 2025 at 01:37:34PM +0100, Simon Horman wrote:

...

> > >> Yeah, this looks like a nice way to express this, and is way more
> > >> descriptive than "(u64 *)&xdp->base.frame_sz" :)
> > >
> > > Sounds good to me!
> > >
> > > Let me send v4 where I'll fix this.
> >
> > Note: would it be okay if I send v4 with this fix when the window opens,
> > while our validation will retest v3 from Tony's tree in the meantime? It's a
> > cosmetic change anyway and does not involve any functional changes.
>
> If this is directed at me, yeah, I don't see any high urgency here.

Likewise, I see no urgency.