[PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests

atwellwea@gmail.com posted 14 patches 1 week ago
[PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
Posted by atwellwea@gmail.com 1 week ago
From: Wesley Atwell <atwellwea@gmail.com>

Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
packetdrill-side helper needed to drive that ioctl through packetdrill's
own TUN queue file descriptor.

Use that plumbing to cover the receive-window regressions where
scaling_ratio drifts after advertisement, alongside the baseline too-big
packetdrill cases that exercise the same sender-visible rwnd accounting
from the non-injected path.

Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
 drivers/net/tun.c                             |  65 ++++++++
 include/uapi/linux/if_tun.h                   |   4 +
 .../tcp_rcv_neg_window_truesize.pkt           | 143 ++++++++++++++++++
 .../net/packetdrill/tcp_rcv_toobig.pkt        |  35 +++++
 .../packetdrill/tcp_rcv_toobig_default.pkt    |  97 ++++++++++++
 .../tcp_rcv_toobig_default_truesize.pkt       | 118 +++++++++++++++
 .../tcp_rcv_wnd_shrink_allowed_truesize.pkt   |  49 ++++++
 tools/testing/selftests/net/tun.c             | 140 ++++++++++++++++-
 8 files changed, 650 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
 create mode 100644 tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index c492fda6fc15..2cef62cebe88 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -53,6 +53,7 @@
 #include <linux/if_ether.h>
 #include <linux/if_tun.h>
 #include <linux/if_vlan.h>
+#include <linux/overflow.h>
 #include <linux/crc32.h>
 #include <linux/math.h>
 #include <linux/nsproxy.h>
@@ -85,8 +86,13 @@
 
 #include "tun_vnet.h"
 
+struct tun_file;
+
+#define TUNSETTRUESIZE_OLD _IOW('T', 228, unsigned int)
+
 static void tun_default_link_ksettings(struct net_device *dev,
 				       struct ethtool_link_ksettings *cmd);
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb);
 
 #define TUN_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
 
@@ -138,6 +144,7 @@ struct tun_file {
 		u16 queue_index;
 		unsigned int ifindex;
 	};
+	u32 rx_extra_truesize;
 	struct napi_struct napi;
 	bool napi_enabled;
 	bool napi_frags_enabled;
@@ -1817,6 +1824,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 		goto free_skb;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	switch (tun->flags & TUN_TYPE_MASK) {
 	case IFF_TUN:
 		if (tun->flags & IFF_NO_PI) {
@@ -2373,6 +2381,25 @@ static void tun_put_page(struct tun_page *tpage)
 		__page_frag_cache_drain(tpage->page, tpage->count);
 }
 
+/* Tests can inflate skb->truesize on ingress to exercise receive-memory
+ * accounting against a scaling_ratio that drifts after a window was
+ * advertised. The knob is per queue file, defaults to zero, and only changes
+ * behavior when explicitly enabled through the TUN fd.
+ */
+static void tun_rx_update_truesize(struct tun_file *tfile, struct sk_buff *skb)
+{
+	u32 extra = READ_ONCE(tfile->rx_extra_truesize);
+	unsigned int truesize;
+
+	if (!extra)
+		return;
+
+	if (check_add_overflow(skb->truesize, extra, &truesize))
+		truesize = UINT_MAX;
+
+	skb->truesize = truesize;
+}
+
 static int tun_xdp_one(struct tun_struct *tun,
 		       struct tun_file *tfile,
 		       struct xdp_buff *xdp, int *flush,
@@ -2459,6 +2486,7 @@ static int tun_xdp_one(struct tun_struct *tun,
 		goto out;
 	}
 
+	tun_rx_update_truesize(tfile, skb);
 	skb->protocol = eth_type_trans(skb, tun->dev);
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb);
@@ -3045,6 +3073,7 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	struct tun_struct *tun;
 	void __user* argp = (void __user*)arg;
 	unsigned int carrier;
+	unsigned int extra_truesize;
 	struct ifreq ifr;
 	kuid_t owner;
 	kgid_t group;
@@ -3309,6 +3338,40 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 		ret = tun_net_change_carrier(tun->dev, (bool)carrier);
 		break;
 
+	/* Support both the legacy pointer-payload form and the scalar form
+	 * used by the selftest helper when injecting truesize from
+	 * packetdrill shell commands.
+	 */
+	case TUNSETTRUESIZE:
+	case TUNSETTRUESIZE_OLD:
+		ret = -EPERM;
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			goto unlock;
+
+		if (cmd == TUNSETTRUESIZE_OLD) {
+			ret = -EFAULT;
+			if (copy_from_user(&extra_truesize, argp,
+					   sizeof(extra_truesize))) {
+				ret = -EINVAL;
+				if (arg > U32_MAX)
+					goto unlock;
+
+				extra_truesize = arg;
+			}
+		} else {
+			ret = -EINVAL;
+			if (arg > U32_MAX)
+				goto unlock;
+
+			extra_truesize = arg;
+		}
+
+		WRITE_ONCE(tfile->rx_extra_truesize, extra_truesize);
+		netif_info(tun, drv, tun->dev,
+			   "rx extra truesize set to %u\n", extra_truesize);
+		ret = 0;
+		break;
+
 	case TUNGETDEVNETNS:
 		ret = -EPERM;
 		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -3348,6 +3411,7 @@ static long tun_chr_compat_ioctl(struct file *file,
 	case TUNGETSNDBUF:
 	case TUNSETSNDBUF:
 	case SIOCGIFHWADDR:
+	case TUNSETTRUESIZE_OLD:
 	case SIOCSIFHWADDR:
 		arg = (unsigned long)compat_ptr(arg);
 		break;
@@ -3408,6 +3472,7 @@ static int tun_chr_open(struct inode *inode, struct file * file)
 	RCU_INIT_POINTER(tfile->tun, NULL);
 	tfile->flags = 0;
 	tfile->ifindex = 0;
+	tfile->rx_extra_truesize = 0;
 
 	init_waitqueue_head(&tfile->socket.wq.wait);
 
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 79d53c7a1ebd..4be63efe6540 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -61,6 +61,10 @@
 #define TUNSETFILTEREBPF _IOR('T', 225, int)
 #define TUNSETCARRIER _IOW('T', 226, int)
 #define TUNGETDEVNETNS _IO('T', 227)
+/* Test-only: add scalar bytes to skb->truesize on RX after TUN allocates
+ * an skb.
+ */
+#define TUNSETTRUESIZE _IO('T', 228)
 
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
new file mode 100644
index 000000000000..1c5550fff509
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_neg_window_truesize.pkt
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-2.0
+// Run the negative-window / max-advertised-window regression with inflated
+// TUN skb->truesize so scaling_ratio drifts throughout the flow. The sequence
+// checks and drop counters should remain identical to the uninflated case.
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [1000000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 65535 <mss 1460,nop,nop,sackOK,nop,wscale 4>
+   +0 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Put 1040000 bytes into the receive buffer.
+   +0 < P. 1:65001(65000) ack 1 win 257
+    * > .  1:1(0) ack 65001
+   +0 < P. 65001:130001(65000) ack 1 win 257
+    * > .  1:1(0) ack 130001
+   +0 < P. 130001:195001(65000) ack 1 win 257
+    * > .  1:1(0) ack 195001
+   +0 < P. 195001:260001(65000) ack 1 win 257
+    * > .  1:1(0) ack 260001
+   +0 < P. 260001:325001(65000) ack 1 win 257
+    * > .  1:1(0) ack 325001
+   +0 < P. 325001:390001(65000) ack 1 win 257
+    * > .  1:1(0) ack 390001
+   +0 < P. 390001:455001(65000) ack 1 win 257
+    * > .  1:1(0) ack 455001
+   +0 < P. 455001:520001(65000) ack 1 win 257
+    * > .  1:1(0) ack 520001
+   +0 < P. 520001:585001(65000) ack 1 win 257
+    * > .  1:1(0) ack 585001
+   +0 < P. 585001:650001(65000) ack 1 win 257
+    * > .  1:1(0) ack 650001
+   +0 < P. 650001:715001(65000) ack 1 win 257
+    * > .  1:1(0) ack 715001
+   +0 < P. 715001:780001(65000) ack 1 win 257
+    * > .  1:1(0) ack 780001
+   +0 < P. 780001:845001(65000) ack 1 win 257
+    * > .  1:1(0) ack 845001
+   +0 < P. 845001:910001(65000) ack 1 win 257
+    * > .  1:1(0) ack 910001
+   +0 < P. 910001:975001(65000) ack 1 win 257
+    * > .  1:1(0) ack 975001
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001
+
+// Start inflating future TUN skbs only after the baseline sender-visible
+// window has been established, so the negative-window checks below exercise
+// ratio drift without changing the initial max advertised window.
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+// Trigger an extreme memory squeeze by shrinking SO_RCVBUF.
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [16000], 4) = 0
+
+   +0 < P. 1040001:1105001(65000) ack 1 win 257
+    * > .  1:1(0) ack 1040001 win 0
+// Check LINUX_MIB_TCPRCVQDROP has been incremented.
+   +0 `nstat -s | grep TcpExtTCPRcvQDrop | grep -q " 1 "`
+
+// RWIN == 0: rcv_wup = 1040001, rcv_wnd = 0, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P. 1:1001(1000) ack 1040001 win 0
+   +0 < .  1105001:1105001(0) ack 1001 win 257
+
+// In order segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1040001:1041001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo partial segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_ZEROWINDOW).
+   +0 < P. 1039001:1041001(2000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0 <nop,nop,sack 1039001:1040001>
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented twice.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 2 "`
+
+// Ooo segment, in max adv. window -> drop (SKB_DROP_REASON_TCP_OVERWINDOW).
+   +0 < P. 1105001:1106001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Ooo segment, beyond max adv. window -> drop (SKB_DROP_REASON_TCP_INVALID_SEQUENCE).
+   +0 < P. 2000001:2001001(1000) ack 1001 win 257
+   +0 > .  1001:1001(0) ack 1040001 win 0
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 2 "`
+
+// Read all data.
+   +0 read(4, ..., 2000000) = 1040000
+    * > .  1001:1001(0) ack 1040001
+
+// RWIN > 0: rcv_wup = 1040001, 0 < rcv_wnd < 32000, rcv_mwnd_seq > 1105001.
+
+// Accept pure ack with seq in max adv. window, beyond adv. window.
+   +0 write(4, ..., 1000) = 1000
+   +0 > P.  1001:2001(1000) ack 1040001
+   +0 < . 1105001:1105001(0) ack 2001 win 257
+
+// In order segment, in max adv. window, in adv. window -> accept.
+   +0 < P. 1040001:1041001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1041001
+
+// Ooo partial segment, in adv. window -> accept.
+   +0 < P. 1040001:1042001(2000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1042001 <nop,nop,sack 1040001:1041001>
+
+// Ooo segment, in max adv. window, beyond adv. window -> drop.
+   +0 < P. 1105001:1106001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Ooo segment, beyond max adv. window, beyond adv. window -> drop.
+   +0 < P. 2000001:2001001(1000) ack 2001 win 257
+   +0 > .  2001:2001(0) ack 1042001
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented twice more.
+   +0 `nstat -s | grep TcpExtBeyondWindow | grep -q " 4 "`
+
+// We are allowed to go beyond the window and buffer with one packet.
+   +0 < P. 1042001:1062001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1062001
+   +0 < P. 1062001:1082001(20000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001 win 0
+
+// But not more: in-order segment, in max adv. window -> drop.
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented again.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 3 "`
+
+// Another ratio drop must not change the final zero-window decision.
+   +0 `../tun --set-rx-truesize tun0 131072`
+
+   +0 < P. 1082001:1083001(1000) ack 2001 win 257
+    * > .  2001:2001(0) ack 1082001
+// Check LINUX_MIB_TCPZEROWINDOWDROP has been incremented once more.
+   +0 `nstat -s | grep TcpExtTCPZeroWindowDrop | grep -q " 4 "`
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
new file mode 100644
index 000000000000..837ba3633752
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig.pkt
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh`
+
+    0 `nstat -n`
+
+// Establish a connection.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 setsockopt(3, SOL_SOCKET, SO_RCVBUF, [20000], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 win 18980 <mss 1460,nop,wscale 0>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+   +0 < P. 1:20001(20000) ack 1 win 257
+ +.04 > .  1:1(0) ack 20001 win 18000
+
+   +0 setsockopt(4, SOL_SOCKET, SO_RCVBUF, [12000], 4) = 0
+   +0 < P. 20001:80001(60000) ack 1 win 257
+   +0 > .  1:1(0) ack 20001 win 18000
+
+   +0 read(4, ..., 20000) = 20000
+
+// A too big packet is accepted if the receive queue is empty, but the
+// stronger admission path must not zero the receive buffer while doing so.
+   +0 < P. 20001:80001(60000) ack 1 win 257
+    * > .  1:1(0) ack 80001 win 0
+   +0 %{ assert SK_MEMINFO_RCVBUF > 0, SK_MEMINFO_RCVBUF }%
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
new file mode 100644
index 000000000000..b2e4950e0b83
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default.pkt
@@ -0,0 +1,97 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. Leave a large skb in
+// the queue, then deliver another one which still fits the remaining rwnd.
+// We should grow sk_rcvbuf to honor the already-advertised window instead of
+// dropping the packet.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its 128kB default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+// Leave about 60kB queued, then accept another large skb which still fits
+// the rwnd we already exposed to the peer. The regression is the drop; the
+// exact sk_rcvbuf growth path is an implementation detail.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 read(4, ..., 127000) = 127000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
new file mode 100644
index 000000000000..c2ebe11d75f7
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_toobig_default_truesize.pkt
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_moderate_rcvbuf=0`
+
+// Establish a connection on the default receive buffer. The warmup traffic
+// keeps the socket in the normal data path without changing its default
+// sk_rcvbuf. Then inflate skb->truesize on future TUN RX packets so the live
+// scaling_ratio drops after we already exposed a larger rwnd to the peer.
+// The follow-up packet should still be admitted, and tcp_clamp_window() should
+// grow sk_rcvbuf to honor the sender-visible window instead of dropping data.
+   +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+   +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+   +0 bind(3, ..., ...) = 0
+   +0 listen(3, 1) = 0
+
+   +0 < S 0:0(0) win 65535 <mss 1000,nop,nop,sackOK,nop,wscale 7>
+   +0 > S. 0:0(0) ack 1 <...>
+  +.1 < . 1:1(0) ack 1 win 257
+
+   +0 accept(3, ..., ...) = 4
+
+// Exchange enough data to get past the completely fresh-socket case while
+// still keeping the receive buffer at its initial default.
+   +0 < P. 1:65001(65000) ack 1 win 257
+   * > .  1:1(0) ack 65001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 65001:130001(65000) ack 1 win 257
+   * > .  1:1(0) ack 130001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 130001:195001(65000) ack 1 win 257
+   * > .  1:1(0) ack 195001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 195001:260001(65000) ack 1 win 257
+   * > .  1:1(0) ack 260001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 260001:325001(65000) ack 1 win 257
+   * > .  1:1(0) ack 325001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 325001:390001(65000) ack 1 win 257
+   * > .  1:1(0) ack 390001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 390001:455001(65000) ack 1 win 257
+   * > .  1:1(0) ack 455001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 455001:520001(65000) ack 1 win 257
+   * > .  1:1(0) ack 520001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 520001:585001(65000) ack 1 win 257
+   * > .  1:1(0) ack 585001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 585001:650001(65000) ack 1 win 257
+   * > .  1:1(0) ack 650001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 650001:715001(65000) ack 1 win 257
+   * > .  1:1(0) ack 715001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 715001:780001(65000) ack 1 win 257
+   * > .  1:1(0) ack 780001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 780001:845001(65000) ack 1 win 257
+   * > .  1:1(0) ack 845001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 845001:910001(65000) ack 1 win 257
+   * > .  1:1(0) ack 910001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 910001:975001(65000) ack 1 win 257
+   * > .  1:1(0) ack 975001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 < P. 975001:1040001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1040001
+   +0 read(4, ..., 65000) = 65000
+
+   +0 %{ base_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+// Leave about 60kB queued, then make future TUN skbs look more expensive in
+// two steps. Both inflated skbs still fit the already-advertised window and
+// must be admitted, and sk_rcvbuf should keep growing as the live
+// scaling_ratio drops further.
+   +0 < P. 1040001:1102001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1102001
+
+   +0 `../tun --set-rx-truesize tun0 4096`
+
+   +0 < P. 1102001:1167001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1167001
+   +0 %{ assert SK_MEMINFO_RCVBUF > base_rcvbuf, (base_rcvbuf, SK_MEMINFO_RCVBUF) }%
+   +0 %{ small_rcvbuf = SK_MEMINFO_RCVBUF }%
+
+   +0 < P. 1167001:1229001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1229001
+
+   +0 `../tun --set-rx-truesize tun0 65536`
+
+   +0 < P. 1229001:1294001(65000) ack 1 win 257
+   * > .  1:1(0) ack 1294001
+   +0 %{ assert SK_MEMINFO_RCVBUF > small_rcvbuf, (base_rcvbuf, small_rcvbuf, SK_MEMINFO_RCVBUF) }%
+
+   +0 < P. 1294001:1356001(62000) ack 1 win 257
+   * > .  1:1(0) ack 1356001
+   +0 read(4, ..., 254000) = 254000
diff --git a/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
new file mode 100644
index 000000000000..08da5fddaa12
--- /dev/null
+++ b/tools/testing/selftests/net/packetdrill/tcp_rcv_wnd_shrink_allowed_truesize.pkt
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: GPL-2.0
+
+--mss=1000
+
+`./defaults.sh
+sysctl -q net.ipv4.tcp_shrink_window=1
+sysctl -q net.ipv4.tcp_rmem="4096 32768 $((32*1024*1024))"`
+
+   0 `nstat -n`
+
+// Establish a connection. After the first payload we know the peer has seen a
+// scaled receive window reaching sequence 25361. Inflate later TUN skbs in two
+// steps so the live scaling_ratio drops more than once, then verify that:
+//   1) a segment one byte beyond the max advertised window is still dropped,
+//   2) a segment exactly using the previously advertised max window is still
+//      accepted even though the current live ratio no longer matches that
+//      original advertisement basis.
+  +0 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+  +0 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+  +0 bind(3, ..., ...) = 0
+  +0 listen(3, 1) = 0
+
+  +0 < S 0:0(0) win 32792 <mss 1000,nop,wscale 7>
+  +0 > S. 0:0(0) ack 1 <mss 1460,nop,wscale 10>
+  +0 < . 1:1(0) ack 1 win 257
+
+  +0 accept(3, ..., ...) = 4
+
+  +0 < P. 1:10001(10000) ack 1 win 257
+   * > .  1:1(0) ack 10001 win 15
+
+// Max window seq advertised here is 10001 + 15*1024 = 25361.
+  +0 `../tun --set-rx-truesize tun0 4096`
+
+  +0 < P. 10001:11024(1023) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+  +0 `../tun --set-rx-truesize tun0 65536`
+
+// Segment beyond the max window stays invalid even after ratio drift.
+  +0 < P. 11024:25362(14338) ack 1 win 257
+   * > .  1:1(0) ack 11024
+
+// Segment exactly using the max window must still be accepted.
+  +0 < P. 11024:25361(14337) ack 1 win 257
+   * > .  1:1(0) ack 25361
+
+// Check LINUX_MIB_BEYOND_WINDOW has been incremented once.
+  +0 `nstat | grep TcpExtBeyondWindow | grep -q " 1 "`
diff --git a/tools/testing/selftests/net/tun.c b/tools/testing/selftests/net/tun.c
index cf106a49b55e..473992b3784d 100644
--- a/tools/testing/selftests/net/tun.c
+++ b/tools/testing/selftests/net/tun.c
@@ -2,14 +2,17 @@
 
 #define _GNU_SOURCE
 
+#include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
+#include <limits.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 #include <linux/if_tun.h>
 #include <sys/ioctl.h>
+#include <sys/syscall.h>
 #include <sys/socket.h>
 
 #include "kselftest_harness.h"
@@ -174,6 +177,135 @@ static int tun_delete(char *dev)
 	return ip_link_del(dev);
 }
 
+static bool is_numeric_name(const char *name)
+{
+	for (; *name; name++) {
+		if (*name < '0' || *name > '9')
+			return false;
+	}
+
+	return true;
+}
+
+static int packetdrill_dup_fd(int pidfd, const char *fd_name)
+{
+	char *end;
+	unsigned long tmp;
+
+	errno = 0;
+	tmp = strtoul(fd_name, &end, 10);
+	if (errno || *end || tmp > INT_MAX) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	return syscall(SYS_pidfd_getfd, pidfd, (int)tmp, 0);
+}
+
+static int open_packetdrill_tunfd(pid_t pid, const char *ifname)
+{
+	char fd_dir[PATH_MAX];
+	struct dirent *dent;
+	struct ifreq ifr = {};
+	int pidfd;
+	int saved_errno = ENOENT;
+	DIR *dir;
+
+	snprintf(fd_dir, sizeof(fd_dir), "/proc/%ld/fd", (long)pid);
+
+	pidfd = syscall(SYS_pidfd_open, pid, 0);
+	if (pidfd < 0)
+		return -1;
+
+	dir = opendir(fd_dir);
+	if (!dir) {
+		close(pidfd);
+		return -1;
+	}
+
+	while ((dent = readdir(dir))) {
+		int fd;
+
+		if (!is_numeric_name(dent->d_name))
+			continue;
+
+		/* Reopen via pidfd_getfd() so we duplicate packetdrill's attached
+		 * queue file, instead of opening a fresh /dev/net/tun instance.
+		 */
+		fd = packetdrill_dup_fd(pidfd, dent->d_name);
+		if (fd < 0) {
+			saved_errno = errno;
+			continue;
+		}
+
+		memset(&ifr, 0, sizeof(ifr));
+		if (!ioctl(fd, TUNGETIFF, &ifr) &&
+		    !strncmp(ifr.ifr_name, ifname, IFNAMSIZ)) {
+			close(pidfd);
+			closedir(dir);
+			return fd;
+		}
+
+		if (errno)
+			saved_errno = errno;
+		close(fd);
+	}
+
+	close(pidfd);
+	closedir(dir);
+	errno = saved_errno;
+	return -1;
+}
+
+/* Packetdrill owns the TUN queue fd, so drive the test ioctl through that
+ * exact file descriptor found under /proc/$PACKETDRILL_PID/fd.
+ */
+static int packetdrill_set_rx_truesize(const char *ifname, const char *value)
+{
+	char *packetdrill_pid, *end;
+	unsigned long long tmp;
+	unsigned int extra;
+	pid_t pid;
+	int fd;
+
+	packetdrill_pid = getenv("PACKETDRILL_PID");
+	if (!packetdrill_pid || !*packetdrill_pid) {
+		fprintf(stderr, "PACKETDRILL_PID is not set\n");
+		return 1;
+	}
+
+	errno = 0;
+	tmp = strtoull(packetdrill_pid, &end, 10);
+	if (errno || *end || !tmp || tmp > INT_MAX) {
+		fprintf(stderr, "invalid PACKETDRILL_PID: %s\n", packetdrill_pid);
+		return 1;
+	}
+	pid = (pid_t)tmp;
+
+	errno = 0;
+	tmp = strtoull(value, &end, 0);
+	if (errno || *end || tmp > UINT_MAX) {
+		fprintf(stderr, "invalid truesize value: %s\n", value);
+		return 1;
+	}
+	extra = (unsigned int)tmp;
+
+	fd = open_packetdrill_tunfd(pid, ifname);
+	if (fd < 0) {
+		perror("open_packetdrill_tunfd");
+		return 1;
+	}
+
+	if (ioctl(fd, TUNSETTRUESIZE, (unsigned long)extra)) {
+		perror("ioctl(TUNSETTRUESIZE)");
+		close(fd);
+		return 1;
+	}
+
+	close(fd);
+	return 0;
+}
+
 static int tun_open(char *dev, const int flags, const int hdrlen,
 		    const int features, const unsigned char *mac_addr)
 {
@@ -985,4 +1117,10 @@ XFAIL_ADD(tun_vnet_udptnl, 6in4_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 4in6_over_maxbytes, recv_gso_packet);
 XFAIL_ADD(tun_vnet_udptnl, 6in6_over_maxbytes, recv_gso_packet);
 
-TEST_HARNESS_MAIN
+int main(int argc, char **argv)
+{
+	if (argc == 4 && !strcmp(argv[1], "--set-rx-truesize"))
+		return packetdrill_set_rx_truesize(argv[2], argv[3]);
+
+	return test_harness_run(argc, argv);
+}
-- 
2.43.0
Re: [PATCH net-next v2 12/14] tun/selftests: add RX truesize injection for TCP window tests
Posted by Jakub Kicinski 1 week ago
On Sat, 14 Mar 2026 14:13:46 -0600 atwellwea@gmail.com wrote:
> Add a test-only TUN ioctl that inflates RX skb->truesize, plus the
> packetdrill-side helper needed to drive that ioctl through packetdrill's
> own TUN queue file descriptor.
> 
> Use that plumbing to cover the receive-window regressions where
> scaling_ratio drifts after advertisement, alongside the baseline too-big
> packetdrill cases that exercise the same sender-visible rwnd accounting
> from the non-injected path.

You missed adding the tun program to the Makefile.

  tcp-rcv-neg-window-truesize-pkt                                               
  tcp-rcv-toobig-default-truesize-pkt                                           
  tcp-rcv-wnd-shrink-allowed-truesize-pkt                                       
                                                                               
Fail with:                                                            

  sh: line 1: ../tun: No such file or directory                                 
  tcp_rcv_*_truesize.pkt:*: error executing `../tun --set-rx-truesize tun0 6553\
6` command: non-zero status 127        

https://netdev.bots.linux.dev/contest.html?pw-n=0&branch=net-next-2026-03-15--00-00&pw-n=0&pass=0