[PATCH v2] bpf: guard sock_ops rtt_min access with is_locked_tcp_sock

Werner Kasselman posted 1 patch 2 months, 1 week ago
net/core/filter.c                             | 53 +++++++++++++++----
.../selftests/bpf/prog_tests/tcpbpf_user.c    |  9 ++++
.../selftests/bpf/progs/test_tcpbpf_kern.c    | 21 ++++++++
tools/testing/selftests/bpf/test_tcpbpf.h     |  6 +++
4 files changed, 79 insertions(+), 10 deletions(-)
[PATCH v2] bpf: guard sock_ops rtt_min access with is_locked_tcp_sock
Posted by Werner Kasselman 2 months, 1 week ago
sock_ops_convert_ctx_access() emits guarded reads for tcp_sock-backed
bpf_sock_ops fields such as snd_cwnd, srtt_us, snd_ssthresh, rcv_nxt,
snd_nxt, snd_una, mss_cache, ecn_flags, rate_delivered, and
rate_interval_us. Those accesses go through SOCK_OPS_GET_TCP_SOCK_FIELD(),
which checks is_locked_tcp_sock before dereferencing sock_ops.sk.

The rtt_min case is different. Because it reads a subfield of
struct minmax, it uses a custom open-coded load sequence instead of the
usual helper macro, and that sequence currently dereferences sock_ops.sk
without checking is_locked_tcp_sock first.

This is unsafe when sock_ops.sk points to a request_sock-backed object
instead of a locked full tcp_sock. That is reachable not only from the
SYNACK header option callbacks, but also from other request_sock-backed
sock_ops callbacks such as BPF_SOCK_OPS_TIMEOUT_INIT,
BPF_SOCK_OPS_RWND_INIT, and BPF_SOCK_OPS_NEEDS_ECN. In those cases,
reading ctx->rtt_min makes the generated code treat a request_sock as a
tcp_sock and read beyond the end of the request_sock allocation.

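Fix the rtt_min conversion by adding the same is_locked_tcp_sock guard
used for the other tcp_sock field reads. Also make the accessed subfield
explicit by using offsetof(struct minmax_sample, v). The load offset is
unchanged: v immediately follows t in struct minmax_sample, so this equals
the sizeof_field(struct minmax_sample, t) used by the old code.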

Add a selftest that verifies request_sock-backed sock_ops callbacks see
ctx->rtt_min as zero after the fix.

Found via AST-based call-graph analysis using sqry.

Fixes: 44f0e43037d3 ("bpf: Add support for reading sk_state and more")
Cc: stable@vger.kernel.org
Signed-off-by: Werner Kasselman <werner@verivus.com>
---
 net/core/filter.c                             | 53 +++++++++++++++----
 .../selftests/bpf/prog_tests/tcpbpf_user.c    |  9 ++++
 .../selftests/bpf/progs/test_tcpbpf_kern.c    | 21 ++++++++
 tools/testing/selftests/bpf/test_tcpbpf.h     |  6 +++
 4 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 78b548158..5040bf7e4 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -10827,16 +10827,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
 	case offsetof(struct bpf_sock_ops, rtt_min):
 		BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
 			     sizeof(struct minmax));
-		BUILD_BUG_ON(sizeof(struct minmax) <
-			     sizeof(struct minmax_sample));
-
-		*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
-						struct bpf_sock_ops_kern, sk),
-				      si->dst_reg, si->src_reg,
-				      offsetof(struct bpf_sock_ops_kern, sk));
-		*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
-				      offsetof(struct tcp_sock, rtt_min) +
-				      sizeof_field(struct minmax_sample, t));
+		BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, rtt_min) !=
+			     sizeof_field(struct minmax_sample, v));
+		off = offsetof(struct tcp_sock, rtt_min) +
+		      offsetof(struct minmax_sample, v);
+
+		{
+			int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;
+
+			if (si->dst_reg == reg || si->src_reg == reg)
+				reg--;
+			if (si->dst_reg == reg || si->src_reg == reg)
+				reg--;
+			if (si->dst_reg == si->src_reg) {
+				*insn++ = BPF_STX_MEM(BPF_DW, si->src_reg, reg,
+						      offsetof(struct bpf_sock_ops_kern,
+							       temp));
+				fullsock_reg = reg;
+				jmp += 2;
+			}
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+							struct bpf_sock_ops_kern,
+							is_locked_tcp_sock),
+					      fullsock_reg, si->src_reg,
+					      offsetof(struct bpf_sock_ops_kern,
+						       is_locked_tcp_sock));
+			*insn++ = BPF_JMP_IMM(BPF_JEQ, fullsock_reg, 0, jmp);
+			if (si->dst_reg == si->src_reg)
+				*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,
+						      offsetof(struct bpf_sock_ops_kern,
+							       temp));
+			*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+							struct bpf_sock_ops_kern, sk),
+					      si->dst_reg, si->src_reg,
+					      offsetof(struct bpf_sock_ops_kern, sk));
+			*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
+					      off);
+			if (si->dst_reg == si->src_reg) {
+				*insn++ = BPF_JMP_A(1);
+				*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->src_reg,
+						      offsetof(struct bpf_sock_ops_kern,
+							       temp));
+			}
+		}
 		break;
 
 	case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
diff --git a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
index 7e8fe1bad..d243d6713 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcpbpf_user.c
@@ -42,6 +42,15 @@ static void verify_result(struct tcpbpf_globals *result)
 	/* check getsockopt for window_clamp */
 	ASSERT_EQ(result->window_clamp_client, 9216, "window_clamp_client");
 	ASSERT_EQ(result->window_clamp_server, 9216, "window_clamp_server");
+
+	ASSERT_EQ(result->timeout_init_req_seen, 1, "timeout_init_req_seen");
+	ASSERT_EQ(result->timeout_init_req_rtt_min, 0, "timeout_init_req_rtt_min");
+
+	ASSERT_EQ(result->rwnd_init_req_seen, 1, "rwnd_init_req_seen");
+	ASSERT_EQ(result->rwnd_init_req_rtt_min, 0, "rwnd_init_req_rtt_min");
+
+	ASSERT_EQ(result->needs_ecn_req_seen, 1, "needs_ecn_req_seen");
+	ASSERT_EQ(result->needs_ecn_req_rtt_min, 0, "needs_ecn_req_rtt_min");
 }
 
 static void run_test(struct tcpbpf_globals *result)
diff --git a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
index 6935f32ee..79757a19b 100644
--- a/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
+++ b/tools/testing/selftests/bpf/progs/test_tcpbpf_kern.c
@@ -85,6 +85,27 @@ int bpf_testcb(struct bpf_sock_ops *skops)
 	global.event_map |= (1 << op);
 
 	switch (op) {
+	case BPF_SOCK_OPS_TIMEOUT_INIT:
+		if (!skops->is_fullsock) {
+			global.timeout_init_req_seen = 1;
+			global.timeout_init_req_rtt_min = skops->rtt_min;
+		}
+		rv = -1;
+		break;
+	case BPF_SOCK_OPS_RWND_INIT:
+		if (!skops->is_fullsock) {
+			global.rwnd_init_req_seen = 1;
+			global.rwnd_init_req_rtt_min = skops->rtt_min;
+		}
+		rv = 0;
+		break;
+	case BPF_SOCK_OPS_NEEDS_ECN:
+		if (!skops->is_fullsock) {
+			global.needs_ecn_req_seen = 1;
+			global.needs_ecn_req_rtt_min = skops->rtt_min;
+		}
+		rv = 0;
+		break;
 	case BPF_SOCK_OPS_TCP_CONNECT_CB:
 		rv = bpf_setsockopt(skops, SOL_TCP, TCP_WINDOW_CLAMP,
 				    &window_clamp, sizeof(window_clamp));
diff --git a/tools/testing/selftests/bpf/test_tcpbpf.h b/tools/testing/selftests/bpf/test_tcpbpf.h
index 9dd9b5590..46500c1d6 100644
--- a/tools/testing/selftests/bpf/test_tcpbpf.h
+++ b/tools/testing/selftests/bpf/test_tcpbpf.h
@@ -18,5 +18,11 @@ struct tcpbpf_globals {
 	__u32 tcp_saved_syn;
 	__u32 window_clamp_client;
 	__u32 window_clamp_server;
+	__u32 timeout_init_req_seen;
+	__u32 timeout_init_req_rtt_min;
+	__u32 rwnd_init_req_seen;
+	__u32 rwnd_init_req_rtt_min;
+	__u32 needs_ecn_req_seen;
+	__u32 needs_ecn_req_rtt_min;
 };
 #endif
-- 
2.43.0

Re: [PATCH v2] bpf: guard sock_ops rtt_min access with is_locked_tcp_sock
Posted by Alexei Starovoitov 2 months, 1 week ago
On Wed, Apr 8, 2026 at 11:10 PM Werner Kasselman <werner@verivus.ai> wrote:
>
> sock_ops_convert_ctx_access() emits guarded reads for tcp_sock-backed
> bpf_sock_ops fields such as snd_cwnd, srtt_us, snd_ssthresh, rcv_nxt,
> snd_nxt, snd_una, mss_cache, ecn_flags, rate_delivered, and
> rate_interval_us. Those accesses go through SOCK_OPS_GET_TCP_SOCK_FIELD(),
> which checks is_locked_tcp_sock before dereferencing sock_ops.sk.
>
> The rtt_min case is different. Because it reads a subfield of
> struct minmax, it uses a custom open-coded load sequence instead of the
> usual helper macro, and that sequence currently dereferences sock_ops.sk
> without checking is_locked_tcp_sock first.
>
> This is unsafe when sock_ops.sk points to a request_sock-backed object
> instead of a locked full tcp_sock. That is reachable not only from the
> SYNACK header option callbacks, but also from other request_sock-backed
> sock_ops callbacks such as BPF_SOCK_OPS_TIMEOUT_INIT,
> BPF_SOCK_OPS_RWND_INIT, and BPF_SOCK_OPS_NEEDS_ECN. In those cases,
> reading ctx->rtt_min makes the generated code treat a request_sock as a
> tcp_sock and read beyond the end of the request_sock allocation.
>
> Fix the rtt_min conversion by adding the same is_locked_tcp_sock guard
> used for the other tcp_sock field reads. Also make the accessed subfield
> explicit by using offsetof(struct minmax_sample, v).
>
> Add a selftest that verifies request_sock-backed sock_ops callbacks see
> ctx->rtt_min as zero after the fix.
>
> Found via AST-based call-graph analysis using sqry.
>
> Fixes: 44f0e43037d3 ("bpf: Add support for reading sk_state and more")
> Cc: stable@vger.kernel.org
> Signed-off-by: Werner Kasselman <werner@verivus.com>
> ---
>  net/core/filter.c                             | 53 +++++++++++++++----
>  .../selftests/bpf/prog_tests/tcpbpf_user.c    |  9 ++++
>  .../selftests/bpf/progs/test_tcpbpf_kern.c    | 21 ++++++++
>  tools/testing/selftests/bpf/test_tcpbpf.h     |  6 +++
>  4 files changed, 79 insertions(+), 10 deletions(-)
>
> diff --git a/net/core/filter.c b/net/core/filter.c
> index 78b548158..5040bf7e4 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -10827,16 +10827,49 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
>         case offsetof(struct bpf_sock_ops, rtt_min):
>                 BUILD_BUG_ON(sizeof_field(struct tcp_sock, rtt_min) !=
>                              sizeof(struct minmax));
> -               BUILD_BUG_ON(sizeof(struct minmax) <
> -                            sizeof(struct minmax_sample));
> -
> -               *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
> -                                               struct bpf_sock_ops_kern, sk),
> -                                     si->dst_reg, si->src_reg,
> -                                     offsetof(struct bpf_sock_ops_kern, sk));
> -               *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg,
> -                                     offsetof(struct tcp_sock, rtt_min) +
> -                                     sizeof_field(struct minmax_sample, t));
> +               BUILD_BUG_ON(sizeof_field(struct bpf_sock_ops, rtt_min) !=
> +                            sizeof_field(struct minmax_sample, v));
> +               off = offsetof(struct tcp_sock, rtt_min) +
> +                     offsetof(struct minmax_sample, v);
> +
> +               {
> +                       int fullsock_reg = si->dst_reg, reg = BPF_REG_9, jmp = 2;
> +

please de-claude your patches before posting.

pw-bot: cr