From: Wesley Atwell <atwellwea@gmail.com>
Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
more sender-visible window than the current hard receive-memory backing
can cover.
The new helper keeps backlog and memory-pressure limits in the same
units as the rest of the receive path, while __tcp_select_window()
backs any rounding slack before advertising it.
Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
---
include/net/tcp.h | 12 ++++++++++++
net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++--
net/ipv4/tcp_output.c | 15 +++++++++++++--
3 files changed, 59 insertions(+), 4 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index fc22ab6b80d5..5b479ad44f89 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
void tcp_rcv_space_adjust(struct sock *sk);
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
void tcp_twsk_destructor(struct sock *sk);
@@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
}
+/* Passive children clone the listener's sk_socket until accept() grafts
+ * their own struct socket, so only sockets that point back to themselves
+ * should autotune receive-buffer backing.
+ */
+static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
+{
+ struct socket *sock = READ_ONCE(sk->sk_socket);
+
+ return sock && READ_ONCE(sock->sk) == sk;
+}
+
/* Note: caller must be prepared to deal with negative returns */
static inline int tcp_space(const struct sock *sk)
{
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 352f814a4ff6..32256519a085 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
(u32)TCP_INIT_CWND * tp->advmss);
}
+/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
+ * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
+ * already consumed by sk_backlog.len.
+ */
+bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
+{
+ struct net *net = sock_net(sk);
+ int backlog;
+ int rmem2;
+ int target;
+
+ needed = max(needed, 0);
+ backlog = READ_ONCE(sk->sk_backlog.len);
+ target = tcp_rmem_used(sk) + backlog + needed;
+
+ if (target <= READ_ONCE(sk->sk_rcvbuf))
+ return true;
+
+ rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+ if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
+ (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
+ tcp_under_memory_pressure(sk) ||
+ sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
+ return false;
+
+ WRITE_ONCE(sk->sk_rcvbuf,
+ min_t(int, rmem2,
+ max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
+
+ return target <= READ_ONCE(sk->sk_rcvbuf);
+}
+
/* 4. Recalculate window clamp after socket hit its memory bounds. */
static void tcp_clamp_window(struct sock *sk)
{
@@ -785,14 +817,14 @@ static void tcp_clamp_window(struct sock *sk)
icsk->icsk_ack.quick = 0;
rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
- if (sk->sk_rcvbuf < rmem2 &&
+ if (READ_ONCE(sk->sk_rcvbuf) < rmem2 &&
!(sk->sk_userlocks & SOCK_RCVBUF_LOCK) &&
!tcp_under_memory_pressure(sk) &&
sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)) {
WRITE_ONCE(sk->sk_rcvbuf,
min(atomic_read(&sk->sk_rmem_alloc), rmem2));
}
- if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf)
+ if (atomic_read(&sk->sk_rmem_alloc) > READ_ONCE(sk->sk_rcvbuf))
tp->rcv_ssthresh = min(tp->window_clamp, 2U * tp->advmss);
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 57a2a6daaad3..53781cf591d2 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
* scaled window will not line up with the MSS boundary anyway.
*/
if (tp->rx_opt.rcv_wscale) {
+ int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
+
window = free_space;
/* Advertise enough space so that it won't get scaled away.
- * Import case: prevent zero window announcement if
+ * Important case: prevent zero-window announcement if
* 1<<rcv_wscale > mss.
*/
- window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
+ window = ALIGN(window, rcv_wscale);
+
+ /* Back any scale-quantization slack before we expose it.
+ * Otherwise tcp_can_ingest() can reject data which is still
+ * within the sender-visible window.
+ */
+ if (window > free_space &&
+ (!tcp_rcvbuf_grow_allowed(sk) ||
+ !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
+ window = round_down(free_space, rcv_wscale);
} else {
window = tp->rcv_wnd;
/* Get the largest window that is a nice multiple of mss.
--
2.43.0
On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
> index 57a2a6daaad3..53781cf591d2 100644
> --- a/net/ipv4/tcp_output.c
> +++ b/net/ipv4/tcp_output.c
> @@ -3375,13 +3375,24 @@ u32 __tcp_select_window(struct sock *sk)
> * scaled window will not line up with the MSS boundary anyway.
> */
> if (tp->rx_opt.rcv_wscale) {
> + int rcv_wscale = 1 << tp->rx_opt.rcv_wscale;
> +
> window = free_space;
>
> /* Advertise enough space so that it won't get scaled away.
> - * Import case: prevent zero window announcement if
> + * Important case: prevent zero-window announcement if
> * 1<<rcv_wscale > mss.
> */
> - window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale));
> + window = ALIGN(window, rcv_wscale);
> +
> + /* Back any scale-quantization slack before we expose it.
> + * Otherwise tcp_can_ingest() can reject data which is still
> + * within the sender-visible window.
> + */
> + if (window > free_space &&
> + (!tcp_rcvbuf_grow_allowed(sk) ||
> + !tcp_try_grow_rcvbuf(sk, tcp_space_from_win(sk, window))))
> + window = round_down(free_space, rcv_wscale);
It looks like this can cause the advertised window to shrink even if we
are in the 'do not allow window to shrink' branch.
Also, why is the other branch (shrinking allowed) not touched?
/P
On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
>
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
>
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
>
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
> include/net/tcp.h | 12 ++++++++++++
> net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++--
> net/ipv4/tcp_output.c | 15 +++++++++++++--
> 3 files changed, 59 insertions(+), 4 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
> enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
> void tcp_rcv_space_adjust(struct sock *sk);
> int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
> void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
> return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
> }
>
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket,
AFAICS, the above statement is false; see sk_set_socket() in sk_clone()
> so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> + struct socket *sock = READ_ONCE(sk->sk_socket);
You can just check `sk->sk_socket`. Also you could re-use this helper in
tcp_data_queue_ofo().
/P
On 3/14/26 9:13 PM, atwellwea@gmail.com wrote:
> From: Wesley Atwell <atwellwea@gmail.com>
>
> Teach TCP to grow sk_rcvbuf when scale rounding would otherwise expose
> more sender-visible window than the current hard receive-memory backing
> can cover.
>
> The new helper keeps backlog and memory-pressure limits in the same
> units as the rest of the receive path, while __tcp_select_window()
> backs any rounding slack before advertising it.
>
> Signed-off-by: Wesley Atwell <atwellwea@gmail.com>
> ---
> include/net/tcp.h | 12 ++++++++++++
> net/ipv4/tcp_input.c | 36 ++++++++++++++++++++++++++++++++++--
> net/ipv4/tcp_output.c | 15 +++++++++++++--
> 3 files changed, 59 insertions(+), 4 deletions(-)
>
> diff --git a/include/net/tcp.h b/include/net/tcp.h
> index fc22ab6b80d5..5b479ad44f89 100644
> --- a/include/net/tcp.h
> +++ b/include/net/tcp.h
> @@ -397,6 +397,7 @@ int tcp_ioctl(struct sock *sk, int cmd, int *karg);
> enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
> void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
> void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed);
> void tcp_rcv_space_adjust(struct sock *sk);
> int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
> void tcp_twsk_destructor(struct sock *sk);
> @@ -1844,6 +1845,17 @@ static inline int tcp_rwnd_avail(const struct sock *sk)
> return tcp_rmem_avail(sk) - READ_ONCE(sk->sk_backlog.len);
> }
>
> +/* Passive children clone the listener's sk_socket until accept() grafts
> + * their own struct socket, so only sockets that point back to themselves
> + * should autotune receive-buffer backing.
> + */
> +static inline bool tcp_rcvbuf_grow_allowed(const struct sock *sk)
> +{
> + struct socket *sock = READ_ONCE(sk->sk_socket);
> +
> + return sock && READ_ONCE(sock->sk) == sk;
This is executed under the sk socket lock, so the ONCE annotation is not needed.
> +}
> +
> /* Note: caller must be prepared to deal with negative returns */
> static inline int tcp_space(const struct sock *sk)
> {
> diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
> index 352f814a4ff6..32256519a085 100644
> --- a/net/ipv4/tcp_input.c
> +++ b/net/ipv4/tcp_input.c
> @@ -774,6 +774,38 @@ static void tcp_init_buffer_space(struct sock *sk)
> (u32)TCP_INIT_CWND * tp->advmss);
> }
>
> +/* Try to grow sk_rcvbuf so the hard receive-memory limit covers @needed
> + * bytes beyond sk_rmem_alloc while preserving sender-visible headroom
> + * already consumed by sk_backlog.len.
> + */
> +bool tcp_try_grow_rcvbuf(struct sock *sk, int needed)
> +{
> + struct net *net = sock_net(sk);
> + int backlog;
> + int rmem2;
> + int target;
> +
> + needed = max(needed, 0);
> + backlog = READ_ONCE(sk->sk_backlog.len);
> + target = tcp_rmem_used(sk) + backlog + needed;
> +
> + if (target <= READ_ONCE(sk->sk_rcvbuf))
> + return true;
> +
> + rmem2 = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
> + if (READ_ONCE(sk->sk_rcvbuf) >= rmem2 ||
> + (sk->sk_userlocks & SOCK_RCVBUF_LOCK) ||
> + tcp_under_memory_pressure(sk) ||
> + sk_memory_allocated(sk) >= sk_prot_mem_limits(sk, 0))
> + return false;
> +
> + WRITE_ONCE(sk->sk_rcvbuf,
> + min_t(int, rmem2,
> + max_t(int, READ_ONCE(sk->sk_rcvbuf), target)));
> +
> + return target <= READ_ONCE(sk->sk_rcvbuf);
Same here, and more cases below.
/P
© 2016 - 2026 Red Hat, Inc.