Currently mptcp_cleanup_rbuf() makes a significant effort to avoid
acquiring the subflow socket lock, estimating if the tcp level
cleanup could actually send an ack.
Such estimate is a bit rough when accounting for receive window
change, as it consider the msk available buffer space instead
of the announced mptcp-level window.
Let's consider the announced window instead, mirroring closely
the plain TCP implementation. We need to lockless access a bunch
of additional, tcp fields.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
include/net/mptcp.h | 2 ++
net/ipv4/tcp.c | 3 +++
net/mptcp/options.c | 1 -
net/mptcp/protocol.c | 14 +++++++++++---
net/mptcp/protocol.h | 6 +++++-
5 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index 7af7fd48acc7..4b6c66b73bf4 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -136,6 +136,7 @@ static inline bool rsk_drop_req(const struct request_sock *req)
return tcp_rsk(req)->is_mptcp && tcp_rsk(req)->drop_req;
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd);
void mptcp_space(const struct sock *ssk, int *space, int *full_space);
bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
unsigned int *size, struct mptcp_out_options *opts);
@@ -284,6 +285,7 @@ static inline bool mptcp_skb_can_collapse(const struct sk_buff *to,
return true;
}
+static inline void mptcp_receive_window(const struct sock *ssk, int *rwnd) { }
static inline void mptcp_space(const struct sock *ssk, int *s, int *fs) { }
static inline void mptcp_seq_show(struct seq_file *seq) { }
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 970e9a2cca4a..9e2df51c0558 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1606,6 +1606,9 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied)
if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
__u32 rcv_window_now = tcp_receive_window(tp);
+ if (is_tcpsk(sk))
+ mptcp_receive_window(sk, &rcv_window_now);
+
/* Optimize, __tcp_select_window() is not cheap. */
if (2*rcv_window_now <= tp->window_clamp) {
__u32 new_window = __tcp_select_window(sk);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 30d289044e71..563ef8fe5a85 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -604,7 +604,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
}
opts->ext_copy.use_ack = 1;
opts->suboptions = OPTION_MPTCP_DSS;
- WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
/* Add kind/length/subtype/flag overhead if mapping is not populated */
if (dss_size == 0)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 5af3d591a20b..17e2dbe43639 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -535,6 +535,14 @@ static void mptcp_subflow_cleanup_rbuf(struct sock *ssk)
unlock_sock_fast(ssk, slow);
}
+void mptcp_receive_window(const struct sock *ssk, int *rwnd)
+{
+ const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ const struct sock *sk = subflow->conn;
+
+ *rwnd = __mptcp_receive_window(mptcp_sk(sk));
+}
+
static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
{
const struct inet_connection_sock *icsk = inet_csk(ssk);
@@ -550,13 +558,13 @@ static bool mptcp_subflow_could_cleanup(const struct sock *ssk, bool rx_empty)
static void mptcp_cleanup_rbuf(struct mptcp_sock *msk)
{
- int old_space = READ_ONCE(msk->old_wspace);
+ int cur_window = __mptcp_receive_window(msk);
struct mptcp_subflow_context *subflow;
struct sock *sk = (struct sock *)msk;
- int space = __mptcp_space(sk);
+ int new_window = __mptcp_space(sk);
bool cleanup, rx_empty;
- cleanup = (space > 0) && (space >= (old_space << 1));
+ cleanup = new_window > 0 && new_window >= (cur_window << 1);
rx_empty = !__mptcp_rmem(sk);
mptcp_for_each_subflow(msk, subflow) {
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 1bc9f6e77ddd..a54f42462a71 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -260,7 +260,6 @@ struct mptcp_sock {
int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
- int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
* recovery related fields are under data_lock
* protection
@@ -336,6 +335,11 @@ static inline int __mptcp_rmem(const struct sock *sk)
return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released);
}
+static inline int __mptcp_receive_window(const struct mptcp_sock *msk)
+{
+ return atomic64_read(&msk->rcv_wnd_sent) - msk->ack_seq;
+}
+
static inline int __mptcp_space(const struct sock *sk)
{
return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk));
--
2.35.3
Hi Paolo,
I love your patch! Yet something to improve:
[auto build test ERROR on mptcp/export]
[also build test ERROR on linus/master v5.19-rc8 next-20220728]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
base: https://github.com/multipath-tcp/mptcp_net-next.git export
config: hexagon-randconfig-r045-20220729 (https://download.01.org/0day-ci/archive/20220730/202207300238.2ZcAhQUi-lkp@intel.com/config)
compiler: clang version 15.0.0 (https://github.com/llvm/llvm-project 8dfaecc4c24494337933aff9d9166486ca0949f1)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/intel-lab-lkp/linux/commit/c330a583c6d306ab637d187f0f981bfe4408caba
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
git checkout c330a583c6d306ab637d187f0f981bfe4408caba
# save the config file
mkdir build_dir && cp config build_dir/.config
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 O=build_dir ARCH=hexagon SHELL=/bin/bash net/ipv4/ net/mptcp/
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
>> net/ipv4/tcp.c:1609:7: error: call to undeclared function 'is_tcpsk'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
if (is_tcpsk(sk))
^
1 error generated.
vim +/is_tcpsk +1609 net/ipv4/tcp.c
1563
1564 /* Clean up the receive buffer for full frames taken by the user,
1565 * then send an ACK if necessary. COPIED is the number of bytes
1566 * tcp_recvmsg has given to the user so far, it speeds up the
1567 * calculation of whether or not we must ACK for the sake of
1568 * a window update.
1569 */
1570 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1571 {
1572 struct tcp_sock *tp = tcp_sk(sk);
1573 bool time_to_ack = false;
1574
1575 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1576
1577 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1578 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1579 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1580
1581 if (inet_csk_ack_scheduled(sk)) {
1582 const struct inet_connection_sock *icsk = inet_csk(sk);
1583
1584 if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
1585 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1586 /*
1587 * If this read emptied read buffer, we send ACK, if
1588 * connection is not bidirectional, user drained
1589 * receive buffer and there was a small segment
1590 * in queue.
1591 */
1592 (copied > 0 &&
1593 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1594 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1595 !inet_csk_in_pingpong_mode(sk))) &&
1596 !atomic_read(&sk->sk_rmem_alloc)))
1597 time_to_ack = true;
1598 }
1599
1600 /* We send an ACK if we can now advertise a non-zero window
1601 * which has been raised "significantly".
1602 *
1603 * Even if window raised up to infinity, do not send window open ACK
1604 * in states, where we will not receive more. It is useless.
1605 */
1606 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1607 __u32 rcv_window_now = tcp_receive_window(tp);
1608
> 1609 if (is_tcpsk(sk))
1610 mptcp_receive_window(sk, &rcv_window_now);
1611
1612 /* Optimize, __tcp_select_window() is not cheap. */
1613 if (2*rcv_window_now <= tp->window_clamp) {
1614 __u32 new_window = __tcp_select_window(sk);
1615
1616 /* Send ACK now, if this read freed lots of space
1617 * in our buffer. Certainly, new_window is new window.
1618 * We can advertise it now, if it is not less than current one.
1619 * "Lots" means "at least twice" here.
1620 */
1621 if (new_window && new_window >= 2 * rcv_window_now)
1622 time_to_ack = true;
1623 }
1624 }
1625 if (time_to_ack)
1626 tcp_send_ack(sk);
1627 }
1628
--
0-DAY CI Kernel Test Service
https://01.org/lkp
Hi Paolo,
I love your patch! Yet something to improve:
[auto build test ERROR on mptcp/export]
[also build test ERROR on linus/master v5.19-rc8 next-20220728]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
base: https://github.com/multipath-tcp/mptcp_net-next.git export
config: um-i386_defconfig (https://download.01.org/0day-ci/archive/20220730/202207300227.kYan1H1K-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-3) 11.3.0
reproduce (this is a W=1 build):
# https://github.com/intel-lab-lkp/linux/commit/c330a583c6d306ab637d187f0f981bfe4408caba
git remote add linux-review https://github.com/intel-lab-lkp/linux
git fetch --no-tags linux-review Paolo-Abeni/mptcp-just-another-receive-path-refactor/20220729-233501
git checkout c330a583c6d306ab637d187f0f981bfe4408caba
# save the config file
mkdir build_dir && cp config build_dir/.config
make W=1 O=build_dir ARCH=um SUBARCH=i386 SHELL=/bin/bash
If you fix the issue, kindly add following tag where applicable
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
net/ipv4/tcp.c: In function 'tcp_cleanup_rbuf':
>> net/ipv4/tcp.c:1609:21: error: implicit declaration of function 'is_tcpsk' [-Werror=implicit-function-declaration]
1609 | if (is_tcpsk(sk))
| ^~~~~~~~
cc1: some warnings being treated as errors
vim +/is_tcpsk +1609 net/ipv4/tcp.c
1563
1564 /* Clean up the receive buffer for full frames taken by the user,
1565 * then send an ACK if necessary. COPIED is the number of bytes
1566 * tcp_recvmsg has given to the user so far, it speeds up the
1567 * calculation of whether or not we must ACK for the sake of
1568 * a window update.
1569 */
1570 void tcp_cleanup_rbuf(struct sock *sk, int copied)
1571 {
1572 struct tcp_sock *tp = tcp_sk(sk);
1573 bool time_to_ack = false;
1574
1575 struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
1576
1577 WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
1578 "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
1579 tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
1580
1581 if (inet_csk_ack_scheduled(sk)) {
1582 const struct inet_connection_sock *icsk = inet_csk(sk);
1583
1584 if (/* Once-per-two-segments ACK was not sent by tcp_input.c */
1585 tp->rcv_nxt - tp->rcv_wup > icsk->icsk_ack.rcv_mss ||
1586 /*
1587 * If this read emptied read buffer, we send ACK, if
1588 * connection is not bidirectional, user drained
1589 * receive buffer and there was a small segment
1590 * in queue.
1591 */
1592 (copied > 0 &&
1593 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED2) ||
1594 ((icsk->icsk_ack.pending & ICSK_ACK_PUSHED) &&
1595 !inet_csk_in_pingpong_mode(sk))) &&
1596 !atomic_read(&sk->sk_rmem_alloc)))
1597 time_to_ack = true;
1598 }
1599
1600 /* We send an ACK if we can now advertise a non-zero window
1601 * which has been raised "significantly".
1602 *
1603 * Even if window raised up to infinity, do not send window open ACK
1604 * in states, where we will not receive more. It is useless.
1605 */
1606 if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
1607 __u32 rcv_window_now = tcp_receive_window(tp);
1608
> 1609 if (is_tcpsk(sk))
1610 mptcp_receive_window(sk, &rcv_window_now);
1611
1612 /* Optimize, __tcp_select_window() is not cheap. */
1613 if (2*rcv_window_now <= tp->window_clamp) {
1614 __u32 new_window = __tcp_select_window(sk);
1615
1616 /* Send ACK now, if this read freed lots of space
1617 * in our buffer. Certainly, new_window is new window.
1618 * We can advertise it now, if it is not less than current one.
1619 * "Lots" means "at least twice" here.
1620 */
1621 if (new_window && new_window >= 2 * rcv_window_now)
1622 time_to_ack = true;
1623 }
1624 }
1625 if (time_to_ack)
1626 tcp_send_ack(sk);
1627 }
1628
--
0-DAY CI Kernel Test Service
https://01.org/lkp
© 2016 - 2026 Red Hat, Inc.