1 | This is a batch of changes I had sitting in my local tree for a while. | 1 | Paolo worked on this RX path refactor for these two main reasons: |
---|---|---|---|
2 | Why another refactor, you may ask? Two main reasons: | ||
3 | 2 | ||
4 | - currently the mptcp RX path introduces quite a bit of 'exceptional' | 3 | - Currently, the MPTCP RX path introduces quite a bit of 'exceptional' |
5 | accounting/locking processing WRT to plain TCP, adding up to the | 4 | accounting/locking processing WRT to plain TCP, adding up to the |
6 | implementation complexity in a measurable way | 5 | implementation complexity in a miserable way. |
7 | - the performance gap WRT plain TCP for single subflow connections is | 6 | |
8 | quite measurable. | 7 | - The performance gap WRT plain TCP for single subflow connections is |
8 | quite measurable. | ||
9 | 9 | ||
10 | The present refactor addresses both the above items: most of the | 10 | The present refactor addresses both the above items: most of the |
11 | additional complexity is dropped, and single stream performances | 11 | additional complexity is dropped, and single stream performances |
12 | increase measurably - from 55Gbps to 71Gbps in my loopback test. As a | 12 | increase measurably, from 55Gbps to 71Gbps in Paolo's loopback test. As |
13 | reference, plain TCP is around 84Gbps on the same host. | 13 | a reference, plain TCP was around 84Gbps on the same host. |
14 | 14 | ||
15 | The above comes at a price: the patches are invasive, even in subtle ways: | 15 | The above comes at a price: the patches are invasive, even in subtle ways. |
16 | the chance of destabilizing the implementation is real (hence the | ||
17 | additional, intentional '-next' in the subj). | ||
18 | 16 | ||
19 | In any case keeping the patch hidden for longer was not going to do any | 17 | Note: patch 5/7 removes the sk_forward_alloc_get() helper, which caused |
20 | good, so here we are. | 18 | some trivial modifications in different places in the net tree: sockets, |
19 | IPv4, sched. That's why a few more people have been Cc here. Feel free | ||
20 | to only look at this patch 5/7. | ||
21 | 21 | ||
22 | Paolo Abeni (3): | 22 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> |
23 | mptcp: consolidate subflow cleanup | 23 | --- |
24 | mptcp: move the whole rx path under msk socket lock protection | 24 | Paolo Abeni (7): |
25 | mptcp: cleanup mem accounting. | 25 | mptcp: consolidate subflow cleanup |
26 | mptcp: drop __mptcp_fastopen_gen_msk_ackseq() | ||
27 | mptcp: move the whole rx path under msk socket lock protection | ||
28 | mptcp: cleanup mem accounting | ||
29 | net: dismiss sk_forward_alloc_get() | ||
30 | mptcp: dismiss __mptcp_rmem() | ||
31 | mptcp: micro-optimize __mptcp_move_skb() | ||
26 | 32 | ||
27 | net/mptcp/fastopen.c | 4 +- | 33 | include/net/sock.h | 13 --- |
28 | net/mptcp/protocol.c | 202 +++++++++---------------------------------- | 34 | net/core/sock.c | 2 +- |
29 | net/mptcp/protocol.h | 6 +- | 35 | net/ipv4/af_inet.c | 2 +- |
30 | net/mptcp/subflow.c | 33 +++---- | 36 | net/ipv4/inet_diag.c | 2 +- |
31 | 4 files changed, 66 insertions(+), 179 deletions(-) | 37 | net/mptcp/fastopen.c | 27 +---- |
38 | net/mptcp/protocol.c | 317 ++++++++++++++++----------------------------------- | ||
39 | net/mptcp/protocol.h | 22 ++-- | ||
40 | net/mptcp/subflow.c | 36 +++--- | ||
41 | net/sched/em_meta.c | 2 +- | ||
42 | 9 files changed, 134 insertions(+), 289 deletions(-) | ||
43 | --- | ||
44 | base-commit: b4cb730862cf4f59ac3dcb83b9ac4eeb29dbfb0e | ||
45 | change-id: 20250106-net-next-mptcp-rx-path-refactor-f44579efb57c | ||
32 | 46 | ||
47 | Best regards, | ||
33 | -- | 48 | -- |
34 | 2.45.2 | 49 | Matthieu Baerts (NGI0) <matttbe@kernel.org> |
1 | Consolidate all the cleanup actions requiring the worker in a single | 1 | From: Paolo Abeni <pabeni@redhat.com> |
---|---|---|---|
2 | |||
3 | Consolidate all the cleanup actions requiring the worker in a single | ||
2 | helper and ensure the dummy data fin creation for fallback socket is | 4 | helper and ensure the dummy data fin creation for fallback socket is |
3 | performed only when the tcp rx queue is empty. | 5 | performed only when the tcp rx queue is empty. |
4 | 6 | ||
5 | There are no functional changes intended, but this will simplify the | 7 | There are no functional changes intended, but this will simplify the |
6 | next patch, when the tcp rx queue spooling could be delayed at release_cb | 8 | next patch, when the tcp rx queue spooling could be delayed at release_cb |
7 | time. | 9 | time. |
8 | 10 | ||
9 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | 11 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> |
12 | Reviewed-by: Mat Martineau <martineau@kernel.org> | ||
13 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
10 | --- | 14 | --- |
11 | net/mptcp/subflow.c | 33 ++++++++++++++++++--------------- | 15 | net/mptcp/subflow.c | 33 ++++++++++++++++++--------------- |
12 | 1 file changed, 18 insertions(+), 15 deletions(-) | 16 | 1 file changed, 18 insertions(+), 15 deletions(-) |
13 | 17 | ||
14 | diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c | 18 | diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c |
... | ... | ||
74 | - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) | 78 | - mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true)) |
75 | - mptcp_schedule_work(parent); | 79 | - mptcp_schedule_work(parent); |
76 | } | 80 | } |
77 | 81 | ||
78 | void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) | 82 | void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) |
83 | |||
79 | -- | 84 | -- |
80 | 2.45.2 | 85 | 2.47.1 |
New patch | |||
---|---|---|---|
1 | From: Paolo Abeni <pabeni@redhat.com> | ||
1 | 2 | ||
3 | When we will move the whole RX path under the msk socket lock, updating | ||
4 | the already queued skb for passive fastopen socket at 3rd ack time will | ||
5 | be extremely painful and race prone | ||
6 | |||
7 | The map_seq for already enqueued skbs is used only to allow correct | ||
8 | coalescing with later data; by preventing collapsing to the first skb of | ||
9 | a fastopen connect, we can completely remove the | ||
10 | __mptcp_fastopen_gen_msk_ackseq() helper. | ||
11 | |||
12 | Before dropping this helper, a new item had to be added to the | ||
13 | mptcp_skb_cb structure. Because this item will be frequently tested in | ||
14 | the fast path -- almost on every packet -- and because there is free | ||
15 | space there, a single byte is used instead of a bitfield. This micro | ||
16 | optimisation slightly reduces the number of CPU operations to do the | ||
17 | associated check. | ||
18 | |||
19 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||
20 | Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
21 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
22 | --- | ||
23 | net/mptcp/fastopen.c | 24 ++---------------------- | ||
24 | net/mptcp/protocol.c | 4 +++- | ||
25 | net/mptcp/protocol.h | 5 ++--- | ||
26 | net/mptcp/subflow.c | 3 --- | ||
27 | 4 files changed, 7 insertions(+), 29 deletions(-) | ||
28 | |||
29 | diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c | ||
30 | index XXXXXXX..XXXXXXX 100644 | ||
31 | --- a/net/mptcp/fastopen.c | ||
32 | +++ b/net/mptcp/fastopen.c | ||
33 | @@ -XXX,XX +XXX,XX @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf | ||
34 | tp->copied_seq += skb->len; | ||
35 | subflow->ssn_offset += skb->len; | ||
36 | |||
37 | - /* initialize a dummy sequence number, we will update it at MPC | ||
38 | - * completion, if needed | ||
39 | - */ | ||
40 | + /* Only the sequence delta is relevant */ | ||
41 | MPTCP_SKB_CB(skb)->map_seq = -skb->len; | ||
42 | MPTCP_SKB_CB(skb)->end_seq = 0; | ||
43 | MPTCP_SKB_CB(skb)->offset = 0; | ||
44 | MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; | ||
45 | + MPTCP_SKB_CB(skb)->cant_coalesce = 1; | ||
46 | |||
47 | mptcp_data_lock(sk); | ||
48 | |||
49 | @@ -XXX,XX +XXX,XX @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf | ||
50 | |||
51 | mptcp_data_unlock(sk); | ||
52 | } | ||
53 | - | ||
54 | -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, | ||
55 | - const struct mptcp_options_received *mp_opt) | ||
56 | -{ | ||
57 | - struct sock *sk = (struct sock *)msk; | ||
58 | - struct sk_buff *skb; | ||
59 | - | ||
60 | - skb = skb_peek_tail(&sk->sk_receive_queue); | ||
61 | - if (skb) { | ||
62 | - WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); | ||
63 | - pr_debug("msk %p moving seq %llx -> %llx end_seq %llx -> %llx\n", sk, | ||
64 | - MPTCP_SKB_CB(skb)->map_seq, MPTCP_SKB_CB(skb)->map_seq + msk->ack_seq, | ||
65 | - MPTCP_SKB_CB(skb)->end_seq, MPTCP_SKB_CB(skb)->end_seq + msk->ack_seq); | ||
66 | - MPTCP_SKB_CB(skb)->map_seq += msk->ack_seq; | ||
67 | - MPTCP_SKB_CB(skb)->end_seq += msk->ack_seq; | ||
68 | - } | ||
69 | - | ||
70 | - pr_debug("msk=%p ack_seq=%llx\n", msk, msk->ack_seq); | ||
71 | -} | ||
72 | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c | ||
73 | index XXXXXXX..XXXXXXX 100644 | ||
74 | --- a/net/mptcp/protocol.c | ||
75 | +++ b/net/mptcp/protocol.c | ||
76 | @@ -XXX,XX +XXX,XX @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, | ||
77 | bool fragstolen; | ||
78 | int delta; | ||
79 | |||
80 | - if (MPTCP_SKB_CB(from)->offset || | ||
81 | + if (unlikely(MPTCP_SKB_CB(to)->cant_coalesce) || | ||
82 | + MPTCP_SKB_CB(from)->offset || | ||
83 | ((to->len + from->len) > (sk->sk_rcvbuf >> 3)) || | ||
84 | !skb_try_coalesce(to, from, &fragstolen, &delta)) | ||
85 | return false; | ||
86 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk, | ||
87 | MPTCP_SKB_CB(skb)->end_seq = MPTCP_SKB_CB(skb)->map_seq + copy_len; | ||
88 | MPTCP_SKB_CB(skb)->offset = offset; | ||
89 | MPTCP_SKB_CB(skb)->has_rxtstamp = has_rxtstamp; | ||
90 | + MPTCP_SKB_CB(skb)->cant_coalesce = 0; | ||
91 | |||
92 | if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) { | ||
93 | /* in sequence */ | ||
94 | diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h | ||
95 | index XXXXXXX..XXXXXXX 100644 | ||
96 | --- a/net/mptcp/protocol.h | ||
97 | +++ b/net/mptcp/protocol.h | ||
98 | @@ -XXX,XX +XXX,XX @@ struct mptcp_skb_cb { | ||
99 | u64 map_seq; | ||
100 | u64 end_seq; | ||
101 | u32 offset; | ||
102 | - u8 has_rxtstamp:1; | ||
103 | + u8 has_rxtstamp; | ||
104 | + u8 cant_coalesce; | ||
105 | }; | ||
106 | |||
107 | #define MPTCP_SKB_CB(__skb) ((struct mptcp_skb_cb *)&((__skb)->cb[0])) | ||
108 | @@ -XXX,XX +XXX,XX @@ void mptcp_event_pm_listener(const struct sock *ssk, | ||
109 | enum mptcp_event_type event); | ||
110 | bool mptcp_userspace_pm_active(const struct mptcp_sock *msk); | ||
111 | |||
112 | -void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflow_context *subflow, | ||
113 | - const struct mptcp_options_received *mp_opt); | ||
114 | void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, | ||
115 | struct request_sock *req); | ||
116 | int mptcp_nl_fill_addr(struct sk_buff *skb, | ||
117 | diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c | ||
118 | index XXXXXXX..XXXXXXX 100644 | ||
119 | --- a/net/mptcp/subflow.c | ||
120 | +++ b/net/mptcp/subflow.c | ||
121 | @@ -XXX,XX +XXX,XX @@ void __mptcp_subflow_fully_established(struct mptcp_sock *msk, | ||
122 | subflow_set_remote_key(msk, subflow, mp_opt); | ||
123 | WRITE_ONCE(subflow->fully_established, true); | ||
124 | WRITE_ONCE(msk->fully_established, true); | ||
125 | - | ||
126 | - if (subflow->is_mptfo) | ||
127 | - __mptcp_fastopen_gen_msk_ackseq(msk, subflow, mp_opt); | ||
128 | } | ||
129 | |||
130 | static struct sock *subflow_syn_recv_sock(const struct sock *sk, | ||
131 | |||
132 | -- | ||
133 | 2.47.1 |
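The cant_coalesce trick above is easy to picture outside the kernel: the per-skb control block carries one spare byte that vetoes merging with later in-order data, so the fastopen skb can keep its delta-only sequence numbers untouched. Below is a minimal, hypothetical userspace sketch of that veto; buf_cb, try_coalesce() and the sample data are made up for illustration and are not the MPTCP code:

```c
/* Userspace stand-in for mptcp_skb_cb: a full byte for the veto flag,
 * so testing it is a plain load rather than a masked bitfield access. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct buf_cb {
    uint64_t map_seq;       /* only the delta vs. later data matters */
    uint32_t len;
    uint8_t cant_coalesce;  /* set on the fastopen buffer */
    char data[256];
};

/* Append 'from' to 'to' only when 'to' does not veto coalescing, the
 * sequences are contiguous and the merged payload still fits. */
static int try_coalesce(struct buf_cb *to, const struct buf_cb *from)
{
    if (to->cant_coalesce ||
        to->map_seq + to->len != from->map_seq ||
        to->len + from->len > sizeof(to->data))
        return 0;
    memcpy(to->data + to->len, from->data, from->len);
    to->len += from->len;
    return 1;
}

int main(void)
{
    struct buf_cb fastopen = { .map_seq = 0, .len = 4,
                               .cant_coalesce = 1, .data = "SYN+" };
    struct buf_cb in_order = { .map_seq = 4, .len = 4, .data = "DATA" };
    struct buf_cb next     = { .map_seq = 8, .len = 4, .data = "MORE" };

    /* The fastopen buffer refuses the merge; a normal one accepts it. */
    printf("merge into fastopen buffer: %d\n", try_coalesce(&fastopen, &in_order));
    printf("merge into normal buffer:   %d\n", try_coalesce(&in_order, &next));
    return 0;
}
```

Using a whole byte rather than a bitfield bit follows the reasoning in the commit message: the flag is tested on almost every packet and the cb has free space, so the plain byte load is the cheaper check.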
1 | After commit c2e6048fa1cf ("mptcp: fix race in release_cb") it's | 1 | From: Paolo Abeni <pabeni@redhat.com> |
---|---|---|---|
2 | pretty straightforward to move the whole MPTCP rx path under the socket | 2 | |
3 | lock leveraging the release_cb. | 3 | After commit c2e6048fa1cf ("mptcp: fix race in release_cb") we can |
4 | move the whole MPTCP rx path under the socket lock leveraging the | ||
5 | release_cb. | ||
4 | 6 | ||
5 | We can drop a bunch of spin_lock pairs in the receive functions, use | 7 | We can drop a bunch of spin_lock pairs in the receive functions, use |
6 | a single receive queue and invoke __mptcp_move_skbs only when subflows | 8 | a single receive queue and invoke __mptcp_move_skbs only when subflows |
7 | ask for it. | 9 | ask for it. |
8 | 10 | ||
9 | This will allow more cleanup in the next patch | 11 | This will allow more cleanup in the next patch. |
12 | |||
13 | Some changes are worth specific mention: | ||
14 | |||
15 | The msk rcvbuf update now always happens under both the msk and the | ||
16 | subflow socket lock: we can drop a bunch of ONCE annotation and | ||
17 | consolidate the checks. | ||
18 | |||
19 | When the skbs move is delayed at msk release callback time, even the | ||
20 | msk rcvbuf update is delayed; additionally take care of such action in | ||
21 | __mptcp_move_skbs(). | ||
10 | 22 | ||
11 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | 23 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> |
24 | Reviewed-by: Mat Martineau <martineau@kernel.org> | ||
25 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
12 | --- | 26 | --- |
13 | net/mptcp/fastopen.c | 2 ++ | 27 | net/mptcp/fastopen.c | 1 + |
14 | net/mptcp/protocol.c | 76 +++++++++++++++++++------------------------- | 28 | net/mptcp/protocol.c | 123 ++++++++++++++++++++++++--------------------------- |
15 | net/mptcp/protocol.h | 2 +- | 29 | net/mptcp/protocol.h | 2 +- |
16 | 3 files changed, 36 insertions(+), 44 deletions(-) | 30 | 3 files changed, 60 insertions(+), 66 deletions(-) |
17 | 31 | ||
18 | diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c | 32 | diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c |
19 | index XXXXXXX..XXXXXXX 100644 | 33 | index XXXXXXX..XXXXXXX 100644 |
20 | --- a/net/mptcp/fastopen.c | 34 | --- a/net/mptcp/fastopen.c |
21 | +++ b/net/mptcp/fastopen.c | 35 | +++ b/net/mptcp/fastopen.c |
22 | @@ -XXX,XX +XXX,XX @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf | 36 | @@ -XXX,XX +XXX,XX @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subf |
23 | MPTCP_SKB_CB(skb)->has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp; | 37 | MPTCP_SKB_CB(skb)->cant_coalesce = 1; |
24 | 38 | ||
25 | mptcp_data_lock(sk); | 39 | mptcp_data_lock(sk); |
26 | + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); | 40 | + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); |
27 | 41 | ||
28 | mptcp_set_owner_r(skb, sk); | 42 | mptcp_set_owner_r(skb, sk); |
29 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 43 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
30 | @@ -XXX,XX +XXX,XX @@ void __mptcp_fastopen_gen_msk_ackseq(struct mptcp_sock *msk, struct mptcp_subflo | ||
31 | struct sock *sk = (struct sock *)msk; | ||
32 | struct sk_buff *skb; | ||
33 | |||
34 | + DEBUG_NET_WARN_ON_ONCE(sock_owned_by_user_nocheck(sk)); | ||
35 | skb = skb_peek_tail(&sk->sk_receive_queue); | ||
36 | if (skb) { | ||
37 | WARN_ON_ONCE(MPTCP_SKB_CB(skb)->end_seq); | ||
38 | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c | 44 | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c |
39 | index XXXXXXX..XXXXXXX 100644 | 45 | index XXXXXXX..XXXXXXX 100644 |
40 | --- a/net/mptcp/protocol.c | 46 | --- a/net/mptcp/protocol.c |
41 | +++ b/net/mptcp/protocol.c | 47 | +++ b/net/mptcp/protocol.c |
48 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
49 | bool more_data_avail; | ||
50 | struct tcp_sock *tp; | ||
51 | bool done = false; | ||
52 | - int sk_rbuf; | ||
53 | - | ||
54 | - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); | ||
55 | - | ||
56 | - if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { | ||
57 | - int ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); | ||
58 | - | ||
59 | - if (unlikely(ssk_rbuf > sk_rbuf)) { | ||
60 | - WRITE_ONCE(sk->sk_rcvbuf, ssk_rbuf); | ||
61 | - sk_rbuf = ssk_rbuf; | ||
62 | - } | ||
63 | - } | ||
64 | |||
65 | pr_debug("msk=%p ssk=%p\n", msk, ssk); | ||
66 | tp = tcp_sk(ssk); | ||
67 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
68 | WRITE_ONCE(tp->copied_seq, seq); | ||
69 | more_data_avail = mptcp_subflow_data_available(ssk); | ||
70 | |||
71 | - if (atomic_read(&sk->sk_rmem_alloc) > sk_rbuf) { | ||
72 | + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { | ||
73 | done = true; | ||
74 | break; | ||
75 | } | ||
42 | @@ -XXX,XX +XXX,XX @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) | 76 | @@ -XXX,XX +XXX,XX @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) |
43 | return moved > 0; | 77 | return moved > 0; |
44 | } | 78 | } |
45 | 79 | ||
46 | -void mptcp_data_ready(struct sock *sk, struct sock *ssk) | 80 | +static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) |
81 | +{ | ||
82 | + if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf)) | ||
83 | + WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf); | ||
84 | +} | ||
85 | + | ||
47 | +static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) | 86 | +static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) |
87 | +{ | ||
88 | + struct mptcp_sock *msk = mptcp_sk(sk); | ||
89 | + | ||
90 | + __mptcp_rcvbuf_update(sk, ssk); | ||
91 | + | ||
92 | + /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ | ||
93 | + if (__mptcp_rmem(sk) > sk->sk_rcvbuf) | ||
94 | + return; | ||
95 | + | ||
96 | + /* Wake-up the reader only for in-sequence data */ | ||
97 | + if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) | ||
98 | + sk->sk_data_ready(sk); | ||
99 | +} | ||
100 | + | ||
101 | void mptcp_data_ready(struct sock *sk, struct sock *ssk) | ||
48 | { | 102 | { |
49 | struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); | 103 | struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); |
50 | struct mptcp_sock *msk = mptcp_sk(sk); | 104 | - struct mptcp_sock *msk = mptcp_sk(sk); |
105 | - int sk_rbuf, ssk_rbuf; | ||
106 | |||
107 | /* The peer can send data while we are shutting down this | ||
108 | * subflow at msk destruction time, but we must avoid enqueuing | ||
51 | @@ -XXX,XX +XXX,XX @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) | 109 | @@ -XXX,XX +XXX,XX @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk) |
110 | if (unlikely(subflow->disposable)) | ||
52 | return; | 111 | return; |
53 | 112 | ||
54 | /* Wake-up the reader only for in-sequence data */ | 113 | - ssk_rbuf = READ_ONCE(ssk->sk_rcvbuf); |
55 | - mptcp_data_lock(sk); | 114 | - sk_rbuf = READ_ONCE(sk->sk_rcvbuf); |
56 | if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) | 115 | - if (unlikely(ssk_rbuf > sk_rbuf)) |
57 | sk->sk_data_ready(sk); | 116 | - sk_rbuf = ssk_rbuf; |
58 | +} | 117 | - |
59 | + | 118 | - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ |
60 | +void mptcp_data_ready(struct sock *sk, struct sock *ssk) | 119 | - if (__mptcp_rmem(sk) > sk_rbuf) |
61 | +{ | 120 | - return; |
62 | + mptcp_data_lock(sk); | 121 | - |
122 | - /* Wake-up the reader only for in-sequence data */ | ||
123 | mptcp_data_lock(sk); | ||
124 | - if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) | ||
125 | - sk->sk_data_ready(sk); | ||
63 | + if (!sock_owned_by_user(sk)) | 126 | + if (!sock_owned_by_user(sk)) |
64 | + __mptcp_data_ready(sk, ssk); | 127 | + __mptcp_data_ready(sk, ssk); |
65 | + else | 128 | + else |
66 | + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); | 129 | + __set_bit(MPTCP_DEQUEUE, &mptcp_sk(sk)->cb_flags); |
67 | mptcp_data_unlock(sk); | 130 | mptcp_data_unlock(sk); |
68 | } | 131 | } |
69 | 132 | ||
70 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_clean_una_wakeup(struct sock *sk) | ||
71 | |||
72 | static void mptcp_clean_una_wakeup(struct sock *sk) | ||
73 | { | ||
74 | - mptcp_data_lock(sk); | ||
75 | __mptcp_clean_una_wakeup(sk); | ||
76 | - mptcp_data_unlock(sk); | ||
77 | } | ||
78 | |||
79 | static void mptcp_enter_memory_pressure(struct sock *sk) | ||
80 | @@ -XXX,XX +XXX,XX @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) | 133 | @@ -XXX,XX +XXX,XX @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) |
81 | goto out; | 134 | |
82 | } | 135 | static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied); |
83 | 136 | ||
84 | -static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, | 137 | -static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, |
85 | +static bool __mptcp_move_skbs(struct sock *sk); | ||
86 | + | ||
87 | +static int __mptcp_recvmsg_mskq(struct sock *sk, | 138 | +static int __mptcp_recvmsg_mskq(struct sock *sk, |
88 | struct msghdr *msg, | 139 | struct msghdr *msg, |
89 | size_t len, int flags, | 140 | size_t len, int flags, |
90 | struct scm_timestamping_internal *tss, | 141 | struct scm_timestamping_internal *tss, |
91 | int *cmsg_flags) | 142 | int *cmsg_flags) |
92 | { | 143 | { |
93 | + struct mptcp_sock *msk = mptcp_sk(sk); | 144 | + struct mptcp_sock *msk = mptcp_sk(sk); |
94 | struct sk_buff *skb, *tmp; | 145 | struct sk_buff *skb, *tmp; |
95 | int copied = 0; | 146 | int copied = 0; |
96 | 147 | ||
97 | - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { | 148 | - skb_queue_walk_safe(&msk->receive_queue, skb, tmp) { |
98 | + if (skb_queue_empty(&sk->sk_receive_queue) && !__mptcp_move_skbs(sk)) | ||
99 | + return 0; | ||
100 | + | ||
101 | + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { | 149 | + skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) { |
102 | u32 offset = MPTCP_SKB_CB(skb)->offset; | 150 | u32 offset = MPTCP_SKB_CB(skb)->offset; |
103 | u32 data_len = skb->len - offset; | 151 | u32 data_len = skb->len - offset; |
104 | u32 count = min_t(size_t, len - copied, data_len); | 152 | u32 count = min_t(size_t, len - copied, data_len); |
105 | @@ -XXX,XX +XXX,XX @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, | 153 | @@ -XXX,XX +XXX,XX @@ static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, |
... | ... | ||
116 | } | 164 | } |
117 | 165 | ||
118 | -static void __mptcp_splice_receive_queue(struct sock *sk) | 166 | -static void __mptcp_splice_receive_queue(struct sock *sk) |
119 | +static bool __mptcp_move_skbs(struct sock *sk) | 167 | +static bool __mptcp_move_skbs(struct sock *sk) |
120 | { | 168 | { |
169 | + struct mptcp_subflow_context *subflow; | ||
121 | struct mptcp_sock *msk = mptcp_sk(sk); | 170 | struct mptcp_sock *msk = mptcp_sk(sk); |
122 | - | 171 | - |
123 | - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); | 172 | - skb_queue_splice_tail_init(&sk->sk_receive_queue, &msk->receive_queue); |
124 | -} | 173 | -} |
125 | - | 174 | - |
126 | -static bool __mptcp_move_skbs(struct mptcp_sock *msk) | 175 | -static bool __mptcp_move_skbs(struct mptcp_sock *msk) |
127 | -{ | 176 | -{ |
128 | - struct sock *sk = (struct sock *)msk; | 177 | - struct sock *sk = (struct sock *)msk; |
129 | unsigned int moved = 0; | 178 | unsigned int moved = 0; |
130 | bool ret, done; | 179 | bool ret, done; |
131 | 180 | ||
132 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct mptcp_sock *msk) | 181 | + /* verify we can move any data from the subflow, eventually updating */ |
182 | + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) | ||
183 | + mptcp_for_each_subflow(msk, subflow) | ||
184 | + __mptcp_rcvbuf_update(sk, subflow->tcp_sock); | ||
185 | + | ||
186 | + if (__mptcp_rmem(sk) > sk->sk_rcvbuf) | ||
187 | + return false; | ||
188 | + | ||
189 | do { | ||
133 | struct sock *ssk = mptcp_subflow_recv_lookup(msk); | 190 | struct sock *ssk = mptcp_subflow_recv_lookup(msk); |
134 | bool slowpath; | 191 | bool slowpath; |
135 | 192 | ||
136 | - /* we can have data pending in the subflows only if the msk | 193 | - /* we can have data pending in the subflows only if the msk |
137 | - * receive buffer was full at subflow_data_ready() time, | 194 | - * receive buffer was full at subflow_data_ready() time, |
... | ... | ||
187 | + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); | 244 | + bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags); |
188 | if (unlikely(bytes_read < 0)) { | 245 | if (unlikely(bytes_read < 0)) { |
189 | if (!copied) | 246 | if (!copied) |
190 | copied = bytes_read; | 247 | copied = bytes_read; |
191 | @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, | 248 | @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, |
192 | /* be sure to advertise window change */ | 249 | |
193 | mptcp_cleanup_rbuf(msk); | 250 | copied += bytes_read; |
194 | 251 | ||
195 | - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) | 252 | - if (skb_queue_empty(&msk->receive_queue) && __mptcp_move_skbs(msk)) |
196 | - continue; | 253 | + if (skb_queue_empty(&sk->sk_receive_queue) && __mptcp_move_skbs(sk)) |
254 | continue; | ||
197 | 255 | ||
198 | /* only the MPTCP socket status is relevant here. The exit | 256 | /* only the MPTCP socket status is relevant here. The exit |
199 | * conditions mirror closely tcp_recvmsg() | ||
200 | @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, | 257 | @@ -XXX,XX +XXX,XX @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, |
201 | /* race breaker: the shutdown could be after the | 258 | /* race breaker: the shutdown could be after the |
202 | * previous receive queue check | 259 | * previous receive queue check |
203 | */ | 260 | */ |
204 | - if (__mptcp_move_skbs(msk)) | 261 | - if (__mptcp_move_skbs(msk)) |
... | ... | ||
211 | } | 268 | } |
212 | 269 | ||
213 | - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", | 270 | - pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", |
214 | - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), | 271 | - msk, skb_queue_empty_lockless(&sk->sk_receive_queue), |
215 | - skb_queue_empty(&msk->receive_queue), copied); | 272 | - skb_queue_empty(&msk->receive_queue), copied); |
216 | + pr_debug("msk=%p rx queue empty=%d copied=%d", | 273 | + pr_debug("msk=%p rx queue empty=%d copied=%d\n", |
217 | + msk, skb_queue_empty(&sk->sk_receive_queue), copied); | 274 | + msk, skb_queue_empty(&sk->sk_receive_queue), copied); |
218 | 275 | ||
219 | release_sock(sk); | 276 | release_sock(sk); |
220 | return copied; | 277 | return copied; |
221 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_init_sock(struct sock *sk) | 278 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_init_sock(struct sock *sk) |
... | ... | ||
251 | static void mptcp_release_cb(struct sock *sk) | 308 | static void mptcp_release_cb(struct sock *sk) |
252 | @@ -XXX,XX +XXX,XX @@ static void mptcp_release_cb(struct sock *sk) | 309 | @@ -XXX,XX +XXX,XX @@ static void mptcp_release_cb(struct sock *sk) |
253 | __mptcp_push_pending(sk, 0); | 310 | __mptcp_push_pending(sk, 0); |
254 | if (flags & BIT(MPTCP_RETRANSMIT)) | 311 | if (flags & BIT(MPTCP_RETRANSMIT)) |
255 | __mptcp_retrans(sk); | 312 | __mptcp_retrans(sk); |
256 | + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) | 313 | + if ((flags & BIT(MPTCP_DEQUEUE)) && __mptcp_move_skbs(sk)) { |
314 | + /* notify ack seq update */ | ||
315 | + mptcp_cleanup_rbuf(msk, 0); | ||
257 | + sk->sk_data_ready(sk); | 316 | + sk->sk_data_ready(sk); |
317 | + } | ||
258 | 318 | ||
259 | cond_resched(); | 319 | cond_resched(); |
260 | spin_lock_bh(&sk->sk_lock.slock); | 320 | spin_lock_bh(&sk->sk_lock.slock); |
261 | @@ -XXX,XX +XXX,XX @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) | 321 | @@ -XXX,XX +XXX,XX @@ static int mptcp_ioctl(struct sock *sk, int cmd, int *karg) |
262 | return -EINVAL; | 322 | return -EINVAL; |
263 | 323 | ||
264 | lock_sock(sk); | 324 | lock_sock(sk); |
265 | - __mptcp_move_skbs(msk); | 325 | - __mptcp_move_skbs(msk); |
266 | + __mptcp_move_skbs(sk); | 326 | + if (__mptcp_move_skbs(sk)) |
327 | + mptcp_cleanup_rbuf(msk, 0); | ||
267 | *karg = mptcp_inq_hint(sk); | 328 | *karg = mptcp_inq_hint(sk); |
268 | release_sock(sk); | 329 | release_sock(sk); |
269 | break; | 330 | break; |
270 | diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h | 331 | diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h |
271 | index XXXXXXX..XXXXXXX 100644 | 332 | index XXXXXXX..XXXXXXX 100644 |
... | ... | ||
285 | struct rb_root out_of_order_queue; | 346 | struct rb_root out_of_order_queue; |
286 | - struct sk_buff_head receive_queue; | 347 | - struct sk_buff_head receive_queue; |
287 | struct list_head conn_list; | 348 | struct list_head conn_list; |
288 | struct list_head rtx_queue; | 349 | struct list_head rtx_queue; |
289 | struct mptcp_data_frag *first_pending; | 350 | struct mptcp_data_frag *first_pending; |
351 | |||
290 | -- | 352 | -- |
291 | 2.45.2 | 353 | 2.47.1 |
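The locking scheme in this patch follows the usual kernel pattern of deferring work to the socket's release callback whenever the lock is currently owned by a user-context caller. A toy userspace model of just that pattern is sketched below; fake_sock, data_ready(), release_cb() and FLAG_DEQUEUE are illustrative stand-ins, not the kernel API:

```c
/* Toy model of the deferral pattern: spool data immediately when the
 * "socket" is not owned by user context, otherwise set a flag bit and
 * catch up at release time, as mptcp_data_ready()/mptcp_release_cb()
 * do in the patch above. */
#include <stdbool.h>
#include <stdio.h>

#define FLAG_DEQUEUE (1u << 0)

struct fake_sock {
    bool owned_by_user;     /* lock currently held by a user-context caller */
    unsigned int cb_flags;  /* deferred-work bits */
    int pending;            /* packets still sitting in the subflow */
    int received;           /* packets moved to the msk-level queue */
};

static void move_skbs(struct fake_sock *sk)
{
    sk->received += sk->pending;
    sk->pending = 0;
}

/* Called from "softirq" context when a subflow has new data. */
static void data_ready(struct fake_sock *sk, int pkts)
{
    sk->pending += pkts;
    if (!sk->owned_by_user)
        move_skbs(sk);                  /* fast path: spool right away */
    else
        sk->cb_flags |= FLAG_DEQUEUE;   /* defer to release time */
}

/* Called when the user-context caller drops the "socket lock". */
static void release_cb(struct fake_sock *sk)
{
    if (sk->cb_flags & FLAG_DEQUEUE) {
        sk->cb_flags &= ~FLAG_DEQUEUE;
        move_skbs(sk);
    }
}

int main(void)
{
    struct fake_sock sk = { .owned_by_user = true };

    data_ready(&sk, 3);     /* arrives while user code holds the lock */
    printf("before release: received=%d pending=%d\n", sk.received, sk.pending);
    sk.owned_by_user = false;
    release_cb(&sk);        /* catch up on the deferred spooling */
    printf("after release:  received=%d pending=%d\n", sk.received, sk.pending);
    return 0;
}
```

The payoff, per the cover letter, is that the expensive spooling from the subflows now always runs under the msk socket lock, so the RX path no longer needs its own 'exceptional' locking and can use a single receive queue.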
1 | From: Paolo Abeni <pabeni@redhat.com> | ||
---|---|---|---|
2 | |||
1 | After the previous patch, updating sk_forward_memory is cheap and | 3 | After the previous patch, updating sk_forward_memory is cheap and |
2 | we can drop a lot of complexity from the MPTCP memory accounting, | 4 | we can drop a lot of complexity from the MPTCP memory accounting, |
3 | removing the custom fwd mem allocations for rmem. | 5 | removing the custom fwd mem allocations for rmem. |
4 | 6 | ||
5 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | 7 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> |
8 | Reviewed-by: Mat Martineau <martineau@kernel.org> | ||
9 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
6 | --- | 10 | --- |
7 | net/mptcp/fastopen.c | 2 +- | 11 | net/mptcp/fastopen.c | 2 +- |
8 | net/mptcp/protocol.c | 128 ++++--------------------------------------- | 12 | net/mptcp/protocol.c | 115 ++++----------------------------------------------- |
9 | net/mptcp/protocol.h | 4 +- | 13 | net/mptcp/protocol.h | 4 +- |
10 | 3 files changed, 13 insertions(+), 121 deletions(-) | 14 | 3 files changed, 10 insertions(+), 111 deletions(-) |
11 | 15 | ||
12 | diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c | 16 | diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c |
13 | index XXXXXXX..XXXXXXX 100644 | 17 | index XXXXXXX..XXXXXXX 100644 |
14 | --- a/net/mptcp/fastopen.c | 18 | --- a/net/mptcp/fastopen.c |
15 | +++ b/net/mptcp/fastopen.c | 19 | +++ b/net/mptcp/fastopen.c |
... | ... | ||
141 | - mptcp_set_owner_r(skb, sk); | 145 | - mptcp_set_owner_r(skb, sk); |
142 | + skb_set_owner_r(skb, sk); | 146 | + skb_set_owner_r(skb, sk); |
143 | __skb_queue_tail(&sk->sk_receive_queue, skb); | 147 | __skb_queue_tail(&sk->sk_receive_queue, skb); |
144 | return true; | 148 | return true; |
145 | } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { | 149 | } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) { |
146 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_clean_una(struct sock *sk) | ||
147 | |||
148 | static void __mptcp_clean_una_wakeup(struct sock *sk) | ||
149 | { | ||
150 | - lockdep_assert_held_once(&sk->sk_lock.slock); | ||
151 | - | ||
152 | __mptcp_clean_una(sk); | ||
153 | mptcp_write_space(sk); | ||
154 | } | ||
155 | |||
156 | -static void mptcp_clean_una_wakeup(struct sock *sk) | ||
157 | -{ | ||
158 | - __mptcp_clean_una_wakeup(sk); | ||
159 | -} | ||
160 | - | ||
161 | static void mptcp_enter_memory_pressure(struct sock *sk) | ||
162 | { | ||
163 | struct mptcp_subflow_context *subflow; | ||
164 | @@ -XXX,XX +XXX,XX @@ static int __mptcp_recvmsg_mskq(struct sock *sk, | 150 | @@ -XXX,XX +XXX,XX @@ static int __mptcp_recvmsg_mskq(struct sock *sk, |
165 | } | 151 | } |
166 | 152 | ||
167 | if (!(flags & MSG_PEEK)) { | 153 | if (!(flags & MSG_PEEK)) { |
168 | - /* we will bulk release the skb memory later */ | 154 | - /* we will bulk release the skb memory later */ |
... | ... | ||
190 | - WRITE_ONCE(msk->rmem_released, 0); | 176 | - WRITE_ONCE(msk->rmem_released, 0); |
191 | -} | 177 | -} |
192 | - | 178 | - |
193 | static bool __mptcp_move_skbs(struct sock *sk) | 179 | static bool __mptcp_move_skbs(struct sock *sk) |
194 | { | 180 | { |
195 | struct mptcp_sock *msk = mptcp_sk(sk); | 181 | struct mptcp_subflow_context *subflow; |
196 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct sock *sk) | 182 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct sock *sk) |
197 | break; | 183 | break; |
198 | 184 | ||
199 | slowpath = lock_sock_fast(ssk); | 185 | slowpath = lock_sock_fast(ssk); |
200 | - __mptcp_update_rmem(sk); | 186 | - __mptcp_update_rmem(sk); |
201 | done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); | 187 | done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); |
202 | 188 | ||
203 | if (unlikely(ssk->sk_err)) | 189 | if (unlikely(ssk->sk_err)) |
204 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct sock *sk) | 190 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct sock *sk) |
205 | 191 | unlock_sock_fast(ssk, slowpath); | |
206 | ret = moved > 0; | 192 | } while (!done); |
207 | if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || | 193 | |
194 | - ret = moved > 0; | ||
195 | - if (!RB_EMPTY_ROOT(&msk->out_of_order_queue) || | ||
208 | - !skb_queue_empty(&sk->sk_receive_queue)) { | 196 | - !skb_queue_empty(&sk->sk_receive_queue)) { |
209 | - __mptcp_update_rmem(sk); | 197 | - __mptcp_update_rmem(sk); |
210 | + !skb_queue_empty(&sk->sk_receive_queue)) | 198 | - ret |= __mptcp_ofo_queue(msk); |
211 | ret |= __mptcp_ofo_queue(msk); | ||
212 | - } | 199 | - } |
213 | + | 200 | + ret = moved > 0 || __mptcp_ofo_queue(msk); |
214 | if (ret) | 201 | if (ret) |
215 | mptcp_check_data_fin((struct sock *)msk); | 202 | mptcp_check_data_fin((struct sock *)msk); |
216 | return ret; | 203 | return ret; |
217 | @@ -XXX,XX +XXX,XX @@ bool __mptcp_retransmit_pending_data(struct sock *sk) | ||
218 | * some data in the mptcp rtx queue has not really xmitted yet. | ||
219 | * keep it simple and re-inject the whole mptcp level rtx queue | ||
220 | */ | ||
221 | - mptcp_data_lock(sk); | ||
222 | __mptcp_clean_una_wakeup(sk); | ||
223 | rtx_head = mptcp_rtx_head(sk); | ||
224 | - if (!rtx_head) { | ||
225 | - mptcp_data_unlock(sk); | ||
226 | + if (!rtx_head) | ||
227 | return false; | ||
228 | - } | ||
229 | |||
230 | msk->recovery_snd_nxt = msk->snd_nxt; | ||
231 | msk->recovery = true; | ||
232 | - mptcp_data_unlock(sk); | ||
233 | |||
234 | msk->first_pending = rtx_head; | ||
235 | msk->snd_burst = 0; | ||
236 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_retrans(struct sock *sk) | ||
237 | int ret, err; | ||
238 | u16 len = 0; | ||
239 | |||
240 | - mptcp_clean_una_wakeup(sk); | ||
241 | + __mptcp_clean_una_wakeup(sk); | ||
242 | |||
243 | /* first check ssk: need to kick "stale" logic */ | ||
244 | err = mptcp_sched_get_retrans(msk); | ||
245 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_init_sock(struct sock *sk) | 204 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_init_sock(struct sock *sk) |
246 | INIT_WORK(&msk->work, mptcp_worker); | 205 | INIT_WORK(&msk->work, mptcp_worker); |
247 | msk->out_of_order_queue = RB_ROOT; | 206 | msk->out_of_order_queue = RB_ROOT; |
248 | msk->first_pending = NULL; | 207 | msk->first_pending = NULL; |
249 | - WRITE_ONCE(msk->rmem_fwd_alloc, 0); | 208 | - WRITE_ONCE(msk->rmem_fwd_alloc, 0); |
... | ... | ||
326 | - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); | 285 | - return atomic_read(&sk->sk_rmem_alloc) - READ_ONCE(mptcp_sk(sk)->rmem_released); |
327 | + return atomic_read(&sk->sk_rmem_alloc); | 286 | + return atomic_read(&sk->sk_rmem_alloc); |
328 | } | 287 | } |
329 | 288 | ||
330 | static inline int mptcp_win_from_space(const struct sock *sk, int space) | 289 | static inline int mptcp_win_from_space(const struct sock *sk, int space) |
290 | |||
331 | -- | 291 | -- |
332 | 2.45.2 | 292 | 2.47.1 |
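As a rough picture of what dropping the custom fwd mem allocations buys, here is a toy before/after model: the old scheme parked freed receive memory in a per-socket counter and reconciled it in bulk later, while the new scheme charges and uncharges a single rmem counter directly at enqueue/dequeue time. The structs and helpers below are hypothetical userspace stand-ins, not the functions removed above:

```c
/* Toy before/after model of the rmem accounting change: the old scheme
 * parks freed receive memory in a separate counter and reconciles it in
 * bulk later; the new scheme adjusts a single counter directly. */
#include <stdio.h>

struct old_sock {
    int rmem_alloc;     /* memory charged to the receive queue */
    int rmem_released;  /* freed memory waiting for a bulk release */
};

static void old_free_skb(struct old_sock *sk, int truesize)
{
    sk->rmem_released += truesize;  /* deferred: needs a later flush */
}

static void old_flush(struct old_sock *sk)
{
    sk->rmem_alloc -= sk->rmem_released;
    sk->rmem_released = 0;
}

struct new_sock {
    int rmem_alloc;
};

static void new_free_skb(struct new_sock *sk, int truesize)
{
    sk->rmem_alloc -= truesize;     /* immediate, nothing to reconcile */
}

int main(void)
{
    struct old_sock o = { .rmem_alloc = 4096 };
    struct new_sock n = { .rmem_alloc = 4096 };

    old_free_skb(&o, 1024);
    new_free_skb(&n, 1024);
    printf("old, before flush: alloc=%d released=%d\n", o.rmem_alloc, o.rmem_released);
    old_flush(&o);
    printf("old, after flush:  alloc=%d released=%d\n", o.rmem_alloc, o.rmem_released);
    printf("new:               alloc=%d\n", n.rmem_alloc);
    return 0;
}
```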
New patch | |||
---|---|---|---|
1 | From: Paolo Abeni <pabeni@redhat.com> | ||
1 | 2 | ||
3 | After the previous patch we can remove the forward_alloc_get | ||
4 | proto callback, basically reverting commit 292e6077b040 ("net: introduce | ||
5 | sk_forward_alloc_get()") and commit 66d58f046c9d ("net: use | ||
6 | sk_forward_alloc_get() in sk_get_meminfo()"). | ||
7 | |||
8 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||
9 | Acked-by: Mat Martineau <martineau@kernel.org> | ||
10 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
11 | --- | ||
12 | include/net/sock.h | 13 ------------- | ||
13 | net/core/sock.c | 2 +- | ||
14 | net/ipv4/af_inet.c | 2 +- | ||
15 | net/ipv4/inet_diag.c | 2 +- | ||
16 | net/sched/em_meta.c | 2 +- | ||
17 | 5 files changed, 4 insertions(+), 17 deletions(-) | ||
18 | |||
19 | diff --git a/include/net/sock.h b/include/net/sock.h | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/include/net/sock.h | ||
22 | +++ b/include/net/sock.h | ||
23 | @@ -XXX,XX +XXX,XX @@ struct proto { | ||
24 | unsigned int inuse_idx; | ||
25 | #endif | ||
26 | |||
27 | -#if IS_ENABLED(CONFIG_MPTCP) | ||
28 | - int (*forward_alloc_get)(const struct sock *sk); | ||
29 | -#endif | ||
30 | - | ||
31 | bool (*stream_memory_free)(const struct sock *sk, int wake); | ||
32 | bool (*sock_is_readable)(struct sock *sk); | ||
33 | /* Memory pressure */ | ||
34 | @@ -XXX,XX +XXX,XX @@ int sock_load_diag_module(int family, int protocol); | ||
35 | |||
36 | INDIRECT_CALLABLE_DECLARE(bool tcp_stream_memory_free(const struct sock *sk, int wake)); | ||
37 | |||
38 | -static inline int sk_forward_alloc_get(const struct sock *sk) | ||
39 | -{ | ||
40 | -#if IS_ENABLED(CONFIG_MPTCP) | ||
41 | - if (sk->sk_prot->forward_alloc_get) | ||
42 | - return sk->sk_prot->forward_alloc_get(sk); | ||
43 | -#endif | ||
44 | - return READ_ONCE(sk->sk_forward_alloc); | ||
45 | -} | ||
46 | - | ||
47 | static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) | ||
48 | { | ||
49 | if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) | ||
50 | diff --git a/net/core/sock.c b/net/core/sock.c | ||
51 | index XXXXXXX..XXXXXXX 100644 | ||
52 | --- a/net/core/sock.c | ||
53 | +++ b/net/core/sock.c | ||
54 | @@ -XXX,XX +XXX,XX @@ void sk_get_meminfo(const struct sock *sk, u32 *mem) | ||
55 | mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); | ||
56 | mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); | ||
57 | mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); | ||
58 | - mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk); | ||
59 | + mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); | ||
60 | mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); | ||
61 | mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); | ||
62 | mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); | ||
63 | diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c | ||
64 | index XXXXXXX..XXXXXXX 100644 | ||
65 | --- a/net/ipv4/af_inet.c | ||
66 | +++ b/net/ipv4/af_inet.c | ||
67 | @@ -XXX,XX +XXX,XX @@ void inet_sock_destruct(struct sock *sk) | ||
68 | WARN_ON_ONCE(atomic_read(&sk->sk_rmem_alloc)); | ||
69 | WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); | ||
70 | WARN_ON_ONCE(sk->sk_wmem_queued); | ||
71 | - WARN_ON_ONCE(sk_forward_alloc_get(sk)); | ||
72 | + WARN_ON_ONCE(sk->sk_forward_alloc); | ||
73 | |||
74 | kfree(rcu_dereference_protected(inet->inet_opt, 1)); | ||
75 | dst_release(rcu_dereference_protected(sk->sk_dst_cache, 1)); | ||
76 | diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c | ||
77 | index XXXXXXX..XXXXXXX 100644 | ||
78 | --- a/net/ipv4/inet_diag.c | ||
79 | +++ b/net/ipv4/inet_diag.c | ||
80 | @@ -XXX,XX +XXX,XX @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk, | ||
81 | struct inet_diag_meminfo minfo = { | ||
82 | .idiag_rmem = sk_rmem_alloc_get(sk), | ||
83 | .idiag_wmem = READ_ONCE(sk->sk_wmem_queued), | ||
84 | - .idiag_fmem = sk_forward_alloc_get(sk), | ||
85 | + .idiag_fmem = READ_ONCE(sk->sk_forward_alloc), | ||
86 | .idiag_tmem = sk_wmem_alloc_get(sk), | ||
87 | }; | ||
88 | |||
89 | diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c | ||
90 | index XXXXXXX..XXXXXXX 100644 | ||
91 | --- a/net/sched/em_meta.c | ||
92 | +++ b/net/sched/em_meta.c | ||
93 | @@ -XXX,XX +XXX,XX @@ META_COLLECTOR(int_sk_fwd_alloc) | ||
94 | *err = -1; | ||
95 | return; | ||
96 | } | ||
97 | - dst->value = sk_forward_alloc_get(sk); | ||
98 | + dst->value = READ_ONCE(sk->sk_forward_alloc); | ||
99 | } | ||
100 | |||
101 | META_COLLECTOR(int_sk_sndbuf) | ||
102 | |||
103 | -- | ||
104 | 2.47.1 |
New patch | |||
---|---|---|---|
1 | From: Paolo Abeni <pabeni@redhat.com> | ||
1 | 2 | ||
3 | After the RX path refactor, it became a wrapper for sk_rmem_alloc | ||
4 | access, with a slightly misleading name. Just drop it. | ||
5 | |||
6 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||
7 | Reviewed-by: Mat Martineau <martineau@kernel.org> | ||
8 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
9 | --- | ||
10 | net/mptcp/protocol.c | 8 ++++---- | ||
11 | net/mptcp/protocol.h | 11 ++--------- | ||
12 | 2 files changed, 6 insertions(+), 13 deletions(-) | ||
13 | |||
14 | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | ||
16 | --- a/net/mptcp/protocol.c | ||
17 | +++ b/net/mptcp/protocol.c | ||
18 | @@ -XXX,XX +XXX,XX @@ static void mptcp_cleanup_rbuf(struct mptcp_sock *msk, int copied) | ||
19 | bool cleanup, rx_empty; | ||
20 | |||
21 | cleanup = (space > 0) && (space >= (old_space << 1)) && copied; | ||
22 | - rx_empty = !__mptcp_rmem(sk) && copied; | ||
23 | + rx_empty = !sk_rmem_alloc_get(sk) && copied; | ||
24 | |||
25 | mptcp_for_each_subflow(msk, subflow) { | ||
26 | struct sock *ssk = mptcp_subflow_tcp_sock(subflow); | ||
27 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
28 | WRITE_ONCE(tp->copied_seq, seq); | ||
29 | more_data_avail = mptcp_subflow_data_available(ssk); | ||
30 | |||
31 | - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { | ||
32 | + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) { | ||
33 | done = true; | ||
34 | break; | ||
35 | } | ||
36 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) | ||
37 | __mptcp_rcvbuf_update(sk, ssk); | ||
38 | |||
39 | /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ | ||
40 | - if (__mptcp_rmem(sk) > sk->sk_rcvbuf) | ||
41 | + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
42 | return; | ||
43 | |||
44 | /* Wake-up the reader only for in-sequence data */ | ||
45 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs(struct sock *sk) | ||
46 | mptcp_for_each_subflow(msk, subflow) | ||
47 | __mptcp_rcvbuf_update(sk, subflow->tcp_sock); | ||
48 | |||
49 | - if (__mptcp_rmem(sk) > sk->sk_rcvbuf) | ||
50 | + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
51 | return false; | ||
52 | |||
53 | do { | ||
54 | diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h | ||
55 | index XXXXXXX..XXXXXXX 100644 | ||
56 | --- a/net/mptcp/protocol.h | ||
57 | +++ b/net/mptcp/protocol.h | ||
58 | @@ -XXX,XX +XXX,XX @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) | ||
59 | #define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) | ||
60 | #endif | ||
61 | |||
62 | -/* the msk socket don't use the backlog, also account for the bulk | ||
63 | - * free memory | ||
64 | - */ | ||
65 | -static inline int __mptcp_rmem(const struct sock *sk) | ||
66 | -{ | ||
67 | - return atomic_read(&sk->sk_rmem_alloc); | ||
68 | -} | ||
69 | - | ||
70 | static inline int mptcp_win_from_space(const struct sock *sk, int space) | ||
71 | { | ||
72 | return __tcp_win_from_space(mptcp_sk(sk)->scaling_ratio, space); | ||
73 | @@ -XXX,XX +XXX,XX @@ static inline int mptcp_space_from_win(const struct sock *sk, int win) | ||
74 | |||
75 | static inline int __mptcp_space(const struct sock *sk) | ||
76 | { | ||
77 | - return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - __mptcp_rmem(sk)); | ||
78 | + return mptcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - | ||
79 | + sk_rmem_alloc_get(sk)); | ||
80 | } | ||
81 | |||
82 | static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk) | ||
83 | |||
84 | -- | ||
85 | 2.47.1 |
New patch | |||
---|---|---|---|
1 | 1 | From: Paolo Abeni <pabeni@redhat.com> | |
2 | |||
3 | After the RX path refactor the mentioned function is expected to run | ||
4 | frequently, let's optimize it a bit. | ||
5 | |||
6 | Scan for ready subflow from the last processed one, and stop after | ||
7 | traversing the list once or reaching the msk memory limit - instead of | ||
8 | looking for dubious per-subflow conditions. | ||
9 | Also re-order the memory limit checks, to avoid duplicate tests. | ||
10 | |||
11 | Signed-off-by: Paolo Abeni <pabeni@redhat.com> | ||
12 | Reviewed-by: Mat Martineau <martineau@kernel.org> | ||
13 | Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org> | ||
14 | --- | ||
15 | net/mptcp/protocol.c | 111 +++++++++++++++++++++++---------------------------- | ||
16 | net/mptcp/protocol.h | 2 + | ||
17 | 2 files changed, 52 insertions(+), 61 deletions(-) | ||
18 | |||
19 | diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c | ||
20 | index XXXXXXX..XXXXXXX 100644 | ||
21 | --- a/net/mptcp/protocol.c | ||
22 | +++ b/net/mptcp/protocol.c | ||
23 | @@ -XXX,XX +XXX,XX @@ static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk) | ||
24 | } | ||
25 | |||
26 | static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
27 | - struct sock *ssk, | ||
28 | - unsigned int *bytes) | ||
29 | + struct sock *ssk) | ||
30 | { | ||
31 | struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk); | ||
32 | struct sock *sk = (struct sock *)msk; | ||
33 | - unsigned int moved = 0; | ||
34 | bool more_data_avail; | ||
35 | struct tcp_sock *tp; | ||
36 | - bool done = false; | ||
37 | + bool ret = false; | ||
38 | |||
39 | pr_debug("msk=%p ssk=%p\n", msk, ssk); | ||
40 | tp = tcp_sk(ssk); | ||
41 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
42 | struct sk_buff *skb; | ||
43 | bool fin; | ||
44 | |||
45 | + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
46 | + break; | ||
47 | + | ||
48 | /* try to move as much data as available */ | ||
49 | map_remaining = subflow->map_data_len - | ||
50 | mptcp_subflow_get_map_offset(subflow); | ||
51 | |||
52 | skb = skb_peek(&ssk->sk_receive_queue); | ||
53 | - if (!skb) { | ||
54 | - /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), | ||
55 | - * a different CPU can have already processed the pending | ||
56 | - * data, stop here or we can enter an infinite loop | ||
57 | - */ | ||
58 | - if (!moved) | ||
59 | - done = true; | ||
60 | + if (unlikely(!skb)) | ||
61 | break; | ||
62 | - } | ||
63 | |||
64 | if (__mptcp_check_fallback(msk)) { | ||
65 | /* Under fallback skbs have no MPTCP extension and TCP could | ||
66 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
67 | |||
68 | offset = seq - TCP_SKB_CB(skb)->seq; | ||
69 | fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; | ||
70 | - if (fin) { | ||
71 | - done = true; | ||
72 | + if (fin) | ||
73 | seq++; | ||
74 | - } | ||
75 | |||
76 | if (offset < skb->len) { | ||
77 | size_t len = skb->len - offset; | ||
78 | |||
79 | - if (tp->urg_data) | ||
80 | - done = true; | ||
81 | - | ||
82 | - if (__mptcp_move_skb(msk, ssk, skb, offset, len)) | ||
83 | - moved += len; | ||
84 | + ret = __mptcp_move_skb(msk, ssk, skb, offset, len) || ret; | ||
85 | seq += len; | ||
86 | |||
87 | if (unlikely(map_remaining < len)) { | ||
88 | @@ -XXX,XX +XXX,XX @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, | ||
89 | } | ||
90 | |||
91 | sk_eat_skb(ssk, skb); | ||
92 | - done = true; | ||
93 | } | ||
94 | |||
95 | WRITE_ONCE(tp->copied_seq, seq); | ||
96 | more_data_avail = mptcp_subflow_data_available(ssk); | ||
97 | |||
98 | - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) { | ||
99 | - done = true; | ||
100 | - break; | ||
101 | - } | ||
102 | } while (more_data_avail); | ||
103 | |||
104 | - if (moved > 0) | ||
105 | + if (ret) | ||
106 | msk->last_data_recv = tcp_jiffies32; | ||
107 | - *bytes += moved; | ||
108 | - return done; | ||
109 | + return ret; | ||
110 | } | ||
111 | |||
112 | static bool __mptcp_ofo_queue(struct mptcp_sock *msk) | ||
113 | @@ -XXX,XX +XXX,XX @@ void __mptcp_error_report(struct sock *sk) | ||
114 | static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) | ||
115 | { | ||
116 | struct sock *sk = (struct sock *)msk; | ||
117 | - unsigned int moved = 0; | ||
118 | + bool moved; | ||
119 | |||
120 | - __mptcp_move_skbs_from_subflow(msk, ssk, &moved); | ||
121 | + moved = __mptcp_move_skbs_from_subflow(msk, ssk); | ||
122 | __mptcp_ofo_queue(msk); | ||
123 | if (unlikely(ssk->sk_err)) { | ||
124 | if (!sock_owned_by_user(sk)) | ||
125 | @@ -XXX,XX +XXX,XX @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk) | ||
126 | */ | ||
127 | if (mptcp_pending_data_fin(sk, NULL)) | ||
128 | mptcp_schedule_work(sk); | ||
129 | - return moved > 0; | ||
130 | + return moved; | ||
131 | } | ||
132 | |||
133 | static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk) | ||
134 | @@ -XXX,XX +XXX,XX @@ static void __mptcp_data_ready(struct sock *sk, struct sock *ssk) | ||
135 | |||
136 | __mptcp_rcvbuf_update(sk, ssk); | ||
137 | |||
138 | - /* over limit? can't append more skbs to msk, Also, no need to wake-up*/ | ||
139 | - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
140 | - return; | ||
141 | - | ||
142 | /* Wake-up the reader only for in-sequence data */ | ||
143 | if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk)) | ||
144 | sk->sk_data_ready(sk); | ||
145 | @@ -XXX,XX +XXX,XX @@ bool mptcp_schedule_work(struct sock *sk) | ||
146 | return false; | ||
147 | } | ||
148 | |||
149 | -static struct sock *mptcp_subflow_recv_lookup(const struct mptcp_sock *msk) | ||
150 | -{ | ||
151 | - struct mptcp_subflow_context *subflow; | ||
152 | - | ||
153 | - msk_owned_by_me(msk); | ||
154 | - | ||
155 | - mptcp_for_each_subflow(msk, subflow) { | ||
156 | - if (READ_ONCE(subflow->data_avail)) | ||
157 | - return mptcp_subflow_tcp_sock(subflow); | ||
158 | - } | ||
159 | - | ||
160 | - return NULL; | ||
161 | -} | ||
162 | - | ||
163 | static bool mptcp_skb_can_collapse_to(u64 write_seq, | ||
164 | const struct sk_buff *skb, | ||
165 | const struct mptcp_ext *mpext) | ||
166 | @@ -XXX,XX +XXX,XX @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) | ||
167 | msk->rcvq_space.time = mstamp; | ||
168 | } | ||
169 | |||
170 | +static struct mptcp_subflow_context * | ||
171 | +__mptcp_first_ready_from(struct mptcp_sock *msk, | ||
172 | + struct mptcp_subflow_context *subflow) | ||
173 | +{ | ||
174 | + struct mptcp_subflow_context *start_subflow = subflow; | ||
175 | + | ||
176 | + while (!READ_ONCE(subflow->data_avail)) { | ||
177 | + subflow = mptcp_next_subflow(msk, subflow); | ||
178 | + if (subflow == start_subflow) | ||
179 | + return NULL; | ||
180 | + } | ||
181 | + return subflow; | ||
182 | +} | ||
183 | + | ||
184 | static bool __mptcp_move_skbs(struct sock *sk) | ||
185 | { | ||
186 | struct mptcp_subflow_context *subflow; | ||
187 | struct mptcp_sock *msk = mptcp_sk(sk); | ||
188 | - unsigned int moved = 0; | ||
189 | - bool ret, done; | ||
190 | + bool ret = false; | ||
191 | + | ||
192 | + if (list_empty(&msk->conn_list)) | ||
193 | + return false; | ||
194 | |||
195 | /* verify we can move any data from the subflow, eventually updating */ | ||
196 | if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) | ||
197 | mptcp_for_each_subflow(msk, subflow) | ||
198 | __mptcp_rcvbuf_update(sk, subflow->tcp_sock); | ||
199 | |||
200 | - if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
201 | - return false; | ||
202 | - | ||
203 | - do { | ||
204 | - struct sock *ssk = mptcp_subflow_recv_lookup(msk); | ||
205 | + subflow = list_first_entry(&msk->conn_list, | ||
206 | + struct mptcp_subflow_context, node); | ||
207 | + for (;;) { | ||
208 | + struct sock *ssk; | ||
209 | bool slowpath; | ||
210 | |||
211 | - if (unlikely(!ssk)) | ||
212 | + /* | ||
213 | + * As an optimization avoid traversing the subflows list | ||
214 | + * and ev. acquiring the subflow socket lock before baling out | ||
215 | + */ | ||
216 | + if (sk_rmem_alloc_get(sk) > sk->sk_rcvbuf) | ||
217 | break; | ||
218 | |||
219 | - slowpath = lock_sock_fast(ssk); | ||
220 | - done = __mptcp_move_skbs_from_subflow(msk, ssk, &moved); | ||
221 | + subflow = __mptcp_first_ready_from(msk, subflow); | ||
222 | + if (!subflow) | ||
223 | + break; | ||
224 | |||
225 | + ssk = mptcp_subflow_tcp_sock(subflow); | ||
226 | + slowpath = lock_sock_fast(ssk); | ||
227 | + ret = __mptcp_move_skbs_from_subflow(msk, ssk) || ret; | ||
228 | if (unlikely(ssk->sk_err)) | ||
229 | __mptcp_error_report(sk); | ||
230 | unlock_sock_fast(ssk, slowpath); | ||
231 | - } while (!done); | ||
232 | |||
233 | - ret = moved > 0 || __mptcp_ofo_queue(msk); | ||
234 | + subflow = mptcp_next_subflow(msk, subflow); | ||
235 | + } | ||
236 | + | ||
237 | + __mptcp_ofo_queue(msk); | ||
238 | if (ret) | ||
239 | mptcp_check_data_fin((struct sock *)msk); | ||
240 | return ret; | ||
241 | diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h | ||
242 | index XXXXXXX..XXXXXXX 100644 | ||
243 | --- a/net/mptcp/protocol.h | ||
244 | +++ b/net/mptcp/protocol.h | ||
245 | @@ -XXX,XX +XXX,XX @@ struct mptcp_sock { | ||
246 | list_for_each_entry(__subflow, &((__msk)->conn_list), node) | ||
247 | #define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ | ||
248 | list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) | ||
249 | +#define mptcp_next_subflow(__msk, __subflow) \ | ||
250 | + list_next_entry_circular(__subflow, &((__msk)->conn_list), node) | ||
251 | |||
252 | extern struct genl_family mptcp_genl_family; | ||
253 | |||
254 | |||
255 | -- | ||
256 | 2.47.1 |
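The new scan strategy — resume from the last processed subflow, stop after one full lap or once the receive budget is spent — can be modelled with a plain array. The sketch below is illustrative only (first_ready_from(), pending[] and the budget numbers are made up); the real code walks msk->conn_list via the new mptcp_next_subflow() helper:

```c
/* Illustrative sketch: scan a ring of "subflows" starting from the last
 * one processed, stop after one full lap or once the receive memory
 * budget is exhausted, mirroring the reworked __mptcp_move_skbs(). */
#include <stdbool.h>
#include <stdio.h>

#define NR_SUBFLOWS 4

static int next_idx(int i)
{
    return (i + 1) % NR_SUBFLOWS;   /* circular successor */
}

/* First ready subflow at or after 'start', or -1 after one full lap. */
static int first_ready_from(const bool *ready, int start)
{
    int i = start;

    while (!ready[i]) {
        i = next_idx(i);
        if (i == start)
            return -1;
    }
    return i;
}

int main(void)
{
    bool ready[NR_SUBFLOWS] = { false, true, false, true };
    int pending[NR_SUBFLOWS] = { 0, 700, 0, 900 };
    int rmem = 0, rcvbuf = 1000;
    int i = 0;  /* resume point: the "last processed" subflow */

    for (;;) {
        if (rmem > rcvbuf)  /* over budget: stop before touching any subflow */
            break;
        i = first_ready_from(ready, i);
        if (i < 0)          /* a full lap found nothing ready */
            break;
        rmem += pending[i]; /* "move" this subflow's queued data */
        pending[i] = 0;
        ready[i] = false;
        printf("moved subflow %d, rmem now %d\n", i, rmem);
        i = next_idx(i);
    }
    return 0;
}
```

Starting from the resume point instead of the list head avoids re-checking subflows that were just drained, and testing the memory budget before the lookup avoids taking any subflow lock when the msk is already over its rcvbuf, which is exactly the re-ordering the commit message describes.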