The whole mptcp receive path is protected by the msk socket
spinlock. As a consequence, the tx path has to play a few tricks to allocate
the forward memory without acquiring the spinlock multiple times, making
the overall TX path quite complex.
This patch tries to clean up the tx path a bit, using completely
separated fwd memory allocation for the rx and the tx paths.
The forward memory allocated in the rx path is now accounted in
msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
To cope with the above we provide a few MPTCP-specific variants of
the helpers to charge, uncharge, reclaim and free the forward
memory in the receive path.
msk->sk_forward_alloc now accounts only the forward memory for the tx path,
so we can use the plain core sock helpers to manipulate it and drop quite a bit
of complexity.
On memory pressure, reclaim both rx and tx fwd memory.
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
Note: this would additionally require a core change to properly fetch
the forward allocated memory in inet_sk_diag_fill(). I think a new
'struct proto' ops would do:
struct proto {
	// ...
	int (*forward_alloc)(const struct sock *sk);
	// ...
};

static inline int sk_forward_alloc(const struct sock *sk)
{
	if (!sk->sk_prot->forward_alloc)
		return sk->sk_forward_alloc;
	return sk->sk_prot->forward_alloc(sk);
}
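On the MPTCP side the new ops could then be as simple as summing the two
accounts. A minimal sketch on top of the fields added by this patch (the
hook name is only a proposal at this stage, and would be wired up via the
new .forward_alloc member of mptcp_prot):

static int mptcp_forward_alloc_get(const struct sock *sk)
{
	/* tx fwd memory stays in sk_forward_alloc, rx fwd memory moves
	 * to the new msk->rmem_fwd_alloc field
	 */
	return sk->sk_forward_alloc + mptcp_sk(sk)->rmem_fwd_alloc;
}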
---
net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
net/mptcp/protocol.h | 15 +--
2 files changed, 90 insertions(+), 144 deletions(-)
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 21716392e754..1d26462a1daf 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
__kfree_skb(skb);
}
+static void mptcp_rmem_charge(struct sock *sk, int size)
+{
+ mptcp_sk(sk)->rmem_fwd_alloc -= size;
+}
+
static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
struct sk_buff *from)
{
@@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
kfree_skb_partial(from, fragstolen);
atomic_add(delta, &sk->sk_rmem_alloc);
- sk_mem_charge(sk, delta);
+ mptcp_rmem_charge(sk, delta);
return true;
}
@@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
return mptcp_try_coalesce((struct sock *)msk, to, from);
}
+void __mptcp_rmem_reclaim(struct sock *sk, int amount)
+{
+ amount >>= SK_MEM_QUANTUM_SHIFT;
+ mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ __sk_mem_reduce_allocated(sk, amount);
+}
+
+static void mptcp_rmem_uncharge(struct sock *sk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int reclaimable;
+
+ msk->rmem_fwd_alloc += size;
+ reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
+
+ /* Avoid a possible overflow.
+ * TCP send queues can make this happen, if sk_mem_reclaim()
+ * is not called and more than 2 GBytes are released at once.
+ *
+ * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
+ * no need to hold that much forward allocation anyway.
+ */
+ if (unlikely(reclaimable >= 1 << 21))
+ __mptcp_rmem_reclaim(sk, (1 << 20));
+}
+
+static void mptcp_rfree(struct sk_buff *skb)
+{
+ unsigned int len = skb->truesize;
+ struct sock *sk = skb->sk;
+
+ atomic_sub(len, &sk->sk_rmem_alloc);
+ mptcp_rmem_uncharge(sk, len);
+}
+
+static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
+{
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = mptcp_rfree;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ mptcp_rmem_charge(sk, skb->truesize);
+}
+
/* "inspired" by tcp_data_queue_ofo(), main differences:
* - use mptcp seqs
* - don't cope with sacks
@@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
end:
skb_condense(skb);
- skb_set_owner_r(skb, sk);
+ mptcp_set_owner_r(skb, sk);
+}
+
+static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+ int amt, amount;
+
+ if (size < msk->rmem_fwd_alloc)
+ return true;
+
+ amt = sk_mem_pages(size);
+ amount = amt << SK_MEM_QUANTUM_SHIFT;
+ msk->rmem_fwd_alloc += amount;
+ if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
+ if (ssk->sk_forward_alloc < amount) {
+ msk->rmem_fwd_alloc -= amount;
+ return false;
+ }
+
+ ssk->sk_forward_alloc -= amount;
+ }
+ return true;
}
static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
skb_orphan(skb);
/* try to fetch required memory from subflow */
- if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
- int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
-
- if (ssk->sk_forward_alloc < amount)
- goto drop;
-
- ssk->sk_forward_alloc -= amount;
- sk->sk_forward_alloc += amount;
- }
+ if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
+ goto drop;
has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
@@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
if (tail && mptcp_try_coalesce(sk, tail, skb))
return true;
- skb_set_owner_r(skb, sk);
+ mptcp_set_owner_r(skb, sk);
__skb_queue_tail(&sk->sk_receive_queue, skb);
return true;
} else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
@@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
df->data_seq + df->data_len == msk->write_seq;
}
-static int mptcp_wmem_with_overhead(int size)
-{
- return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
-}
-
-static void __mptcp_wmem_reserve(struct sock *sk, int size)
-{
- int amount = mptcp_wmem_with_overhead(size);
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- WARN_ON_ONCE(msk->wmem_reserved);
- if (WARN_ON_ONCE(amount < 0))
- amount = 0;
-
- if (amount <= sk->sk_forward_alloc)
- goto reserve;
-
- /* under memory pressure try to reserve at most a single page
- * otherwise try to reserve the full estimate and fallback
- * to a single page before entering the error path
- */
- if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
- !sk_wmem_schedule(sk, amount)) {
- if (amount <= PAGE_SIZE)
- goto nomem;
-
- amount = PAGE_SIZE;
- if (!sk_wmem_schedule(sk, amount))
- goto nomem;
- }
-
-reserve:
- msk->wmem_reserved = amount;
- sk->sk_forward_alloc -= amount;
- return;
-
-nomem:
- /* we will wait for memory on next allocation */
- msk->wmem_reserved = -1;
-}
-
-static void __mptcp_update_wmem(struct sock *sk)
+static void __mptcp_mem_reclaim_partial(struct sock *sk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
+ int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
lockdep_assert_held_once(&sk->sk_lock.slock);
- if (!msk->wmem_reserved)
- return;
-
- if (msk->wmem_reserved < 0)
- msk->wmem_reserved = 0;
- if (msk->wmem_reserved > 0) {
- sk->sk_forward_alloc += msk->wmem_reserved;
- msk->wmem_reserved = 0;
- }
-}
-
-static bool mptcp_wmem_alloc(struct sock *sk, int size)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- /* check for pre-existing error condition */
- if (msk->wmem_reserved < 0)
- return false;
-
- if (msk->wmem_reserved >= size)
- goto account;
-
- mptcp_data_lock(sk);
- if (!sk_wmem_schedule(sk, size)) {
- mptcp_data_unlock(sk);
- return false;
- }
-
- sk->sk_forward_alloc -= size;
- msk->wmem_reserved += size;
- mptcp_data_unlock(sk);
-
-account:
- msk->wmem_reserved -= size;
- return true;
-}
-
-static void mptcp_wmem_uncharge(struct sock *sk, int size)
-{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- if (msk->wmem_reserved < 0)
- msk->wmem_reserved = 0;
- msk->wmem_reserved += size;
-}
-
-static void __mptcp_mem_reclaim_partial(struct sock *sk)
-{
- lockdep_assert_held_once(&sk->sk_lock.slock);
- __mptcp_update_wmem(sk);
+ __mptcp_rmem_reclaim(sk, reclaimable - 1);
sk_mem_reclaim_partial(sk);
}
static void mptcp_mem_reclaim_partial(struct sock *sk)
{
- struct mptcp_sock *msk = mptcp_sk(sk);
-
- /* if we are experiencing a transint allocation error,
- * the forward allocation memory has been already
- * released
- */
- if (msk->wmem_reserved < 0)
- return;
-
mptcp_data_lock(sk);
- sk->sk_forward_alloc += msk->wmem_reserved;
- sk_mem_reclaim_partial(sk);
- msk->wmem_reserved = sk->sk_forward_alloc;
- sk->sk_forward_alloc = 0;
+ __mptcp_mem_reclaim_partial(sk);
mptcp_data_unlock(sk);
}
@@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
/* __mptcp_alloc_tx_skb could have released some wmem and we are
* not going to flush it via release_sock()
*/
- __mptcp_update_wmem(sk);
if (copied) {
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
info.size_goal);
@@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
/* silently ignore everything else */
msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
- mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
+ lock_sock(sk);
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
@@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
psize = min_t(size_t, psize, msg_data_left(msg));
total_ts = psize + frag_truesize;
- if (!mptcp_wmem_alloc(sk, total_ts))
+ if (!sk_wmem_schedule(sk, total_ts))
goto wait_for_memory;
if (copy_page_from_iter(dfrag->page, offset, psize,
&msg->msg_iter) != psize) {
- mptcp_wmem_uncharge(sk, psize + frag_truesize);
ret = -EFAULT;
goto out;
}
/* data successfully copied into the write queue */
+ sk->sk_forward_alloc -= total_ts;
copied += psize;
dfrag->data_len += psize;
frag_truesize += psize;
@@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
return;
atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
- sk_mem_uncharge(sk, msk->rmem_released);
+ mptcp_rmem_uncharge(sk, msk->rmem_released);
WRITE_ONCE(msk->rmem_released, 0);
}
@@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
if (unlikely(flags & MSG_ERRQUEUE))
return inet_recv_error(sk, msg, len, addr_len);
- mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
+ lock_sock(sk);
if (unlikely(sk->sk_state == TCP_LISTEN)) {
copied = -ENOTCONN;
goto out_err;
@@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
__skb_queue_head_init(&msk->receive_queue);
msk->out_of_order_queue = RB_ROOT;
msk->first_pending = NULL;
- msk->wmem_reserved = 0;
WRITE_ONCE(msk->rmem_released, 0);
msk->timer_ival = TCP_RTO_MIN;
@@ -2719,7 +2679,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
sk->sk_prot->destroy(sk);
- WARN_ON_ONCE(msk->wmem_reserved);
+ WARN_ON_ONCE(msk->rmem_fwd_alloc);
WARN_ON_ONCE(msk->rmem_released);
sk_stream_kill_queues(sk);
xfrm_sk_free_policy(sk);
@@ -2954,6 +2914,9 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
/* move to sk_receive_queue, sk_stream_kill_queues will purge it */
skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
+ __skb_queue_purge(&sk->sk_receive_queue);
+ sk->sk_forward_alloc += msk->rmem_fwd_alloc;
+ msk->rmem_fwd_alloc = 0;
skb_rbtree_purge(&msk->out_of_order_queue);
mptcp_token_destroy(msk);
@@ -3037,10 +3000,6 @@ static void mptcp_release_cb(struct sock *sk)
if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
__mptcp_error_report(sk);
- /* push_pending may touch wmem_reserved, ensure we do the cleanup
- * later
- */
- __mptcp_update_wmem(sk);
__mptcp_update_rmem(sk);
}
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 7379ab580a7e..cfb374634a83 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -227,7 +227,7 @@ struct mptcp_sock {
u64 ack_seq;
u64 rcv_wnd_sent;
u64 rcv_data_fin_seq;
- int wmem_reserved;
+ int rmem_fwd_alloc;
struct sock *last_snd;
int snd_burst;
int old_wspace;
@@ -272,19 +272,6 @@ struct mptcp_sock {
char ca_name[TCP_CA_NAME_MAX];
};
-#define mptcp_lock_sock(___sk, cb) do { \
- struct sock *__sk = (___sk); /* silence macro reuse warning */ \
- might_sleep(); \
- spin_lock_bh(&__sk->sk_lock.slock); \
- if (__sk->sk_lock.owned) \
- __lock_sock(__sk); \
- cb; \
- __sk->sk_lock.owned = 1; \
- spin_unlock(&__sk->sk_lock.slock); \
- mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
- local_bh_enable(); \
-} while (0)
-
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
#define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
--
2.26.3
On Tue, 5 Oct 2021, Paolo Abeni wrote:
> All the mptcp receive path is protected by the msk socket
> spinlock. As consequence, the tx path has to play a few to allocate
> the forward memory without acquiring the spinlock multile times, making
> the overall TX path quite complex.
>
> This patch tries to clean-up a bit the tx path, using completely
> separated fwd memory allocation, for the rx and the tx path.
>
Overall, it does seem like a cleaner approach than the existing
wmem_reserved code. Do you think the existing code is related to any
current bugs or performance issues?
> The forward memory allocated in the rx path is now accounted in
> msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
>
> To cope with the above we provied a few MPTCP-specific variant for
> the helpers to charge, uncharge, reclaim and free the forward
> memory in the receive path.
>
> msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> we can the plain core sock helper to manipulate it and drop quite a bit
> of complexity.
>
> On memory pressure reclaim both rx and tx fwd memory.
>
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> ---
> Note: this would additionally require a core change to properly fetch
> the forward allocated memory in inet_sk_diag_fill(). I think a new
> 'struct proto' ops would do:
>
> struct proto {
> // ...
> int (*forward_alloc)(const struct sock *sk);
> // ...
> };
>
> static inline sk_forward_alloc(const struct sock *sk)
> {
> if (!sk->sk_prot.forward_alloc)
> return sk->sk_forward_alloc;
> return sk->sk_prot.forward_alloc(sk);
> }
Seems ok to me to handle the diag info this way, hope upstream reviewers
agree!
> ---
> net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
> net/mptcp/protocol.h | 15 +--
> 2 files changed, 90 insertions(+), 144 deletions(-)
>
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 21716392e754..1d26462a1daf 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
> @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
> __kfree_skb(skb);
> }
>
> +static void mptcp_rmem_charge(struct sock *sk, int size)
> +{
> + mptcp_sk(sk)->rmem_fwd_alloc -= size;
> +}
> +
> static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> struct sk_buff *from)
> {
> @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
> kfree_skb_partial(from, fragstolen);
> atomic_add(delta, &sk->sk_rmem_alloc);
> - sk_mem_charge(sk, delta);
> + mptcp_rmem_charge(sk, delta);
> return true;
> }
>
> @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
> return mptcp_try_coalesce((struct sock *)msk, to, from);
> }
>
> +void __mptcp_rmem_reclaim(struct sock *sk, int amount)
> +{
> + amount >>= SK_MEM_QUANTUM_SHIFT;
> + mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
> + __sk_mem_reduce_allocated(sk, amount);
> +}
> +
> +static void mptcp_rmem_uncharge(struct sock *sk, int size)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + int reclaimable;
> +
> + msk->rmem_fwd_alloc += size;
> + reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
I was going to ask if this patch takes into account the recent
SO_RESERVE_MEM patchset's changes to sk_forward_alloc handling, but these
changes mirror those in sk_mem_reclaim(), so it seems the answer is "yes".
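For reference, the core pattern being mirrored is roughly the following
(a paraphrased sketch of the post-SO_RESERVE_MEM sk_mem_uncharge(), not
the exact sock.h code):

static inline void sk_mem_uncharge(struct sock *sk, int size)
{
	int reclaimable;

	sk->sk_forward_alloc += size;
	reclaimable = sk->sk_forward_alloc - sk_unused_reserved_mem(sk);

	/* reclaim early so the forward allocation cannot overflow */
	if (unlikely(reclaimable >= 1 << 21))
		__sk_mem_reclaim(sk, 1 << 20);
}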
> +
> + /* Avoid a possible overflow.
> + * TCP send queues can make this happen, if sk_mem_reclaim()
> + * is not called and more than 2 GBytes are released at once.
> + *
> + * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
> + * no need to hold that much forward allocation anyway.
> + */
Minor indent issue in the above comment.
> + if (unlikely(reclaimable >= 1 << 21))
> + __mptcp_rmem_reclaim(sk, (1 << 20));
I realize these "1 << 21" and "1 << 20" values are copied from
sk_mem_uncharge(). Should they be changed in both files to symbolic values
defined in sock.h, to help keep the values matched if someone decides to
adjust them in the future?
> +}
> +
> +static void mptcp_rfree(struct sk_buff *skb)
> +{
> + unsigned int len = skb->truesize;
> + struct sock *sk = skb->sk;
> +
> + atomic_sub(len, &sk->sk_rmem_alloc);
> + mptcp_rmem_uncharge(sk, len);
> +}
> +
> +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
> +{
> + skb_orphan(skb);
> + skb->sk = sk;
> + skb->destructor = mptcp_rfree;
> + atomic_add(skb->truesize, &sk->sk_rmem_alloc);
> + mptcp_rmem_charge(sk, skb->truesize);
> +}
> +
> /* "inspired" by tcp_data_queue_ofo(), main differences:
> * - use mptcp seqs
> * - don't cope with sacks
> @@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
>
> end:
> skb_condense(skb);
> - skb_set_owner_r(skb, sk);
> + mptcp_set_owner_r(skb, sk);
> +}
> +
> +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
> +{
> + struct mptcp_sock *msk = mptcp_sk(sk);
> + int amt, amount;
> +
> + if (size < msk->rmem_fwd_alloc)
> + return true;;
> +
> + amt = sk_mem_pages(size);
> + amount = amt << SK_MEM_QUANTUM_SHIFT;
> + msk->rmem_fwd_alloc += amount;
> + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
> + if (ssk->sk_forward_alloc < amount) {
> + msk->rmem_fwd_alloc -= amount;
> + return false;
> + }
> +
> + ssk->sk_forward_alloc -= amount;
> + }
> + return true;
> }
>
> static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> @@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> skb_orphan(skb);
>
> /* try to fetch required memory from subflow */
> - if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> - int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
> -
> - if (ssk->sk_forward_alloc < amount)
> - goto drop;
> -
> - ssk->sk_forward_alloc -= amount;
> - sk->sk_forward_alloc += amount;
> - }
> + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
> + goto drop;
>
> has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
>
> @@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> if (tail && mptcp_try_coalesce(sk, tail, skb))
> return true;
>
> - skb_set_owner_r(skb, sk);
> + mptcp_set_owner_r(skb, sk);
> __skb_queue_tail(&sk->sk_receive_queue, skb);
> return true;
> } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
> @@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
> df->data_seq + df->data_len == msk->write_seq;
> }
>
> -static int mptcp_wmem_with_overhead(int size)
> -{
> - return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
> -}
> -
> -static void __mptcp_wmem_reserve(struct sock *sk, int size)
> -{
> - int amount = mptcp_wmem_with_overhead(size);
> - struct mptcp_sock *msk = mptcp_sk(sk);
> -
> - WARN_ON_ONCE(msk->wmem_reserved);
> - if (WARN_ON_ONCE(amount < 0))
> - amount = 0;
> -
> - if (amount <= sk->sk_forward_alloc)
> - goto reserve;
> -
> - /* under memory pressure try to reserve at most a single page
> - * otherwise try to reserve the full estimate and fallback
> - * to a single page before entering the error path
> - */
> - if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
> - !sk_wmem_schedule(sk, amount)) {
> - if (amount <= PAGE_SIZE)
> - goto nomem;
> -
> - amount = PAGE_SIZE;
> - if (!sk_wmem_schedule(sk, amount))
> - goto nomem;
> - }
> -
> -reserve:
> - msk->wmem_reserved = amount;
> - sk->sk_forward_alloc -= amount;
> - return;
> -
> -nomem:
> - /* we will wait for memory on next allocation */
> - msk->wmem_reserved = -1;
> -}
> -
> -static void __mptcp_update_wmem(struct sock *sk)
> +static void __mptcp_mem_reclaim_partial(struct sock *sk)
> {
> - struct mptcp_sock *msk = mptcp_sk(sk);
> + int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
>
> lockdep_assert_held_once(&sk->sk_lock.slock);
>
> - if (!msk->wmem_reserved)
> - return;
> -
> - if (msk->wmem_reserved < 0)
> - msk->wmem_reserved = 0;
> - if (msk->wmem_reserved > 0) {
> - sk->sk_forward_alloc += msk->wmem_reserved;
> - msk->wmem_reserved = 0;
> - }
> -}
> -
> -static bool mptcp_wmem_alloc(struct sock *sk, int size)
> -{
> - struct mptcp_sock *msk = mptcp_sk(sk);
> -
> - /* check for pre-existing error condition */
> - if (msk->wmem_reserved < 0)
> - return false;
> -
> - if (msk->wmem_reserved >= size)
> - goto account;
> -
> - mptcp_data_lock(sk);
> - if (!sk_wmem_schedule(sk, size)) {
> - mptcp_data_unlock(sk);
> - return false;
> - }
> -
> - sk->sk_forward_alloc -= size;
> - msk->wmem_reserved += size;
> - mptcp_data_unlock(sk);
> -
> -account:
> - msk->wmem_reserved -= size;
> - return true;
> -}
> -
> -static void mptcp_wmem_uncharge(struct sock *sk, int size)
> -{
> - struct mptcp_sock *msk = mptcp_sk(sk);
> -
> - if (msk->wmem_reserved < 0)
> - msk->wmem_reserved = 0;
> - msk->wmem_reserved += size;
> -}
> -
> -static void __mptcp_mem_reclaim_partial(struct sock *sk)
> -{
> - lockdep_assert_held_once(&sk->sk_lock.slock);
> - __mptcp_update_wmem(sk);
> + __mptcp_rmem_reclaim(sk, reclaimable - 1);
> sk_mem_reclaim_partial(sk);
> }
>
> static void mptcp_mem_reclaim_partial(struct sock *sk)
> {
> - struct mptcp_sock *msk = mptcp_sk(sk);
> -
> - /* if we are experiencing a transint allocation error,
> - * the forward allocation memory has been already
> - * released
> - */
> - if (msk->wmem_reserved < 0)
> - return;
> -
> mptcp_data_lock(sk);
> - sk->sk_forward_alloc += msk->wmem_reserved;
> - sk_mem_reclaim_partial(sk);
> - msk->wmem_reserved = sk->sk_forward_alloc;
> - sk->sk_forward_alloc = 0;
> + __mptcp_mem_reclaim_partial(sk);
> mptcp_data_unlock(sk);
> }
>
> @@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> /* __mptcp_alloc_tx_skb could have released some wmem and we are
> * not going to flush it via release_sock()
> */
> - __mptcp_update_wmem(sk);
> if (copied) {
> tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
> info.size_goal);
> @@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> /* silently ignore everything else */
> msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
>
> - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
> + lock_sock(sk);
>
> timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
>
> @@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> psize = min_t(size_t, psize, msg_data_left(msg));
> total_ts = psize + frag_truesize;
>
> - if (!mptcp_wmem_alloc(sk, total_ts))
> + if (!sk_wmem_schedule(sk, total_ts))
> goto wait_for_memory;
>
> if (copy_page_from_iter(dfrag->page, offset, psize,
> &msg->msg_iter) != psize) {
> - mptcp_wmem_uncharge(sk, psize + frag_truesize);
> ret = -EFAULT;
> goto out;
> }
>
> /* data successfully copied into the write queue */
> + sk->sk_forward_alloc -= total_ts;
> copied += psize;
> dfrag->data_len += psize;
> frag_truesize += psize;
> @@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
> return;
>
> atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
> - sk_mem_uncharge(sk, msk->rmem_released);
> + mptcp_rmem_uncharge(sk, msk->rmem_released);
> WRITE_ONCE(msk->rmem_released, 0);
> }
>
> @@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> if (unlikely(flags & MSG_ERRQUEUE))
> return inet_recv_error(sk, msg, len, addr_len);
>
> - mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
> + lock_sock(sk);
> if (unlikely(sk->sk_state == TCP_LISTEN)) {
> copied = -ENOTCONN;
> goto out_err;
> @@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
> __skb_queue_head_init(&msk->receive_queue);
> msk->out_of_order_queue = RB_ROOT;
> msk->first_pending = NULL;
> - msk->wmem_reserved = 0;
msk->rmem_fwd_alloc init needs to be added, right?
> WRITE_ONCE(msk->rmem_released, 0);
> msk->timer_ival = TCP_RTO_MIN;
>
> @@ -2719,7 +2679,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
>
> sk->sk_prot->destroy(sk);
>
> - WARN_ON_ONCE(msk->wmem_reserved);
> + WARN_ON_ONCE(msk->rmem_fwd_alloc);
> WARN_ON_ONCE(msk->rmem_released);
> sk_stream_kill_queues(sk);
> xfrm_sk_free_policy(sk);
> @@ -2954,6 +2914,9 @@ void mptcp_destroy_common(struct mptcp_sock *msk)
>
> /* move to sk_receive_queue, sk_stream_kill_queues will purge it */
> skb_queue_splice_tail_init(&msk->receive_queue, &sk->sk_receive_queue);
> + __skb_queue_purge(&sk->sk_receive_queue);
> + sk->sk_forward_alloc += msk->rmem_fwd_alloc;
> + msk->rmem_fwd_alloc = 0;
>
> skb_rbtree_purge(&msk->out_of_order_queue);
> mptcp_token_destroy(msk);
> @@ -3037,10 +3000,6 @@ static void mptcp_release_cb(struct sock *sk)
> if (test_and_clear_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->flags))
> __mptcp_error_report(sk);
>
> - /* push_pending may touch wmem_reserved, ensure we do the cleanup
> - * later
> - */
> - __mptcp_update_wmem(sk);
> __mptcp_update_rmem(sk);
> }
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 7379ab580a7e..cfb374634a83 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -227,7 +227,7 @@ struct mptcp_sock {
> u64 ack_seq;
> u64 rcv_wnd_sent;
> u64 rcv_data_fin_seq;
> - int wmem_reserved;
> + int rmem_fwd_alloc;
> struct sock *last_snd;
> int snd_burst;
> int old_wspace;
> @@ -272,19 +272,6 @@ struct mptcp_sock {
> char ca_name[TCP_CA_NAME_MAX];
> };
>
> -#define mptcp_lock_sock(___sk, cb) do { \
> - struct sock *__sk = (___sk); /* silence macro reuse warning */ \
> - might_sleep(); \
> - spin_lock_bh(&__sk->sk_lock.slock); \
> - if (__sk->sk_lock.owned) \
> - __lock_sock(__sk); \
> - cb; \
> - __sk->sk_lock.owned = 1; \
> - spin_unlock(&__sk->sk_lock.slock); \
> - mutex_acquire(&__sk->sk_lock.dep_map, 0, 0, _RET_IP_); \
> - local_bh_enable(); \
> -} while (0)
> -
> #define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
> #define mptcp_data_unlock(sk) spin_unlock_bh(&(sk)->sk_lock.slock)
>
--
Mat Martineau
Intel
On Tue, 2021-10-05 at 16:07 -0700, Mat Martineau wrote:
> On Tue, 5 Oct 2021, Paolo Abeni wrote:
>
> > All the mptcp receive path is protected by the msk socket
> > spinlock. As consequence, the tx path has to play a few to allocate
> > the forward memory without acquiring the spinlock multile times, making
> > the overall TX path quite complex.
> >
> > This patch tries to clean-up a bit the tx path, using completely
> > separated fwd memory allocation, for the rx and the tx path.
> >
>
> Overall, it does seem like a cleaner approach than the existing
> wmem_reserved code. Do you think the existing code is related to any
> current bugs or performance issues?
None that I know of, so far. But I fear (I'm almost sure, actually) that the
current wmem_reserved is prone to problems with memory pressure and
could perform badly in some scenarios. Overall, 'wmem_reserved'
looks like the ugliest spot we currently have in the mptcp
implementation.
I "feel" that an approach like this one should be more robust in the
long term.
> > The forward memory allocated in the rx path is now accounted in
> > msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
> >
> > To cope with the above we provied a few MPTCP-specific variant for
> > the helpers to charge, uncharge, reclaim and free the forward
> > memory in the receive path.
> >
> > msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> > we can the plain core sock helper to manipulate it and drop quite a bit
> > of complexity.
> >
> > On memory pressure reclaim both rx and tx fwd memory.
> >
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> > ---
> > Note: this would additionally require a core change to properly fetch
> > the forward allocated memory in inet_sk_diag_fill(). I think a new
> > 'struct proto' ops would do:
> >
> > struct proto {
> > // ...
> > int (*forward_alloc)(const struct sock *sk);
> > // ...
> > };
> >
> > static inline sk_forward_alloc(const struct sock *sk)
> > {
> > if (!sk->sk_prot.forward_alloc)
> > return sk->sk_forward_alloc;
> > return sk->sk_prot.forward_alloc(sk);
> > }
>
> Seems ok to me to handle the diag info this way, hope upstream reviewers
> agree!
:)
> > ---
> > net/mptcp/protocol.c | 219 ++++++++++++++++++-------------------------
> > net/mptcp/protocol.h | 15 +--
> > 2 files changed, 90 insertions(+), 144 deletions(-)
> >
> > diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> > index 21716392e754..1d26462a1daf 100644
> > --- a/net/mptcp/protocol.c
> > +++ b/net/mptcp/protocol.c
> > @@ -126,6 +126,11 @@ static void mptcp_drop(struct sock *sk, struct sk_buff *skb)
> > __kfree_skb(skb);
> > }
> >
> > +static void mptcp_rmem_charge(struct sock *sk, int size)
> > +{
> > + mptcp_sk(sk)->rmem_fwd_alloc -= size;
> > +}
> > +
> > static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> > struct sk_buff *from)
> > {
> > @@ -142,7 +147,7 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to,
> > MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq;
> > kfree_skb_partial(from, fragstolen);
> > atomic_add(delta, &sk->sk_rmem_alloc);
> > - sk_mem_charge(sk, delta);
> > + mptcp_rmem_charge(sk, delta);
> > return true;
> > }
> >
> > @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
> > return mptcp_try_coalesce((struct sock *)msk, to, from);
> > }
> >
> > +void __mptcp_rmem_reclaim(struct sock *sk, int amount)
> > +{
> > + amount >>= SK_MEM_QUANTUM_SHIFT;
> > + mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
> > + __sk_mem_reduce_allocated(sk, amount);
> > +}
> > +
> > +static void mptcp_rmem_uncharge(struct sock *sk, int size)
> > +{
> > + struct mptcp_sock *msk = mptcp_sk(sk);
> > + int reclaimable;
> > +
> > + msk->rmem_fwd_alloc += size;
> > + reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
>
> I was going to ask if this patch takes in to account the recent
> SO_RESERVE_MEM patchset's changes to sk_forward_alloc handling, but these
> changes mirror those in sk_mem_reclaim() so it seems the answer is "yes".
Indeed!
> > +
> > + /* Avoid a possible overflow.
> > + * TCP send queues can make this happen, if sk_mem_reclaim()
> > + * is not called and more than 2 GBytes are released at once.
> > + *
> > + * If we reach 2 MBytes, reclaim 1 MBytes right now, there is
> > + * no need to hold that much forward allocation anyway.
> > + */
>
> Minor indent issue in the above comment.
>
> > + if (unlikely(reclaimable >= 1 << 21))
> > + __mptcp_rmem_reclaim(sk, (1 << 20));
>
> I realize these "1 << 21" and "1 << 20" values are copied from
> sk_mem_uncharge(). Should they be changed in both files to symbolic values
> defined in sock.h, to help keep the values matched if someone decides to
> adjust them in the future?
I'll define some macros for that.
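Something along these lines, I guess (the names are just placeholders
until the sock.h side is sorted out):

#define SK_RECLAIM_THRESHOLD	(1 << 21)
#define SK_RECLAIM_CHUNK	(1 << 20)

static void mptcp_rmem_uncharge(struct sock *sk, int size)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	int reclaimable;

	msk->rmem_fwd_alloc += size;
	reclaimable = msk->rmem_fwd_alloc - sk_unused_reserved_mem(sk);

	/* same threshold as sk_mem_uncharge(), now spelled via macros */
	if (unlikely(reclaimable >= SK_RECLAIM_THRESHOLD))
		__mptcp_rmem_reclaim(sk, SK_RECLAIM_CHUNK);
}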
> > +}
> > +
> > +static void mptcp_rfree(struct sk_buff *skb)
> > +{
> > + unsigned int len = skb->truesize;
> > + struct sock *sk = skb->sk;
> > +
> > + atomic_sub(len, &sk->sk_rmem_alloc);
> > + mptcp_rmem_uncharge(sk, len);
> > +}
> > +
> > +static void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk)
> > +{
> > + skb_orphan(skb);
> > + skb->sk = sk;
> > + skb->destructor = mptcp_rfree;
> > + atomic_add(skb->truesize, &sk->sk_rmem_alloc);
> > + mptcp_rmem_charge(sk, skb->truesize);
> > +}
> > +
> > /* "inspired" by tcp_data_queue_ofo(), main differences:
> > * - use mptcp seqs
> > * - don't cope with sacks
> > @@ -267,7 +316,29 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
> >
> > end:
> > skb_condense(skb);
> > - skb_set_owner_r(skb, sk);
> > + mptcp_set_owner_r(skb, sk);
> > +}
> > +
> > +static bool mptcp_rmem_schedule(struct sock *sk, struct sock *ssk, int size)
> > +{
> > + struct mptcp_sock *msk = mptcp_sk(sk);
> > + int amt, amount;
> > +
> > + if (size < msk->rmem_fwd_alloc)
> > + return true;;
> > +
> > + amt = sk_mem_pages(size);
> > + amount = amt << SK_MEM_QUANTUM_SHIFT;
> > + msk->rmem_fwd_alloc += amount;
> > + if (!__sk_mem_raise_allocated(sk, size, amt, SK_MEM_RECV)) {
> > + if (ssk->sk_forward_alloc < amount) {
> > + msk->rmem_fwd_alloc -= amount;
> > + return false;
> > + }
> > +
> > + ssk->sk_forward_alloc -= amount;
> > + }
> > + return true;
> > }
> >
> > static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > @@ -285,15 +356,8 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > skb_orphan(skb);
> >
> > /* try to fetch required memory from subflow */
> > - if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
> > - int amount = sk_mem_pages(skb->truesize) << SK_MEM_QUANTUM_SHIFT;
> > -
> > - if (ssk->sk_forward_alloc < amount)
> > - goto drop;
> > -
> > - ssk->sk_forward_alloc -= amount;
> > - sk->sk_forward_alloc += amount;
> > - }
> > + if (!mptcp_rmem_schedule(sk, ssk, skb->truesize))
> > + goto drop;
> >
> > has_rxtstamp = TCP_SKB_CB(skb)->has_rxtstamp;
> >
> > @@ -313,7 +377,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
> > if (tail && mptcp_try_coalesce(sk, tail, skb))
> > return true;
> >
> > - skb_set_owner_r(skb, sk);
> > + mptcp_set_owner_r(skb, sk);
> > __skb_queue_tail(&sk->sk_receive_queue, skb);
> > return true;
> > } else if (after64(MPTCP_SKB_CB(skb)->map_seq, msk->ack_seq)) {
> > @@ -908,122 +972,20 @@ static bool mptcp_frag_can_collapse_to(const struct mptcp_sock *msk,
> > df->data_seq + df->data_len == msk->write_seq;
> > }
> >
> > -static int mptcp_wmem_with_overhead(int size)
> > -{
> > - return size + ((sizeof(struct mptcp_data_frag) * size) >> PAGE_SHIFT);
> > -}
> > -
> > -static void __mptcp_wmem_reserve(struct sock *sk, int size)
> > -{
> > - int amount = mptcp_wmem_with_overhead(size);
> > - struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > - WARN_ON_ONCE(msk->wmem_reserved);
> > - if (WARN_ON_ONCE(amount < 0))
> > - amount = 0;
> > -
> > - if (amount <= sk->sk_forward_alloc)
> > - goto reserve;
> > -
> > - /* under memory pressure try to reserve at most a single page
> > - * otherwise try to reserve the full estimate and fallback
> > - * to a single page before entering the error path
> > - */
> > - if ((tcp_under_memory_pressure(sk) && amount > PAGE_SIZE) ||
> > - !sk_wmem_schedule(sk, amount)) {
> > - if (amount <= PAGE_SIZE)
> > - goto nomem;
> > -
> > - amount = PAGE_SIZE;
> > - if (!sk_wmem_schedule(sk, amount))
> > - goto nomem;
> > - }
> > -
> > -reserve:
> > - msk->wmem_reserved = amount;
> > - sk->sk_forward_alloc -= amount;
> > - return;
> > -
> > -nomem:
> > - /* we will wait for memory on next allocation */
> > - msk->wmem_reserved = -1;
> > -}
> > -
> > -static void __mptcp_update_wmem(struct sock *sk)
> > +static void __mptcp_mem_reclaim_partial(struct sock *sk)
> > {
> > - struct mptcp_sock *msk = mptcp_sk(sk);
> > + int reclaimable = mptcp_sk(sk)->rmem_fwd_alloc - sk_unused_reserved_mem(sk);
> >
> > lockdep_assert_held_once(&sk->sk_lock.slock);
> >
> > - if (!msk->wmem_reserved)
> > - return;
> > -
> > - if (msk->wmem_reserved < 0)
> > - msk->wmem_reserved = 0;
> > - if (msk->wmem_reserved > 0) {
> > - sk->sk_forward_alloc += msk->wmem_reserved;
> > - msk->wmem_reserved = 0;
> > - }
> > -}
> > -
> > -static bool mptcp_wmem_alloc(struct sock *sk, int size)
> > -{
> > - struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > - /* check for pre-existing error condition */
> > - if (msk->wmem_reserved < 0)
> > - return false;
> > -
> > - if (msk->wmem_reserved >= size)
> > - goto account;
> > -
> > - mptcp_data_lock(sk);
> > - if (!sk_wmem_schedule(sk, size)) {
> > - mptcp_data_unlock(sk);
> > - return false;
> > - }
> > -
> > - sk->sk_forward_alloc -= size;
> > - msk->wmem_reserved += size;
> > - mptcp_data_unlock(sk);
> > -
> > -account:
> > - msk->wmem_reserved -= size;
> > - return true;
> > -}
> > -
> > -static void mptcp_wmem_uncharge(struct sock *sk, int size)
> > -{
> > - struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > - if (msk->wmem_reserved < 0)
> > - msk->wmem_reserved = 0;
> > - msk->wmem_reserved += size;
> > -}
> > -
> > -static void __mptcp_mem_reclaim_partial(struct sock *sk)
> > -{
> > - lockdep_assert_held_once(&sk->sk_lock.slock);
> > - __mptcp_update_wmem(sk);
> > + __mptcp_rmem_reclaim(sk, reclaimable - 1);
> > sk_mem_reclaim_partial(sk);
> > }
> >
> > static void mptcp_mem_reclaim_partial(struct sock *sk)
> > {
> > - struct mptcp_sock *msk = mptcp_sk(sk);
> > -
> > - /* if we are experiencing a transint allocation error,
> > - * the forward allocation memory has been already
> > - * released
> > - */
> > - if (msk->wmem_reserved < 0)
> > - return;
> > -
> > mptcp_data_lock(sk);
> > - sk->sk_forward_alloc += msk->wmem_reserved;
> > - sk_mem_reclaim_partial(sk);
> > - msk->wmem_reserved = sk->sk_forward_alloc;
> > - sk->sk_forward_alloc = 0;
> > + __mptcp_mem_reclaim_partial(sk);
> > mptcp_data_unlock(sk);
> > }
> >
> > @@ -1664,7 +1626,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk)
> > /* __mptcp_alloc_tx_skb could have released some wmem and we are
> > * not going to flush it via release_sock()
> > */
> > - __mptcp_update_wmem(sk);
> > if (copied) {
> > tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
> > info.size_goal);
> > @@ -1701,7 +1662,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > /* silently ignore everything else */
> > msg->msg_flags &= MSG_MORE | MSG_DONTWAIT | MSG_NOSIGNAL;
> >
> > - mptcp_lock_sock(sk, __mptcp_wmem_reserve(sk, min_t(size_t, 1 << 20, len)));
> > + lock_sock(sk);
> >
> > timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
> >
> > @@ -1749,17 +1710,17 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
> > psize = min_t(size_t, psize, msg_data_left(msg));
> > total_ts = psize + frag_truesize;
> >
> > - if (!mptcp_wmem_alloc(sk, total_ts))
> > + if (!sk_wmem_schedule(sk, total_ts))
> > goto wait_for_memory;
> >
> > if (copy_page_from_iter(dfrag->page, offset, psize,
> > &msg->msg_iter) != psize) {
> > - mptcp_wmem_uncharge(sk, psize + frag_truesize);
> > ret = -EFAULT;
> > goto out;
> > }
> >
> > /* data successfully copied into the write queue */
> > + sk->sk_forward_alloc -= total_ts;
> > copied += psize;
> > dfrag->data_len += psize;
> > frag_truesize += psize;
> > @@ -1956,7 +1917,7 @@ static void __mptcp_update_rmem(struct sock *sk)
> > return;
> >
> > atomic_sub(msk->rmem_released, &sk->sk_rmem_alloc);
> > - sk_mem_uncharge(sk, msk->rmem_released);
> > + mptcp_rmem_uncharge(sk, msk->rmem_released);
> > WRITE_ONCE(msk->rmem_released, 0);
> > }
> >
> > @@ -2024,7 +1985,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
> > if (unlikely(flags & MSG_ERRQUEUE))
> > return inet_recv_error(sk, msg, len, addr_len);
> >
> > - mptcp_lock_sock(sk, __mptcp_splice_receive_queue(sk));
> > + lock_sock(sk);
> > if (unlikely(sk->sk_state == TCP_LISTEN)) {
> > copied = -ENOTCONN;
> > goto out_err;
> > @@ -2504,7 +2465,6 @@ static int __mptcp_init_sock(struct sock *sk)
> > __skb_queue_head_init(&msk->receive_queue);
> > msk->out_of_order_queue = RB_ROOT;
> > msk->first_pending = NULL;
> > - msk->wmem_reserved = 0;
>
> msk->rmem_fwd_alloc init needs to be added, right?
Right. Not sure why the self-tests did not show any issues?!
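For completeness, the missing hunk would presumably just be the one-liner
below in __mptcp_init_sock(); most likely the self-tests got away with it
because the freshly allocated socket memory happens to be zeroed, but an
explicit init keeps it obvious:

	msk->first_pending = NULL;
+	msk->rmem_fwd_alloc = 0;
	WRITE_ONCE(msk->rmem_released, 0);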
Thanks!
Paolo
Hi Paolo,
On 05/10/2021 18:39, Paolo Abeni wrote:
> All the mptcp receive path is protected by the msk socket
> spinlock. As consequence, the tx path has to play a few to allocate
> the forward memory without acquiring the spinlock multile times, making
> the overall TX path quite complex.
>
> This patch tries to clean-up a bit the tx path, using completely
> separated fwd memory allocation, for the rx and the tx path.
>
> The forward memory allocated in the rx path is now accounted in
> msk->rmem_fwd_alloc and is (still) protected by the msk socket spinlock.
>
> To cope with the above we provied a few MPTCP-specific variant for
> the helpers to charge, uncharge, reclaim and free the forward
> memory in the receive path.
>
> msk->sk_forward_alloc now accounts only the forward memory for the tx path,
> we can the plain core sock helper to manipulate it and drop quite a bit
> of complexity.
>
> On memory pressure reclaim both rx and tx fwd memory.
Thank you for looking at that!
(...)
> diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
> index 21716392e754..1d26462a1daf 100644
> --- a/net/mptcp/protocol.c
> +++ b/net/mptcp/protocol.c
(...)
> @@ -155,6 +160,50 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
> return mptcp_try_coalesce((struct sock *)msk, to, from);
> }
>
> +void __mptcp_rmem_reclaim(struct sock *sk, int amount)
The public CI complained about this:
ERROR: Unable to compile mptcp source code with W=1 C=1:
net/mptcp/protocol.c:163:6: error: no previous prototype for
'__mptcp_rmem_reclaim' [-Werror=missing-prototypes]
163 | void __mptcp_rmem_reclaim(struct sock *sk, int amount)
| ^~~~~~~~~~~~~~~~~~~~
https://github.com/multipath-tcp/mptcp_net-next/runs/3868191624?check_suite_focus=true
As confirmed by Davide on IRC, marking the function as 'static' fixes
the error.
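i.e. the v2 respin would presumably just carry:

/* only used inside protocol.c, so keep it private */
static void __mptcp_rmem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	mptcp_sk(sk)->rmem_fwd_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
	__sk_mem_reduce_allocated(sk, amount);
}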
Cheers,
Matt
--
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net