[PATCH mptcp-next 3/4] mptcp: track some aggregate data counters.

Posted by Paolo Abeni 1 year, 4 months ago
Currently there are no data transfer counters accounting for all
the subflows used by a given MPTCP socket. User-space can compute
such figures by aggregating the per-subflow info, but that is
inaccurate if any subflow is closed before the MPTCP socket itself.

Add the new counters in the MPTCP socket itself and expose them
via the existing diag and sockopt interfaces. While touching
mptcp_diag_fill_info(), acquire the relevant locks before fetching
the msk data, to ensure better data consistency.

Signed-off-by: Paolo Abeni <pabeni@redhat.com>
---
 include/uapi/linux/mptcp.h |  5 +++++
 net/mptcp/options.c        | 10 ++++++++--
 net/mptcp/protocol.c       | 11 ++++++++++-
 net/mptcp/protocol.h       |  4 ++++
 net/mptcp/sockopt.c        | 22 +++++++++++++++++-----
 5 files changed, 44 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 49bfbf85cb80..868a77434ec5 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -123,6 +123,11 @@ struct mptcp_info {
 	__u8	mptcpi_local_addr_used;
 	__u8	mptcpi_local_addr_max;
 	__u8	mptcpi_csum_enabled;
+	__u32	mptcpi_retransmits;
+	__u64	mptcpi_bytes_retrans;
+	__u64	mptcpi_bytes_sent;
+	__u64	mptcpi_bytes_received;
+	__u64	mptcpi_bytes_acked;
 };
 
 /*
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index 4bdcd2b326bd..c254accb14de 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -1026,6 +1026,12 @@ u64 __mptcp_expand_seq(u64 old_seq, u64 cur_seq)
 	return cur_seq;
 }
 
+static void __mptcp_snd_una_update(struct mptcp_sock *msk, u64 new_snd_una)
+{
+	msk->bytes_acked += new_snd_una - msk->snd_una;
+	msk->snd_una = new_snd_una;
+}
+
 static void ack_update_msk(struct mptcp_sock *msk,
 			   struct sock *ssk,
 			   struct mptcp_options_received *mp_opt)
@@ -1057,7 +1063,7 @@ static void ack_update_msk(struct mptcp_sock *msk,
 		__mptcp_check_push(sk, ssk);
 
 	if (after64(new_snd_una, old_snd_una)) {
-		msk->snd_una = new_snd_una;
+		__mptcp_snd_una_update(msk, new_snd_una);
 		__mptcp_data_acked(sk);
 	}
 	mptcp_data_unlock(sk);
@@ -1123,7 +1129,7 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
 		/* on fallback we just need to ignore the msk-level snd_una, as
 		 * this is really plain TCP
 		 */
-		msk->snd_una = READ_ONCE(msk->snd_nxt);
+		__mptcp_snd_una_update(msk, READ_ONCE(msk->snd_nxt));
 
 		__mptcp_data_acked(subflow->conn);
 		mptcp_data_unlock(subflow->conn);
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 8d674aef2f1e..312bc8e32e93 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -378,6 +378,7 @@ static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
 
 	if (MPTCP_SKB_CB(skb)->map_seq == msk->ack_seq) {
 		/* in sequence */
+		msk->bytes_received += copy_len;
 		WRITE_ONCE(msk->ack_seq, msk->ack_seq + copy_len);
 		tail = skb_peek_tail(&sk->sk_receive_queue);
 		if (tail && mptcp_try_coalesce(sk, tail, skb))
@@ -761,6 +762,7 @@ static bool __mptcp_ofo_queue(struct mptcp_sock *msk)
 			MPTCP_SKB_CB(skb)->map_seq += delta;
 			__skb_queue_tail(&sk->sk_receive_queue, skb);
 		}
+		msk->bytes_received += end_seq - msk->ack_seq;
 		msk->ack_seq = end_seq;
 		moved = true;
 	}
@@ -1525,8 +1527,10 @@ static void mptcp_update_post_push(struct mptcp_sock *msk,
 	 * that has been handed to the subflow for transmission
 	 * and skip update in case it was old dfrag.
 	 */
-	if (likely(after64(snd_nxt_new, msk->snd_nxt)))
+	if (likely(after64(snd_nxt_new, msk->snd_nxt))) {
+		msk->bytes_sent += snd_nxt_new - msk->snd_nxt;
 		msk->snd_nxt = snd_nxt_new;
+	}
 }
 
 void mptcp_check_and_set_pending(struct sock *sk)
@@ -2585,6 +2589,7 @@ static void __mptcp_retrans(struct sock *sk)
 	}
 	if (copied) {
 		dfrag->already_sent = max(dfrag->already_sent, info.sent);
+		msk->bytes_retrans += copied;
 		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
 			 info.size_goal);
 		WRITE_ONCE(msk->allow_infinite_fallback, false);
@@ -3098,6 +3103,10 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
 	mptcp_pm_data_reset(msk);
 	mptcp_ca_reset(sk);
+	msk->bytes_acked = 0;
+	msk->bytes_received = 0;
+	msk->bytes_sent = 0;
+	msk->bytes_retrans = 0;
 
 	sk->sk_shutdown = 0;
 	sk_error_report(sk);
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index babf1230c84d..8559d4f81654 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -262,10 +262,13 @@ struct mptcp_sock {
 	u64		local_key;
 	u64		remote_key;
 	u64		write_seq;
+	u64		bytes_sent;
 	u64		snd_nxt;
+	u64		bytes_received;
 	u64		ack_seq;
 	atomic64_t	rcv_wnd_sent;
 	u64		rcv_data_fin_seq;
+	u64		bytes_retrans;
 	int		rmem_fwd_alloc;
 	struct sock	*last_snd;
 	int		snd_burst;
@@ -274,6 +277,7 @@ struct mptcp_sock {
 						 * recovery related fields are under data_lock
 						 * protection
 						 */
+	u64		bytes_acked;
 	u64		snd_una;
 	u64		wnd_end;
 	unsigned long	timer_ival;
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index d2aa02e28917..2265387a5db2 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -888,7 +888,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
 
 void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 {
+	struct sock *sk = (struct sock *)msk;
 	u32 flags = 0;
+	bool slow;
 
 	memset(info, 0, sizeof(*info));
 
@@ -914,11 +916,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 	if (READ_ONCE(msk->can_ack))
 		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
 	info->mptcpi_flags = flags;
-	info->mptcpi_token = READ_ONCE(msk->token);
-	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
-	info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
-	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
-	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
+	mptcp_data_lock(sk);
+	info->mptcpi_snd_una = msk->snd_una;
+	info->mptcpi_rcv_nxt = msk->ack_seq;
+	info->mptcpi_bytes_acked = msk->bytes_acked;
+	mptcp_data_unlock(sk);
+
+	slow = lock_sock_fast(sk);
+	info->mptcpi_csum_enabled = msk->csum_enabled;
+	info->mptcpi_token = msk->token;
+	info->mptcpi_write_seq = msk->write_seq;
+	info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
+	info->mptcpi_bytes_sent = msk->bytes_sent;
+	info->mptcpi_bytes_received = msk->bytes_received;
+	info->mptcpi_bytes_retrans = msk->bytes_retrans;
+	unlock_sock_fast(sk, slow);
 }
 EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);
 
-- 
2.40.0
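
For context on how these counters are meant to be consumed: user-space
can fetch them with the existing MPTCP_INFO socket option. Below is a
minimal sketch - not part of the patch - assuming a connected
IPPROTO_MPTCP socket fd and uapi headers that already carry the new
mptcpi_* fields:

#include <stdio.h>
#include <sys/socket.h>
#include <linux/mptcp.h>

#ifndef SOL_MPTCP
#define SOL_MPTCP 284
#endif

static int dump_mptcp_counters(int fd)
{
	struct mptcp_info info = { 0 };
	socklen_t len = sizeof(info);

	/* the kernel copies at most min(len, sizeof(struct mptcp_info))
	 * bytes; on a kernel predating this patch the new fields are
	 * not written and simply stay zero
	 */
	if (getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &info, &len) < 0)
		return -1;

	printf("bytes_sent:     %llu\n",
	       (unsigned long long)info.mptcpi_bytes_sent);
	printf("bytes_received: %llu\n",
	       (unsigned long long)info.mptcpi_bytes_received);
	printf("bytes_acked:    %llu\n",
	       (unsigned long long)info.mptcpi_bytes_acked);
	printf("bytes_retrans:  %llu\n",
	       (unsigned long long)info.mptcpi_bytes_retrans);
	printf("retransmits:    %u\n", info.mptcpi_retransmits);
	return 0;
}

Since the counters are only cleared on disconnect, sampling the struct
twice and diffing e.g. mptcpi_bytes_acked gives an aggregate goodput
figure covering all subflows, including the already-closed ones.
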
Re: [PATCH mptcp-next 3/4] mptcp: track some aggregate data counters.
Posted by Matthieu Baerts 1 year, 3 months ago
Hi Paolo,

On 04/05/2023 18:40, Paolo Abeni wrote:
> Currently there are no data transfer counters accounting for all
> the subflows used by a given MPTCP socket. User-space can compute
> such figures by aggregating the per-subflow info, but that is
> inaccurate if any subflow is closed before the MPTCP socket itself.
> 
> Add the new counters in the MPTCP socket itself and expose them
> via the existing diag and sockopt interfaces. While touching
> mptcp_diag_fill_info(), acquire the relevant locks before fetching
> the msk data, to ensure better data consistency.
> 
> Signed-off-by: Paolo Abeni <pabeni@redhat.com>

(...)

> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index d2aa02e28917..2265387a5db2 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -888,7 +888,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
>  
>  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>  {
> +	struct sock *sk = (struct sock *)msk;
>  	u32 flags = 0;
> +	bool slow;
>  
>  	memset(info, 0, sizeof(*info));
>  
> @@ -914,11 +916,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>  	if (READ_ONCE(msk->can_ack))
>  		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
>  	info->mptcpi_flags = flags;
> -	info->mptcpi_token = READ_ONCE(msk->token);
> -	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> -	info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
> -	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> -	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
> +	mptcp_data_lock(sk);
> +	info->mptcpi_snd_una = msk->snd_una;
> +	info->mptcpi_rcv_nxt = msk->ack_seq;
> +	info->mptcpi_bytes_acked = msk->bytes_acked;
> +	mptcp_data_unlock(sk);

Do you know if this could have a "bad" impact on performance?

I mean mptcp_diag_fill_info() is also used when dumping all the
connections via Netlink. I guess the lock_sock_fast() should be fine
because it is also used in tcp_get_info(), but what about the data
lock? Is that also OK because it's a spin lock and we only copy fields?

> +
> +	slow = lock_sock_fast(sk);
> +	info->mptcpi_csum_enabled = msk->csum_enabled;
> +	info->mptcpi_token = msk->token;
> +	info->mptcpi_write_seq = msk->write_seq;
> +	info->mptcpi_retransmits = inet_csk(sk)->icsk_retransmits;
> +	info->mptcpi_bytes_sent = msk->bytes_sent;
> +	info->mptcpi_bytes_received = msk->bytes_received;
> +	info->mptcpi_bytes_retrans = msk->bytes_retrans;
> +	unlock_sock_fast(sk, slow);
>  }
>  EXPORT_SYMBOL_GPL(mptcp_diag_fill_info);

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net
Re: [PATCH mptcp-next 3/4] mptcp: track some aggregate data counters.
Posted by Paolo Abeni 1 year, 3 months ago
On Wed, 2023-05-17 at 16:26 +0200, Matthieu Baerts wrote:
> Hi Paolo,
> 
> On 04/05/2023 18:40, Paolo Abeni wrote:
> > Currently there are no data transfer counters accounting for all
> > the subflows used by a given MPTCP socket. User-space can compute
> > such figures by aggregating the per-subflow info, but that is
> > inaccurate if any subflow is closed before the MPTCP socket itself.
> > 
> > Add the new counters in the MPTCP socket itself and expose them
> > via the existing diag and sockopt interfaces. While touching
> > mptcp_diag_fill_info(), acquire the relevant locks before fetching
> > the msk data, to ensure better data consistency.
> > 
> > Signed-off-by: Paolo Abeni <pabeni@redhat.com>
> 
> (...)
> 
> > diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> > index d2aa02e28917..2265387a5db2 100644
> > --- a/net/mptcp/sockopt.c
> > +++ b/net/mptcp/sockopt.c
> > @@ -888,7 +888,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
> >  
> >  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> >  {
> > +	struct sock *sk = (struct sock *)msk;
> >  	u32 flags = 0;
> > +	bool slow;
> >  
> >  	memset(info, 0, sizeof(*info));
> >  
> > @@ -914,11 +916,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> >  	if (READ_ONCE(msk->can_ack))
> >  		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
> >  	info->mptcpi_flags = flags;
> > -	info->mptcpi_token = READ_ONCE(msk->token);
> > -	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
> > -	info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
> > -	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
> > -	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
> > +	mptcp_data_lock(sk);
> > +	info->mptcpi_snd_una = msk->snd_una;
> > +	info->mptcpi_rcv_nxt = msk->ack_seq;
> > +	info->mptcpi_bytes_acked = msk->bytes_acked;
> > +	mptcp_data_unlock(sk);
> 
> Do you know if this could have a "bad" impact on performance?

It depends ;)

If you do something like:

for CPU in $(seq $(nproc)); do while ss -M; do :; done & done

it will impact mptcp socket performance ;)

For a more reasonable usage, nothing measurable is expected: once
every few minutes? hours? a much longer interval? An additional
process will compete for the mptcp data and socket lock. It's not
expected to matter much.

The fact that we need to acquire the data lock and the socket lock
independently is a bit ugly - but the cost is on the user-space
process requesting the dump. It could possibly be avoided by
implementing some helper to upgrade the mptcp data lock to the (fast)
socket lock, but at this point I think that would be a bit overkill.
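
For illustration only, a purely hypothetical sketch of such an
upgrade helper - nothing like this exists in the tree - could look
roughly as follows, exploiting the fact that mptcp_data_lock() takes
sk->sk_lock.slock, the same spinlock lock_sock_fast() grabs on its
fast path:

static bool mptcp_data_lock_upgrade(struct sock *sk)
{
	/* the caller is expected to hold the msk data lock, that is
	 * sk->sk_lock.slock, the spinlock lock_sock_fast() uses on
	 * its own fast path
	 */
	if (!sock_owned_by_user_nocheck(sk))
		return false;	/* fast path: keep the spinlock we hold */

	/* slow path: the socket is owned by user-space; drop the
	 * spinlock and acquire the full (sleeping) socket lock,
	 * process context only
	 */
	mptcp_data_unlock(sk);
	lock_sock(sk);
	return true;
}

The matching unlock would mirror unlock_sock_fast(): release_sock()
when the above returned true, mptcp_data_unlock() otherwise. Lockdep
and sparse annotations are omitted.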

> I mean mptcp_diag_fill_info() is also used on when dumping all the
> connections via Netlink. I guess the lock_sock_fast() should be fine
> because it is also used in tcp_get_info() but what about the data lock?
> Also OK because that's a spin lock and we only copy fields?

The main impact we have on the data path is the dirtying of the msk
socket spin lock cacheline, which will possibly cause a cache miss on
the next data path lock operation, if the dumping happens on a
different core. Not relevant until we get many dumps per socket per
second.

Cheers,

Paolo
Re: [PATCH mptcp-next 3/4] mptcp: track some aggregate data counters.
Posted by Matthieu Baerts 1 year, 3 months ago
Hi Paolo,

On 18/05/2023 16:17, Paolo Abeni wrote:
> On Wed, 2023-05-17 at 16:26 +0200, Matthieu Baerts wrote:
>> Hi Paolo,
>>
>> On 04/05/2023 18:40, Paolo Abeni wrote:
>>> Currently there are no data transfer counters accounting for all
>>> the subflows used by a given MPTCP socket. User-space can compute
>>> such figures by aggregating the per-subflow info, but that is
>>> inaccurate if any subflow is closed before the MPTCP socket itself.
>>>
>>> Add the new counters in the MPTCP socket itself and expose them
>>> via the existing diag and sockopt interfaces. While touching
>>> mptcp_diag_fill_info(), acquire the relevant locks before fetching
>>> the msk data, to ensure better data consistency.
>>>
>>> Signed-off-by: Paolo Abeni <pabeni@redhat.com>
>>
>> (...)
>>
>>> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
>>> index d2aa02e28917..2265387a5db2 100644
>>> --- a/net/mptcp/sockopt.c
>>> +++ b/net/mptcp/sockopt.c
>>> @@ -888,7 +888,9 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int
>>>  
>>>  void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>>>  {
>>> +	struct sock *sk = (struct sock *)msk;
>>>  	u32 flags = 0;
>>> +	bool slow;
>>>  
>>>  	memset(info, 0, sizeof(*info));
>>>  
>>> @@ -914,11 +916,21 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
>>>  	if (READ_ONCE(msk->can_ack))
>>>  		flags |= MPTCP_INFO_FLAG_REMOTE_KEY_RECEIVED;
>>>  	info->mptcpi_flags = flags;
>>> -	info->mptcpi_token = READ_ONCE(msk->token);
>>> -	info->mptcpi_write_seq = READ_ONCE(msk->write_seq);
>>> -	info->mptcpi_snd_una = READ_ONCE(msk->snd_una);
>>> -	info->mptcpi_rcv_nxt = READ_ONCE(msk->ack_seq);
>>> -	info->mptcpi_csum_enabled = READ_ONCE(msk->csum_enabled);
>>> +	mptcp_data_lock(sk);
>>> +	info->mptcpi_snd_una = msk->snd_una;
>>> +	info->mptcpi_rcv_nxt = msk->ack_seq;
>>> +	info->mptcpi_bytes_acked = msk->bytes_acked;
>>> +	mptcp_data_unlock(sk);
>>
>> Do you know if this could have a "bad" impact on performance?
> 
> It depends ;)
> 
> If you do something like:
> 
> for CPU in $(seq $(nproc)); do while ss -M; do :; done & done
> 
> it will impact mptcp socket performance ;)
> 
> For a more reasonable usage, nothing measurable is expected: once
> every few minutes? hours? a much longer interval? An additional
> process will compete for the mptcp data and socket lock. It's not
> expected to matter much.

When debugging, we might want to fetch this info very regularly. But
then it is a debugging session, and it is understandable that it can
have an impact. Spending more CPU time waiting to get the info is fine
in this case, better than slowing down the connections.

> The fact that we need to acquire the data lock and the socket lock
> independently is a bit ugly - but the cost is on the user-space
> process requesting the dump. It could possibly be avoided by
> implementing some helper to upgrade the mptcp data lock to the (fast)
> socket lock, but at this point I think that would be a bit overkill.

Indeed.

>> I mean mptcp_diag_fill_info() is also used when dumping all the
>> connections via Netlink. I guess the lock_sock_fast() should be fine
>> because it is also used in tcp_get_info(), but what about the data
>> lock? Is that also OK because it's a spin lock and we only copy fields?
> 
> The main impact we have on the data path is the dirtying of the msk
> socket spin lock cacheline, which will possibly cause a cache miss on
> the next data path lock operation, if the dumping happens on a
> different core. Not relevant until we get many dumps per socket per
> second.

Good point.

Thank you for having shared your point of view!

I'm fine with this!

I guess we don't need to backport the locks you added then, as this
can be seen as part of the new feature.

Cheers,
Matt
-- 
Tessares | Belgium | Hybrid Access Solutions
www.tessares.net