[PATCH mptcp-next 5/6] mptcp: pm: in-kernel: add 'address' endpoints

[PATCH mptcp-next 5/6] mptcp: pm: in-kernel: add 'address' endpoints
Posted by Matthieu Baerts (NGI0) 2 weeks, 4 days ago
Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag
is not used), the in-kernel PM creates new subflows using whatever local
address the routing configuration picks.

It would be easier to pick local addresses from a selected list of
endpoints, each used only once, than to rely on routing rules.

Use case: both the client (C) and the server (S) have two addresses (a
and b). The client establishes the connection between C(a) and S(a).
Once established, the server announces its additional address S(b). Once
received, the client connects to it using its second address C(b).
Compared to a situation without the 'address' endpoint for C(b), the
client didn't use this address C(b) to establish a subflow to the
server's primary address S(a). So in the end, we have:

   C        S
  C(a) --- S(a)
  C(b) --- S(b)

In case of a 3rd address on each side (C(c) and S(c)), upon the
reception of an ADD_ADDR with S(c), the client should not pick C(b)
because it has already been used. C(c) should then be used.

Note that this situation is currently possible if C doesn't add any
endpoint, but configures the routing in order to pick C(b) for the route
to S(b), and pick C(c) for the route to S(c). That doesn't sound very
practical, because it means knowing in advance the IP addresses that
will be used and announced by the server.

In the code, the new endpoint type is added. Similar to the other
endpoint types, an MPTCP_INFO counter is added. While at it, holes in
struct mptcp_info are now commented, as a reminder that they can no
longer be filled.
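
For reference, the new counter can be read from userspace via the
MPTCP_INFO socket option. A minimal sketch, assuming 'fd' is a
connected IPPROTO_MPTCP socket, headers from this patch, and a libc
that might not define SOL_MPTCP yet:

  #include <stdio.h>
  #include <sys/socket.h>
  #include <linux/mptcp.h>

  #ifndef SOL_MPTCP
  #define SOL_MPTCP 284 /* from include/linux/socket.h */
  #endif

  static void print_endp_address_max(int fd)
  {
          struct mptcp_info info = { 0 };
          socklen_t len = sizeof(info);

          if (getsockopt(fd, SOL_MPTCP, MPTCP_INFO, &info, &len) < 0) {
                  perror("getsockopt(MPTCP_INFO)");
                  return;
          }

          /* 0 when no 'address' endpoints are configured; the field
           * only exists with this patch applied
           */
          printf("endp_address_max: %u\n", info.mptcpi_endp_address_max);
  }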

Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
---
 include/uapi/linux/mptcp.h |  6 +++-
 net/mptcp/pm_kernel.c      | 82 ++++++++++++++++++++++++++++++++++++++++++++++
 net/mptcp/protocol.h       |  1 +
 net/mptcp/sockopt.c        |  2 ++
 4 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
index 5ec996977b3fa2351222e6d01b814770b34348e9..65dc069e9063325ad2e1ffb1da21cc4a4b6efd32 100644
--- a/include/uapi/linux/mptcp.h
+++ b/include/uapi/linux/mptcp.h
@@ -39,6 +39,7 @@
 #define MPTCP_PM_ADDR_FLAG_BACKUP		_BITUL(2)
 #define MPTCP_PM_ADDR_FLAG_FULLMESH		_BITUL(3)
 #define MPTCP_PM_ADDR_FLAG_IMPLICIT		_BITUL(4)
+#define MPTCP_PM_ADDR_FLAG_ADDRESS		_BITUL(5)
 
 struct mptcp_info {
 	__u8	mptcpi_subflows;
@@ -51,6 +52,7 @@ struct mptcp_info {
 	#define mptcpi_endp_signal_max mptcpi_add_addr_signal_max
 	__u8	mptcpi_add_addr_accepted_max;
 	#define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max
+	/* 16-bit hole that can no longer be filled */
 	__u32	mptcpi_flags;
 	__u32	mptcpi_token;
 	__u64	mptcpi_write_seq;
@@ -60,13 +62,15 @@ struct mptcp_info {
 	__u8	mptcpi_local_addr_max;
 	#define mptcpi_endp_subflow_max mptcpi_local_addr_max
 	__u8	mptcpi_csum_enabled;
+	/* 8-bit hole that can no longer be filled */
 	__u32	mptcpi_retransmits;
 	__u64	mptcpi_bytes_retrans;
 	__u64	mptcpi_bytes_sent;
 	__u64	mptcpi_bytes_received;
 	__u64	mptcpi_bytes_acked;
 	__u8	mptcpi_subflows_total;
-	__u8	reserved[3];
+	__u8	mptcpi_endp_address_max;
+	__u8	reserved[2];
 	__u32	mptcpi_last_data_sent;
 	__u32	mptcpi_last_data_recv;
 	__u32	mptcpi_last_ack_recv;
diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
index fbd0a4ade8469ee75d99083bf640ad91a6fb714e..790dd7bc7f79e95a1fb73cbfb065087aa28f8f4b 100644
--- a/net/mptcp/pm_kernel.c
+++ b/net/mptcp/pm_kernel.c
@@ -21,6 +21,7 @@ struct pm_nl_pernet {
 	u8			endpoints;
 	u8			endp_signal_max;
 	u8			endp_subflow_max;
+	u8			endp_address_max;
 	u8			limit_add_addr_accepted;
 	u8			limit_extra_subflows;
 	u8			next_id;
@@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
 }
 EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
 
+u8 mptcp_pm_get_endp_address_max(const struct mptcp_sock *msk)
+{
+	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+
+	return READ_ONCE(pernet->endp_address_max);
+}
+EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_address_max);
+
 u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
 {
 	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
@@ -451,6 +460,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
 	return i;
 }
 
+static unsigned int
+fill_local_addresses_vec_address(struct mptcp_sock *msk,
+				 struct mptcp_addr_info *remote,
+				 struct mptcp_pm_local *locals)
+{
+	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
+	DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+	struct mptcp_subflow_context *subflow;
+	struct sock *sk = (struct sock *)msk;
+	struct mptcp_pm_addr_entry *entry;
+	struct mptcp_pm_local *local;
+	int i = 0;
+
+	/* Forbid creation of new subflows matching existing ones, possibly
+	 * already created by 'subflow' endpoints
+	 */
+	bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
+	mptcp_for_each_subflow(msk, subflow) {
+		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+
+		if ((1 << inet_sk_state_load(ssk)) &
+		    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
+			continue;
+
+		__set_bit(READ_ONCE(subflow->local_id), unavail_id);
+	}
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
+		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS))
+			continue;
+
+		if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
+			continue;
+
+		if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
+			     unavail_id))
+			continue;
+
+		local = &locals[i];
+		local->addr = entry->addr;
+		local->flags = entry->flags;
+		local->ifindex = entry->ifindex;
+
+		if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
+			__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
+
+			if (local->addr.id != msk->mpc_endpoint_id)
+				msk->pm.local_addr_used++;
+		}
+
+		msk->pm.extra_subflows++;
+		i++;
+		break;
+	}
+	rcu_read_unlock();
+
+	return i;
+}
+
 static unsigned int
 fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
 				struct mptcp_addr_info *remote,
@@ -527,6 +596,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
 	if (i)
 		return i;
 
+	/* If there is at least one MPTCP endpoint with an address flag */
+	if (mptcp_pm_get_endp_address_max(msk))
+		return fill_local_addresses_vec_address(msk, remote, locals);
+
 	/* Special case: peer sets the C flag, accept one ADD_ADDR if default
 	 * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints
 	 */
@@ -701,6 +774,10 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
 		addr_max = pernet->endp_subflow_max;
 		WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
 	}
+	if (entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS) {
+		addr_max = pernet->endp_address_max;
+		WRITE_ONCE(pernet->endp_address_max, addr_max + 1);
+	}
 
 	pernet->endpoints++;
 	if (!entry->addr.port)
@@ -1095,6 +1172,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
 		addr_max = pernet->endp_subflow_max;
 		WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
 	}
+	if (entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS) {
+		addr_max = pernet->endp_address_max;
+		WRITE_ONCE(pernet->endp_address_max, addr_max - 1);
+	}
 
 	pernet->endpoints--;
 	list_del_rcu(&entry->list);
@@ -1177,6 +1258,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet)
 {
 	WRITE_ONCE(pernet->endp_signal_max, 0);
 	WRITE_ONCE(pernet->endp_subflow_max, 0);
+	WRITE_ONCE(pernet->endp_address_max, 0);
 	pernet->endpoints = 0;
 }
 
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 027d717ef7cffe150f8de7b3b404916a1899537a..57e4db26e0ae1c5e82bc5a262ccb9d5e36508543 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -1179,6 +1179,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk);
 void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
 u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
+u8 mptcp_pm_get_endp_address_max(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
 u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
 
diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
index 92a2a274262732a345b9ab185efd7da1f0a5773a..3cdc35323cc18de3585169fe729a51cab25a4cba 100644
--- a/net/mptcp/sockopt.c
+++ b/net/mptcp/sockopt.c
@@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
 			mptcp_pm_get_limit_add_addr_accepted(msk);
 		info->mptcpi_endp_subflow_max =
 			mptcp_pm_get_endp_subflow_max(msk);
+		info->mptcpi_endp_address_max =
+			mptcp_pm_get_endp_address_max(msk);
 	}
 
 	if (__mptcp_check_fallback(msk))

-- 
2.51.0
Re: [PATCH mptcp-next 5/6] mptcp: pm: in-kernel: add 'address' endpoints
Posted by Mat Martineau 2 weeks, 4 days ago
On Tue, 23 Sep 2025, Matthieu Baerts (NGI0) wrote:

> Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag
> is not used), the in-kernel PM creates new subflows using whatever local
> address the routing configuration picks.
>
> It would be easier to pick local addresses from a selected list of
> endpoints, each used only once, than to rely on routing rules.
>
> Use case: both the client (C) and the server (S) have two addresses (a
> and b). The client establishes the connection between C(a) and S(a).
> Once established, the server announces its additional address S(b). Once
> received, the client connects to it using its second address C(b).
> Compared to a situation without the 'address' endpoint for C(b), the
> client didn't use this address C(b) to establish a subflow to the
> server's primary address S(a). So in the end, we have:
>
>   C        S
>  C(a) --- S(a)
>  C(b) --- S(b)
>
> In case of a 3rd address on each side (C(c) and S(c)), upon the
> reception of an ADD_ADDR with S(c), the client should not pick C(b)
> because it has already been used. C(c) should then be used.
>
> Note that this situation is currently possible if C doesn't add any
> endpoint, but configures the routing in order to pick C(b) for the route
> to S(b), and pick C(c) for the route to S(c). That doesn't sound very
> practical, because it means knowing in advance the IP addresses that
> will be used and announced by the server.
>
> In the code, the new endpoint type is added. Similar to the other
> endpoint types, an MPTCP_INFO counter is added. While at it, holes in
> struct mptcp_info are now commented, as a reminder that they can no
> longer be filled.

Hi Matthieu -

I think this patch brings up a few larger topics of discussion: path 
manager strategy (in-kernel/userspace/bpf), interaction of in-kernel PM 
flags, and (once again!) naming.

I'm not sure the reply chain for this patch is the right place to have the 
discussion, but adding another in-kernel PM "mode" makes me think we need 
a community-level (MPTCP) discussion on our path manager strategy. The 
original plan was to have a single general in-kernel PM, and rely on 
userspace/mptcpd for anything else. We've obviously made some 
changes to that plan, adding fullmesh and having the BPF PM in 
progress. This has made path management more complex to understand, use, 
and maintain - so I want to be sure we are making a careful choice about 
which PM features to add.

Our userspace API for the in-kernel PM also makes it complicated to 
explain what happens when there is a mix of endpoint types. Typical use 
would probably be reasonable (all fullmesh, all 'address', etc). But it's 
good to avoid confusion, and more importantly bugs!

As for naming, unfortunately "address" is a very frequently used word in 
our subsystem! In mptcpd the similar plugin is called "sspi" (single 
subflow per interface). I'm definitely open to other ideas that are 
identifiable and descriptive.


I think there are good use cases for this feature; that's why we included 
a similar feature in mptcpd! I'd like to get our core group (you, me, 
Geliang, and Paolo) aligned on a general direction for path management, 
does that sound reasonable?


One technical question down below too
  |
  v


>
> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503
> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
> ---
> include/uapi/linux/mptcp.h |  6 +++-
> net/mptcp/pm_kernel.c      | 82 ++++++++++++++++++++++++++++++++++++++++++++++
> net/mptcp/protocol.h       |  1 +
> net/mptcp/sockopt.c        |  2 ++
> 4 files changed, 90 insertions(+), 1 deletion(-)
>
> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
> index 5ec996977b3fa2351222e6d01b814770b34348e9..65dc069e9063325ad2e1ffb1da21cc4a4b6efd32 100644
> --- a/include/uapi/linux/mptcp.h
> +++ b/include/uapi/linux/mptcp.h
> @@ -39,6 +39,7 @@
> #define MPTCP_PM_ADDR_FLAG_BACKUP		_BITUL(2)
> #define MPTCP_PM_ADDR_FLAG_FULLMESH		_BITUL(3)
> #define MPTCP_PM_ADDR_FLAG_IMPLICIT		_BITUL(4)
> +#define MPTCP_PM_ADDR_FLAG_ADDRESS		_BITUL(5)
>
> struct mptcp_info {
> 	__u8	mptcpi_subflows;
> @@ -51,6 +52,7 @@ struct mptcp_info {
> 	#define mptcpi_endp_signal_max mptcpi_add_addr_signal_max
> 	__u8	mptcpi_add_addr_accepted_max;
> 	#define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max
> +	/* 16-bit hole that can no longer be filled */
> 	__u32	mptcpi_flags;
> 	__u32	mptcpi_token;
> 	__u64	mptcpi_write_seq;
> @@ -60,13 +62,15 @@ struct mptcp_info {
> 	__u8	mptcpi_local_addr_max;
> 	#define mptcpi_endp_subflow_max mptcpi_local_addr_max
> 	__u8	mptcpi_csum_enabled;
> +	/* 8-bit hole that can no longer be filled */
> 	__u32	mptcpi_retransmits;
> 	__u64	mptcpi_bytes_retrans;
> 	__u64	mptcpi_bytes_sent;
> 	__u64	mptcpi_bytes_received;
> 	__u64	mptcpi_bytes_acked;
> 	__u8	mptcpi_subflows_total;
> -	__u8	reserved[3];
> +	__u8	mptcpi_endp_address_max;
> +	__u8	reserved[2];
> 	__u32	mptcpi_last_data_sent;
> 	__u32	mptcpi_last_data_recv;
> 	__u32	mptcpi_last_ack_recv;
> diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
> index fbd0a4ade8469ee75d99083bf640ad91a6fb714e..790dd7bc7f79e95a1fb73cbfb065087aa28f8f4b 100644
> --- a/net/mptcp/pm_kernel.c
> +++ b/net/mptcp/pm_kernel.c
> @@ -21,6 +21,7 @@ struct pm_nl_pernet {
> 	u8			endpoints;
> 	u8			endp_signal_max;
> 	u8			endp_subflow_max;
> +	u8			endp_address_max;
> 	u8			limit_add_addr_accepted;
> 	u8			limit_extra_subflows;
> 	u8			next_id;
> @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk)
> }
> EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
>
> +u8 mptcp_pm_get_endp_address_max(const struct mptcp_sock *msk)
> +{
> +	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
> +
> +	return READ_ONCE(pernet->endp_address_max);
> +}
> +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_address_max);
> +
> u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
> {
> 	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
> @@ -451,6 +460,66 @@ fill_local_addresses_vec_fullmesh(struct mptcp_sock *msk,
> 	return i;
> }
>
> +static unsigned int
> +fill_local_addresses_vec_address(struct mptcp_sock *msk,
> +				 struct mptcp_addr_info *remote,
> +				 struct mptcp_pm_local *locals)
> +{
> +	struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
> +	DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
> +	struct mptcp_subflow_context *subflow;
> +	struct sock *sk = (struct sock *)msk;
> +	struct mptcp_pm_addr_entry *entry;
> +	struct mptcp_pm_local *local;
> +	int i = 0;
> +
> +	/* Forbid creation of new subflows matching existing ones, possibly
> +	 * already created by 'subflow' endpoints
> +	 */
> +	bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
> +	mptcp_for_each_subflow(msk, subflow) {
> +		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
> +
> +		if ((1 << inet_sk_state_load(ssk)) &
> +		    (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
> +			continue;
> +
> +		__set_bit(READ_ONCE(subflow->local_id), unavail_id);
> +	}
> +
> +	rcu_read_lock();
> +	list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
> +		if (!(entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS))
> +			continue;
> +
> +		if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
> +			continue;
> +
> +		if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
> +			     unavail_id))
> +			continue;
> +
> +		local = &locals[i];

Looks like 'i' is always 0 here, since the only code path from here leads 
to 'break'. Would be clearer to hardcode 0 and clarify the variable name 
for the return value.


Thanks,

Mat

> +		local->addr = entry->addr;
> +		local->flags = entry->flags;
> +		local->ifindex = entry->ifindex;
> +
> +		if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
> +			__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
> +
> +			if (local->addr.id != msk->mpc_endpoint_id)
> +				msk->pm.local_addr_used++;
> +		}
> +
> +		msk->pm.extra_subflows++;
> +		i++;
> +		break;
> +	}
> +	rcu_read_unlock();
> +
> +	return i;
> +}
> +
> static unsigned int
> fill_local_addresses_vec_c_flag(struct mptcp_sock *msk,
> 				struct mptcp_addr_info *remote,
> @@ -527,6 +596,10 @@ fill_local_addresses_vec(struct mptcp_sock *msk, struct mptcp_addr_info *remote,
> 	if (i)
> 		return i;
>
> +	/* If there is at least one MPTCP endpoint with an address flag */
> +	if (mptcp_pm_get_endp_address_max(msk))
> +		return fill_local_addresses_vec_address(msk, remote, locals);
> +
> 	/* Special case: peer sets the C flag, accept one ADD_ADDR if default
> 	 * limits are used -- accepting no ADD_ADDR -- and use subflow endpoints
> 	 */
> @@ -701,6 +774,10 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
> 		addr_max = pernet->endp_subflow_max;
> 		WRITE_ONCE(pernet->endp_subflow_max, addr_max + 1);
> 	}
> +	if (entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS) {
> +		addr_max = pernet->endp_address_max;
> +		WRITE_ONCE(pernet->endp_address_max, addr_max + 1);
> +	}
>
> 	pernet->endpoints++;
> 	if (!entry->addr.port)
> @@ -1095,6 +1172,10 @@ int mptcp_pm_nl_del_addr_doit(struct sk_buff *skb, struct genl_info *info)
> 		addr_max = pernet->endp_subflow_max;
> 		WRITE_ONCE(pernet->endp_subflow_max, addr_max - 1);
> 	}
> +	if (entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS) {
> +		addr_max = pernet->endp_address_max;
> +		WRITE_ONCE(pernet->endp_address_max, addr_max - 1);
> +	}
>
> 	pernet->endpoints--;
> 	list_del_rcu(&entry->list);
> @@ -1177,6 +1258,7 @@ static void __reset_counters(struct pm_nl_pernet *pernet)
> {
> 	WRITE_ONCE(pernet->endp_signal_max, 0);
> 	WRITE_ONCE(pernet->endp_subflow_max, 0);
> +	WRITE_ONCE(pernet->endp_address_max, 0);
> 	pernet->endpoints = 0;
> }
>
> diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
> index 027d717ef7cffe150f8de7b3b404916a1899537a..57e4db26e0ae1c5e82bc5a262ccb9d5e36508543 100644
> --- a/net/mptcp/protocol.h
> +++ b/net/mptcp/protocol.h
> @@ -1179,6 +1179,7 @@ void mptcp_pm_worker(struct mptcp_sock *msk);
> void __mptcp_pm_kernel_worker(struct mptcp_sock *msk);
> u8 mptcp_pm_get_endp_signal_max(const struct mptcp_sock *msk);
> u8 mptcp_pm_get_endp_subflow_max(const struct mptcp_sock *msk);
> +u8 mptcp_pm_get_endp_address_max(const struct mptcp_sock *msk);
> u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk);
> u8 mptcp_pm_get_limit_extra_subflows(const struct mptcp_sock *msk);
>
> diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c
> index 92a2a274262732a345b9ab185efd7da1f0a5773a..3cdc35323cc18de3585169fe729a51cab25a4cba 100644
> --- a/net/mptcp/sockopt.c
> +++ b/net/mptcp/sockopt.c
> @@ -980,6 +980,8 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info)
> 			mptcp_pm_get_limit_add_addr_accepted(msk);
> 		info->mptcpi_endp_subflow_max =
> 			mptcp_pm_get_endp_subflow_max(msk);
> +		info->mptcpi_endp_address_max =
> +			mptcp_pm_get_endp_address_max(msk);
> 	}
>
> 	if (__mptcp_check_fallback(msk))
>
> -- 
> 2.51.0
>
>
>
Re: [PATCH mptcp-next 5/6] mptcp: pm: in-kernel: add 'address' endpoints
Posted by Matthieu Baerts 2 weeks, 3 days ago
Hi Mat,

On 23/09/2025 06:17, Mat Martineau wrote:
> On Tue, 23 Sep 2025, Matthieu Baerts (NGI0) wrote:
> 
>> Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag
>> is not used), the in-kernel PM creates new subflows using whatever local
>> address the routing configuration picks.
>>
>> It would be easier to pick local addresses from a selected list of
>> endpoints, each used only once, than to rely on routing rules.
>>
>> Use case: both the client (C) and the server (S) have two addresses (a
>> and b). The client establishes the connection between C(a) and S(a).
>> Once established, the server announces its additional address S(b). Once
>> received, the client connects to it using its second address C(b).
>> Compared to a situation without the 'address' endpoint for C(b), the
>> client didn't use this address C(b) to establish a subflow to the
>> server's primary address S(a). So in the end, we have:
>>
>>   C        S
>>  C(a) --- S(a)
>>  C(b) --- S(b)
>>
>> In case of a 3rd address on each side (C(c) and S(c)), upon the
>> reception of an ADD_ADDR with S(c), the client should not pick C(b)
>> because it has already been used. C(c) should then be used.
>>
>> Note that this situation is currently possible if C doesn't add any
>> endpoint, but configures the routing in order to pick C(b) for the route
>> to S(b), and pick C(c) for the route to S(c). That doesn't sound very
>> practical, because it means knowing in advance the IP addresses that
>> will be used and announced by the server.
>>
>> In the code, the new endpoint type is added. Similar to the other
>> endpoint types, an MPTCP_INFO counter is added. While at it, holes in
>> struct mptcp_info are now commented, as a reminder that they can no
>> longer be filled.
> 
> Hi Matthieu -
> 
> I think this patch brings up a few larger topics of discussion: path
> manager strategy (in-kernel/userspace/bpf), interaction of in-kernel PM
> flags, and (once again!) naming.
> 
> I'm not sure the reply chain for this patch is the right place to have
> the discussion, but adding another in-kernel PM "mode" makes me think we
> need a community-level (MPTCP) discussion on our path manager strategy.
> The original plan was to have a single general in-kernel PM, and rely on
> userspace/mptcpd for anything else. We've obviously made some changes to
> that plan, adding fullmesh and having the BPF PM in progress. This has
> made path management more complex to understand, use, and maintain - so
> I want to be sure we are making a careful choice about which PM features
> to add.

Good point!

(Regarding the BPF PM, it is similar to the userspace PM, but can be
used in environments handling loads of connections in parallel; and it
introduces a better separation between the different PMs, which makes it
worth the maintenance cost to me.)

> Our userspace API for the in-kernel PM also makes it complicated to
> explain what happens when there is a mix of endpoint types. Typical use
> would probably be reasonable (all fullmesh, all 'address', etc). But
> it's good to avoid confusion, and more importantly bugs!

Indeed, I fixed quite a few issues and inconsistencies last year, around
the same time I opened this issue #503.

I have to admit that the 'fullmesh' mode added quite a bit of unexpected
complexity for such a niche use-case (but it allows "workarounds"). I
think the new mode suggested here is simple: it targets one specific part
of the code (the reception of an ADD_ADDR) and fixes the lack of control
over which source IP address is used to create new subflows. It is not as
invasive as the 'fullmesh' one.

> As for naming, unfortunately "address" is a very frequently used word in
> our subsystem! In mptcpd the similar plugin is called "sspi" (single
> subflow per interface). I'm definitely open to other ideas that are
> identifiable and descriptive.

Me too! I initially picked "add-addr", but I wasn't happy with the minus
sign, nor with the name. I guess a good name could be "endpoint used when
an ADD_ADDR is received", or maybe "received-add-addr", but that seems
too long, and more than one word.

I ended up picking "address", because it is short, and similar to
"signal" and "subflow" (which are not very clear either, but it is hard
to do better with one word...).

> I think there are good use cases for this feature; that's why we
> included a similar feature in mptcpd! I'd like to get our core group
> (you, me, Geliang, and Paolo) aligned on a general direction for path
> management, does that sound reasonable?

Yes indeed, it is important.

To be honest, I thought we already had this discussion at the meeting
following the opening of this ticket #503, but that was a bit more than
one year ago. I started working on it because the lack of control over
which source IP address is used to create new subflows when an ADD_ADDR
is received came back in a few discussions, even prior to #503, and even
recently, when a client wanted to use a dedicated interface per path. It
feels like something is missing in this in-kernel Netlink API: a way to
use endpoints to create subflows to the addresses announced by the server.

But yes, we are exposing a new option to userspace, and if it is
accepted -- and not modified/reverted in the next 10 weeks -- we will
have to maintain it for a long time. So better not to get it wrong. I
think it is "self-contained" and worth it. I would like a better name
than "address", but I didn't find one. In the end, I'm happy with it as
it is similar to the others and will have a longer description in the
doc, but I'm open to another name :)


> One technical question down below too
>  |
>  v
> 
> 
>>
>> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/503
>> Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
>> ---
>> include/uapi/linux/mptcp.h |  6 +++-
>> net/mptcp/pm_kernel.c      | 82 ++++++++++++++++++++++++++++++++++++++
>> ++++++++
>> net/mptcp/protocol.h       |  1 +
>> net/mptcp/sockopt.c        |  2 ++
>> 4 files changed, 90 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/uapi/linux/mptcp.h b/include/uapi/linux/mptcp.h
>> index
>> 5ec996977b3fa2351222e6d01b814770b34348e9..65dc069e9063325ad2e1ffb1da21cc4a4b6efd32 100644
>> --- a/include/uapi/linux/mptcp.h
>> +++ b/include/uapi/linux/mptcp.h
>> @@ -39,6 +39,7 @@
>> #define MPTCP_PM_ADDR_FLAG_BACKUP        _BITUL(2)
>> #define MPTCP_PM_ADDR_FLAG_FULLMESH        _BITUL(3)
>> #define MPTCP_PM_ADDR_FLAG_IMPLICIT        _BITUL(4)
>> +#define MPTCP_PM_ADDR_FLAG_ADDRESS        _BITUL(5)
>>
>> struct mptcp_info {
>>     __u8    mptcpi_subflows;
>> @@ -51,6 +52,7 @@ struct mptcp_info {
>>     #define mptcpi_endp_signal_max mptcpi_add_addr_signal_max
>>     __u8    mptcpi_add_addr_accepted_max;
>>     #define mptcpi_limit_add_addr_accepted mptcpi_add_addr_accepted_max
>> +    /* 16-bit hole that can no longer be filled */
>>     __u32    mptcpi_flags;
>>     __u32    mptcpi_token;
>>     __u64    mptcpi_write_seq;
>> @@ -60,13 +62,15 @@ struct mptcp_info {
>>     __u8    mptcpi_local_addr_max;
>>     #define mptcpi_endp_subflow_max mptcpi_local_addr_max
>>     __u8    mptcpi_csum_enabled;
>> +    /* 8-bit hole that can no longer be filled */
>>     __u32    mptcpi_retransmits;
>>     __u64    mptcpi_bytes_retrans;
>>     __u64    mptcpi_bytes_sent;
>>     __u64    mptcpi_bytes_received;
>>     __u64    mptcpi_bytes_acked;
>>     __u8    mptcpi_subflows_total;
>> -    __u8    reserved[3];
>> +    __u8    mptcpi_endp_address_max;
>> +    __u8    reserved[2];
>>     __u32    mptcpi_last_data_sent;
>>     __u32    mptcpi_last_data_recv;
>>     __u32    mptcpi_last_ack_recv;
>> diff --git a/net/mptcp/pm_kernel.c b/net/mptcp/pm_kernel.c
>> index
>> fbd0a4ade8469ee75d99083bf640ad91a6fb714e..790dd7bc7f79e95a1fb73cbfb065087aa28f8f4b 100644
>> --- a/net/mptcp/pm_kernel.c
>> +++ b/net/mptcp/pm_kernel.c
>> @@ -21,6 +21,7 @@ struct pm_nl_pernet {
>>     u8            endpoints;
>>     u8            endp_signal_max;
>>     u8            endp_subflow_max;
>> +    u8            endp_address_max;
>>     u8            limit_add_addr_accepted;
>>     u8            limit_extra_subflows;
>>     u8            next_id;
>> @@ -61,6 +62,14 @@ u8 mptcp_pm_get_endp_subflow_max(const struct
>> mptcp_sock *msk)
>> }
>> EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_subflow_max);
>>
>> +u8 mptcp_pm_get_endp_address_max(const struct mptcp_sock *msk)
>> +{
>> +    struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
>> +
>> +    return READ_ONCE(pernet->endp_address_max);
>> +}
>> +EXPORT_SYMBOL_GPL(mptcp_pm_get_endp_address_max);
>> +
>> u8 mptcp_pm_get_limit_add_addr_accepted(const struct mptcp_sock *msk)
>> {
>>     struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
>> @@ -451,6 +460,66 @@ fill_local_addresses_vec_fullmesh(struct
>> mptcp_sock *msk,
>>     return i;
>> }
>>
>> +static unsigned int
>> +fill_local_addresses_vec_address(struct mptcp_sock *msk,
>> +                 struct mptcp_addr_info *remote,
>> +                 struct mptcp_pm_local *locals)
>> +{
>> +    struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
>> +    DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
>> +    struct mptcp_subflow_context *subflow;
>> +    struct sock *sk = (struct sock *)msk;
>> +    struct mptcp_pm_addr_entry *entry;
>> +    struct mptcp_pm_local *local;
>> +    int i = 0;
>> +
>> +    /* Forbid creation of new subflows matching existing ones, possibly
>> +     * already created by 'subflow' endpoints
>> +     */
>> +    bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
>> +    mptcp_for_each_subflow(msk, subflow) {
>> +        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
>> +
>> +        if ((1 << inet_sk_state_load(ssk)) &
>> +            (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING |
>> TCPF_CLOSE))
>> +            continue;
>> +
>> +        __set_bit(READ_ONCE(subflow->local_id), unavail_id);
>> +    }
>> +
>> +    rcu_read_lock();
>> +    list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
>> +        if (!(entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS))
>> +            continue;
>> +
>> +        if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
>> +            continue;
>> +
>> +        if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
>> +                 unavail_id))
>> +            continue;
>> +
>> +        local = &locals[i];
> 
> Looks like 'i' is always 0 here, since the only code path from here
> leads to 'break'. Would be clearer to hardcode 0 and clarify the
> variable name for the return value.

Good idea! I can also rename the function.
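
Something like this, maybe (quick untested sketch, with a hypothetical
'fill_local_endp_address' name; everything else is unchanged from the
patch):

        static unsigned int
        fill_local_endp_address(struct mptcp_sock *msk,
                                struct mptcp_addr_info *remote,
                                struct mptcp_pm_local *local)
        {
                struct pm_nl_pernet *pernet = pm_nl_get_pernet_from_msk(msk);
                DECLARE_BITMAP(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
                struct mptcp_subflow_context *subflow;
                struct sock *sk = (struct sock *)msk;
                struct mptcp_pm_addr_entry *entry;
                unsigned int found = 0;

                /* Forbid creation of new subflows matching existing ones,
                 * possibly already created by 'subflow' endpoints
                 */
                bitmap_zero(unavail_id, MPTCP_PM_MAX_ADDR_ID + 1);
                mptcp_for_each_subflow(msk, subflow) {
                        struct sock *ssk = mptcp_subflow_tcp_sock(subflow);

                        if ((1 << inet_sk_state_load(ssk)) &
                            (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSING | TCPF_CLOSE))
                                continue;

                        __set_bit(READ_ONCE(subflow->local_id), unavail_id);
                }

                rcu_read_lock();
                list_for_each_entry_rcu(entry, &pernet->endp_list, list) {
                        if (!(entry->flags & MPTCP_PM_ADDR_FLAG_ADDRESS))
                                continue;

                        if (!mptcp_pm_addr_families_match(sk, &entry->addr, remote))
                                continue;

                        if (test_bit(mptcp_endp_get_local_id(msk, &entry->addr),
                                     unavail_id))
                                continue;

                        /* only the first usable 'address' endpoint is picked */
                        local->addr = entry->addr;
                        local->flags = entry->flags;
                        local->ifindex = entry->ifindex;

                        if (entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW) {
                                __clear_bit(local->addr.id, msk->pm.id_avail_bitmap);

                                if (local->addr.id != msk->mpc_endpoint_id)
                                        msk->pm.local_addr_used++;
                        }

                        msk->pm.extra_subflows++;
                        found = 1;
                        break;
                }
                rcu_read_unlock();

                return found;
        }

The caller in fill_local_addresses_vec() would then simply do:
'return fill_local_endp_address(msk, remote, &locals[0]);'.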

Cheers,
Matt
-- 
Sponsored by the NGI0 Core fund.

Re: [PATCH mptcp-next 5/6] mptcp: pm: in-kernel: add 'address' endpoints
Posted by Mat Martineau 2 weeks, 3 days ago
On Tue, 23 Sep 2025, Matthieu Baerts wrote:

> Hi Mat,
>
> On 23/09/2025 06:17, Mat Martineau wrote:
>> On Tue, 23 Sep 2025, Matthieu Baerts (NGI0) wrote:
>>
>>> Currently, upon the reception of an ADD_ADDR (and when the fullmesh flag
>>> is not used), the in-kernel PM creates new subflows using whatever local
>>> address the routing configuration picks.
>>>
>>> It would be easier to pick local addresses from a selected list of
>>> endpoints, each used only once, than to rely on routing rules.
>>>
>>> Use case: both the client (C) and the server (S) have two addresses (a
>>> and b). The client establishes the connection between C(a) and S(a).
>>> Once established, the server announces its additional address S(b). Once
>>> received, the client connects to it using its second address C(b).
>>> Compared to a situation without the 'address' endpoint for C(b), the
>>> client didn't use this address C(b) to establish a subflow to the
>>> server's primary address S(a). So in the end, we have:
>>>
>>>   C        S
>>>  C(a) --- S(a)
>>>  C(b) --- S(b)
>>>
>>> In case of a 3rd address on each side (C(c) and S(c)), upon the
>>> reception of an ADD_ADDR with S(c), the client should not pick C(b)
>>> because it has already been used. C(c) should then be used.
>>>
>>> Note that this situation is currently possible if C doesn't add any
>>> endpoint, but configures the routing in order to pick C(b) for the route
>>> to S(b), and pick C(c) for the route to S(c). That doesn't sound very
>>> practical, because it means knowing in advance the IP addresses that
>>> will be used and announced by the server.
>>>
>>> In the code, the new endpoint type is added. Similar to the other
>>> endpoint types, an MPTCP_INFO counter is added. While at it, holes in
>>> struct mptcp_info are now commented, as a reminder that they can no
>>> longer be filled.
>>
>> Hi Matthieu -
>>
>> I think this patch brings up a few larger topics of discussion: path
>> manager strategy (in-kernel/userspace/bpf), interaction of in-kernel PM
>> flags, and (once again!) naming.
>>
>> I'm not sure the reply chain for this patch is the right place to have
>> the discussion, but adding another in-kernel PM "mode" makes me think we
>> need a community-level (MPTCP) discussion on our path manager strategy.
>> The original plan was to have a single general in-kernel PM, and rely on
>> userspace/mptcpd for anything else. We've obviously made some changes to
>> that plan, adding fullmesh and having the BPF PM in progress. This has
>> made path management more complex to understand, use, and maintain - so
>> I want to be sure we are making a careful choice about which PM features
>> to add.
>
> Good point!
>
> (Regarding the BPF PM, it is similar to the userspace PM, but can be
> used in environments handling loads of connections in parallel; and it
> introduces a better separation between the different PMs, which makes it
> worth the maintenance cost to me.)
>
>> Our userspace API for the in-kernel PM also makes it complicated to
>> explain what happens when there is a mix of endpoint types. Typical use
>> would probably be reasonable (all fullmesh, all 'address', etc). But
>> it's good to avoid confusion, and more importantly bugs!
>
> Indeed, I fixed quite a few issues and inconsistencies last year, around
> the same time I opened this issue #503.
>
> I have to admit that the 'fullmesh' mode added quite a bit of unexpected
> complexity for such a niche use-case (but it allows "workarounds"). I
> think the new mode suggested here is simple: it targets one specific part
> of the code (the reception of an ADD_ADDR) and fixes the lack of control
> over which source IP address is used to create new subflows. It is not
> as invasive as the 'fullmesh' one.
>

Hi Matthieu -

Yes, I agree this addition is more targeted, and I also think it's a very 
useful capability. All of these small additions do add up over time, and I 
want to be sure we keep track of the "big picture" and think about that 
full context when deciding how to expand our PM features.

>> As for naming, unfortunately "address" is a very frequently used word in
>> our subsystem! In mptcpd the similar plugin is called "sspi" (single
>> subflow per interface). I'm definitely open to other ideas that are
>> identifiable and descriptive.
>
> Me too! I initially picked "add-addr", but I wasn't happy with the minus
> sign, nor with the name. I guess a good name could be "endpoint used when
> an ADD_ADDR is received", or maybe "received-add-addr", but that seems
> too long, and more than one word.
>
> I ended up picking "address", because it is short, and similar to
> "signal" and "subflow" (which are not very clear either, but it is hard
> to do better with one word...).
>

I still get 'signal' and 'subflow' mixed up sometimes and think we can 
improve our next choice :)

>> I think there are good use cases for this feature; that's why we
>> included a similar feature in mptcpd! I'd like to get our core group
>> (you, me, Geliang, and Paolo) aligned on a general direction for path
>> management, does that sound reasonable?
>
> Yes indeed, it is important.
>
> To be honest, I thought we already had this discussion at the meeting
> following the opening of this ticket #503, but that was a bit more than
> one year ago. I started working on it because the lack of control over
> which source IP address is used to create new subflows when an ADD_ADDR
> is received came back in a few discussions, even prior to #503, and even
> recently, when a client wanted to use a dedicated interface per path. It
> feels like something is missing in this in-kernel Netlink API: a way to
> use endpoints to create subflows to the addresses announced by the server.
>
> But yes, we are exposing a new option to userspace, and if it is
> accepted -- and not modified/reverted in the next 10 weeks -- we will
> have to maintain it for a long time. So better not to get it wrong. I
> think it is "self-contained" and worth it. I would like a better name
> than "address", but I didn't find one. In the end, I'm happy with it as
> it is similar to the others and will have a longer description in the
> doc, but I'm open to another name :)
>

Ok, thanks for clarifying. I think our discussion of #503 15 months ago 
was fairly brief and I'm guessing the significance of the additional 
endpoint type didn't sink in at the time. Definitely easier to understand 
the implications seeing the patches!

Will discuss the naming details in the v2 thread.

- Mat