Not required anymore, syn packets with a join requests are redirected
to pernet mptcp pseudo-listening socket.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
net/mptcp/pm_netlink.c | 65 ------------------------------------------
1 file changed, 65 deletions(-)
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index dcbc11d6b767..836326e04c4a 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -27,7 +27,6 @@ struct mptcp_pm_addr_entry {
struct mptcp_addr_info addr;
u8 flags;
int ifindex;
- struct socket *lsk;
};
struct mptcp_pm_add_entry {
@@ -883,8 +882,6 @@ static bool address_use_port(struct mptcp_pm_addr_entry *entry)
/* caller must ensure the RCU grace period is already elapsed */
static void __mptcp_pm_release_addr_entry(struct mptcp_pm_addr_entry *entry)
{
- if (entry->lsk)
- sock_release(entry->lsk);
kfree(entry);
}
@@ -972,57 +969,6 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet,
return ret;
}
-static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
- struct mptcp_pm_addr_entry *entry)
-{
- int addrlen = sizeof(struct sockaddr_in);
- struct sockaddr_storage addr;
- struct mptcp_sock *msk;
- struct socket *ssock;
- int backlog = 1024;
- int err;
-
- err = sock_create_kern(sock_net(sk), entry->addr.family,
- SOCK_STREAM, IPPROTO_MPTCP, &entry->lsk);
- if (err)
- return err;
-
- msk = mptcp_sk(entry->lsk->sk);
- if (!msk) {
- err = -EINVAL;
- goto out;
- }
-
- ssock = __mptcp_nmpc_socket(msk);
- if (!ssock) {
- err = -EINVAL;
- goto out;
- }
-
- mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family);
-#if IS_ENABLED(CONFIG_MPTCP_IPV6)
- if (entry->addr.family == AF_INET6)
- addrlen = sizeof(struct sockaddr_in6);
-#endif
- err = kernel_bind(ssock, (struct sockaddr *)&addr, addrlen);
- if (err) {
- pr_warn("kernel_bind error, err=%d", err);
- goto out;
- }
-
- err = kernel_listen(ssock, backlog);
- if (err) {
- pr_warn("kernel_listen error, err=%d", err);
- goto out;
- }
-
- return 0;
-
-out:
- sock_release(entry->lsk);
- return err;
-}
-
int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
struct mptcp_pm_addr_entry *entry;
@@ -1065,7 +1011,6 @@ int mptcp_pm_nl_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
entry->addr.port = 0;
entry->ifindex = 0;
entry->flags = MPTCP_PM_ADDR_FLAG_IMPLICIT;
- entry->lsk = NULL;
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0)
kfree(entry);
@@ -1284,19 +1229,9 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info)
}
*entry = addr;
- if (entry->addr.port) {
- ret = mptcp_pm_nl_create_listen_socket(skb->sk, entry);
- if (ret) {
- GENL_SET_ERR_MSG(info, "create listen socket error");
- kfree(entry);
- return ret;
- }
- }
ret = mptcp_pm_nl_append_new_local_addr(pernet, entry);
if (ret < 0) {
GENL_SET_ERR_MSG(info, "too many addresses or duplicate one");
- if (entry->lsk)
- sock_release(entry->lsk);
kfree(entry);
return ret;
}
--
2.34.1
Hi Florian, Thank you for your modifications, that's great! Our CI did some validations and here is its report: - KVM Validation: normal: - Unstable: 1 failed test(s): selftest_mptcp_join 🔴: - Task: https://cirrus-ci.com/task/6708418592374784 - Summary: https://api.cirrus-ci.com/v1/artifact/task/6708418592374784/summary/summary.txt - KVM Validation: debug: - Success! ✅: - Task: https://cirrus-ci.com/task/5856925177872384 - Summary: https://api.cirrus-ci.com/v1/artifact/task/5856925177872384/summary/summary.txt Initiator: Patchew Applier Commits: https://github.com/multipath-tcp/mptcp_net-next/commits/6c9d91dd089f Please note that despite all the efforts that have been already done to have a stable tests suite when executed on a public CI like here, it is possible some reported issues are not due to your modifications. Still, do not hesitate to help us improve that ;-) Cheers, MPTCP GH Action bot Bot operated by Matthieu Baerts (Tessares)
Relative patch to better explain whats happening, if we go this route
then this would be squashed/replace patch #3.
Main problem: bypasses TW infrastructure, i.e. SYN+JOIN with existing
subflow quadruple won't work.
This could be resolved by duplicating considerable code from
tcp_ipv4(6).c to mptcp_handle_join, but I don't like that either.
IOW, I would prefer to keep the sequence as-is, first do the standard
port-based demux and then, if no result was found, do the join handling.
This means subflow to TCP-only listener will fail to establish, but that
could be resolved by having a 'reserve the port' approach (mptcpd?) in
the tooling that adds the required netlink commands.
I would prefer to minimize kernel work.
Signed-off-by: Florian Westphal <fw@strlen.de>
---
include/net/mptcp.h | 17 +++++++-------
net/ipv4/tcp_ipv4.c | 12 ++++------
net/ipv6/tcp_ipv6.c | 9 ++------
net/mptcp/ctrl.c | 54 ++++++++++++++++++++++++++++++++++++---------
4 files changed, 59 insertions(+), 33 deletions(-)
diff --git a/include/net/mptcp.h b/include/net/mptcp.h
index b8939d7ea12e..cc95c279a196 100644
--- a/include/net/mptcp.h
+++ b/include/net/mptcp.h
@@ -189,7 +189,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
struct sk_buff *skb);
__be32 mptcp_get_reset_option(const struct sk_buff *skb);
-struct sock *__mptcp_handle_join(int af, struct sk_buff *skb);
+bool __mptcp_handle_join(int af, struct sk_buff *skb);
static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
{
@@ -199,17 +199,17 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
return htonl(0u);
}
-static inline struct sock *mptcp_handle_join(struct sk_buff *skb, int af)
+static inline bool mptcp_handle_join(struct sk_buff *skb, int af)
{
const struct tcphdr *th = tcp_hdr(skb);
- if (th->syn && !th->ack && !th->rst && !th->fin)
+ if (unlikely(th->syn && !th->ack && !th->rst && !th->fin))
return __mptcp_handle_join(af, skb);
- return NULL;
+ return true;
}
-static inline struct sock *mptcp_handle_join4(struct sk_buff *skb)
+static inline bool mptcp_handle_join4(struct sk_buff *skb)
{
return mptcp_handle_join(skb, AF_INET);
}
@@ -290,7 +290,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
}
static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); }
-static inline struct sock *mptcp_handle_join4(struct sk_buff *skb) { return NULL; }
+static inline bool mptcp_handle_join4(int af, struct sk_buff *skb) { return true; }
+
#endif /* CONFIG_MPTCP */
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
@@ -299,7 +300,7 @@ int mptcpv6_init_net(struct net *net);
void mptcpv6_exit_net(struct net *net);
void mptcpv6_handle_mapped(struct sock *sk, bool mapped);
-static inline struct sock *mptcp_handle_join6(struct sk_buff *skb)
+static inline bool mptcp_handle_join6(struct sk_buff *skb)
{
return mptcp_handle_join(skb, AF_INET6);
}
@@ -308,7 +309,7 @@ static inline int mptcpv6_init(void) { return 0; }
static inline int mptcpv6_init_net(struct net *net) { return 0; }
static inline void mptcpv6_exit_net(struct net *net) { }
static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
-static inline struct sock *mptcp_handle_join6(struct sk_buff *skb) { return NULL; }
+static inline bool mptcp_handle_join6(struct sk_buff *skb) { return true; }
#endif
#endif /* __NET_MPTCP_H */
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index da9ed7b0b7f5..ba685f18a767 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1949,12 +1949,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
iph = ip_hdr(skb);
+
+ if (!mptcp_handle_join4(skb))
+ goto no_tcp_socket;
+
lookup:
sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
th->dest, sdif, &refcounted);
if (!sk)
goto no_tcp_socket;
-
process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
@@ -2087,10 +2090,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- sk = mptcp_handle_join4(skb);
- if (sk)
- goto process;
-
tcp_v4_fill_cb(skb, iph, th);
if (tcp_checksum_complete(skb)) {
@@ -2137,9 +2136,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
iph->daddr, th->dest,
inet_iif(skb),
sdif);
- if (!sk2)
- sk2 = mptcp_handle_join4(skb);
-
if (sk2) {
inet_twsk_deschedule_put(inet_twsk(sk));
sk = sk2;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index a8aebfbb531e..39184d7654b1 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1615,6 +1615,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
th = (const struct tcphdr *)skb->data;
hdr = ipv6_hdr(skb);
+ if (!mptcp_handle_join6(skb))
+ goto no_tcp_socket;
lookup:
sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
th->source, th->dest, inet6_iif(skb), sdif,
@@ -1746,10 +1748,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
goto discard_it;
- sk = mptcp_handle_join6(skb);
- if (sk)
- goto process;
-
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_checksum_complete(skb)) {
@@ -1799,9 +1797,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
ntohs(th->dest),
tcp_v6_iif_l3_slave(skb),
sdif);
- if (!sk2)
- sk2 = mptcp_handle_join6(skb);
-
if (sk2) {
struct inet_timewait_sock *tw = inet_twsk(sk);
inet_twsk_deschedule_put(tw);
diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
index c7370c5147df..bf079bb50177 100644
--- a/net/mptcp/ctrl.c
+++ b/net/mptcp/ctrl.c
@@ -10,6 +10,8 @@
#include <net/net_namespace.h>
#include <net/netns/generic.h>
+#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
#include "protocol.h"
#include "mib.h"
@@ -214,30 +216,54 @@ static void add_mptcp_rst(struct sk_buff *skb)
}
}
-struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
+/* return false if tcp must pretend no socket was found. */
+bool __mptcp_handle_join(int af, struct sk_buff *skb)
{
struct mptcp_options_received mp_opt;
+ const struct ipv6hdr *ip6h;
+ const struct iphdr *iph;
+ struct sock *lsk = NULL;
+ const struct tcphdr *th;
struct mptcp_pernet *pernet;
struct mptcp_sock *msk;
struct socket *ssock;
- struct sock *lsk;
struct net *net;
/* paranoia check: don't allow 0 destination port,
* else __inet_inherit_port will insert the child socket
* into the phony hash slot of the pernet listener.
*/
- if (tcp_hdr(skb)->dest == 0)
- return NULL;
+ if (tcp_hdr(skb)->dest == 0 || skb->sk)
+ return true;
mptcp_get_options(skb, &mp_opt);
if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ))
- return NULL;
+ return true;
net = dev_net(skb_dst(skb)->dev);
if (!mptcp_is_enabled(net))
- return NULL;
+ return true;
+
+ lsk = NULL;
+ th = tcp_hdr(skb);
+ switch (af) {
+ case AF_INET:
+ iph = ip_hdr(skb);
+ lsk = inet_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), iph->saddr,
+ th->source, iph->daddr, th->dest, inet_iif(skb), inet_sdif(skb));
+ break;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ case AF_INET6:
+ ip6h = ipv6_hdr(skb);
+ lsk = inet6_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), &ip6h->saddr,
+ th->source, &ip6h->daddr, th->dest, inet6_iif(skb), inet6_sdif(skb));
+ break;
+#endif
+ }
+
+ if (lsk && sk_is_mptcp(lsk))
+ goto assign_sk;
/* RFC8684: If the token is unknown [..], the receiver will send
* back a reset (RST) signal, analogous to an unknown port in TCP,
@@ -246,14 +272,14 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
msk = mptcp_token_get_sock(net, mp_opt.token);
if (!msk) {
add_mptcp_rst(skb);
- return NULL;
+ return false; /* suppress 4-tuple based lookups */
}
if (!mptcp_pm_sport_in_anno_list(msk, af, skb)) {
sock_put((struct sock *)msk);
MPTCP_INC_STATS(net, MPTCP_MIB_MISMATCHPORTSYNRX);
add_mptcp_rst(skb);
- return NULL;
+ return false;
}
sock_put((struct sock *)msk);
@@ -270,14 +296,21 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
#endif
default:
WARN_ON_ONCE(1);
- return NULL;
+ return true;
}
ssock = __mptcp_nmpc_socket(mptcp_sk(lsk));
if (WARN_ON(!ssock))
return NULL;
- return ssock->sk;
+ lsk = ssock->sk;
+
+assign_sk:
+ WARN_ON_ONCE(sk_is_refcounted(lsk));
+
+ skb->sk = lsk;
+ skb->destructor = sock_pfree;
+ return true;
}
static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
@@ -297,6 +330,7 @@ static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
ssock->sk->sk_max_ack_backlog = SOMAXCONN;
inet_sk_state_store(ssock->sk, TCP_LISTEN);
+ sock_set_flag(ssock->sk, SOCK_RCU_FREE);
s->sk->sk_max_ack_backlog = SOMAXCONN;
inet_sk_state_store(s->sk, TCP_LISTEN);
--
2.34.1
On Wed, 2 Mar 2022, Florian Westphal wrote:
> Relative patch to better explain whats happening, if we go this route
> then this would be squashed/replace patch #3.
>
Thanks for trying this out and explaining the tradeoffs. I think the
runtime overhead for early join handling is less than I was originally
thinking. This approach does seem to touch less TCP code but is a little
more expensive at runtime.
> Main problem: bypasses TW infrastructure, i.e. SYN+JOIN with existing
> subflow quadruple won't work.
>
> This could be resolved by duplicating considerable code from
> tcp_ipv4(6).c to mptcp_handle_join, but I don't like that either.
>
Agreed on wanting to avoid that.
> IOW, I would prefer to keep the sequence as-is, first do the standard
> port-based demux and then, if no result was found, do the join handling.
>
> This means subflow to TCP-only listener will fail to establish, but that
> could be resolved by having a 'reserve the port' approach (mptcpd?) in
> the tooling that adds the required netlink commands.
>
I'm not sure what this would look like? Seems like a bit of kernel work in
itself. Not sure if you want to explain here on this list or cover it in
the meeting.
> I would prefer to minimize kernel work.
>
I hear you :)
-Mat
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> include/net/mptcp.h | 17 +++++++-------
> net/ipv4/tcp_ipv4.c | 12 ++++------
> net/ipv6/tcp_ipv6.c | 9 ++------
> net/mptcp/ctrl.c | 54 ++++++++++++++++++++++++++++++++++++---------
> 4 files changed, 59 insertions(+), 33 deletions(-)
>
> diff --git a/include/net/mptcp.h b/include/net/mptcp.h
> index b8939d7ea12e..cc95c279a196 100644
> --- a/include/net/mptcp.h
> +++ b/include/net/mptcp.h
> @@ -189,7 +189,7 @@ int mptcp_subflow_init_cookie_req(struct request_sock *req,
> struct sk_buff *skb);
>
> __be32 mptcp_get_reset_option(const struct sk_buff *skb);
> -struct sock *__mptcp_handle_join(int af, struct sk_buff *skb);
> +bool __mptcp_handle_join(int af, struct sk_buff *skb);
>
> static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
> {
> @@ -199,17 +199,17 @@ static inline __be32 mptcp_reset_option(const struct sk_buff *skb)
> return htonl(0u);
> }
>
> -static inline struct sock *mptcp_handle_join(struct sk_buff *skb, int af)
> +static inline bool mptcp_handle_join(struct sk_buff *skb, int af)
> {
> const struct tcphdr *th = tcp_hdr(skb);
>
> - if (th->syn && !th->ack && !th->rst && !th->fin)
> + if (unlikely(th->syn && !th->ack && !th->rst && !th->fin))
> return __mptcp_handle_join(af, skb);
>
> - return NULL;
> + return true;
> }
>
> -static inline struct sock *mptcp_handle_join4(struct sk_buff *skb)
> +static inline bool mptcp_handle_join4(struct sk_buff *skb)
> {
> return mptcp_handle_join(skb, AF_INET);
> }
> @@ -290,7 +290,8 @@ static inline int mptcp_subflow_init_cookie_req(struct request_sock *req,
> }
>
> static inline __be32 mptcp_reset_option(const struct sk_buff *skb) { return htonl(0u); }
> -static inline struct sock *mptcp_handle_join4(struct sk_buff *skb) { return NULL; }
> +static inline bool mptcp_handle_join4(int af, struct sk_buff *skb) { return true; }
> +
> #endif /* CONFIG_MPTCP */
>
> #if IS_ENABLED(CONFIG_MPTCP_IPV6)
> @@ -299,7 +300,7 @@ int mptcpv6_init_net(struct net *net);
> void mptcpv6_exit_net(struct net *net);
> void mptcpv6_handle_mapped(struct sock *sk, bool mapped);
>
> -static inline struct sock *mptcp_handle_join6(struct sk_buff *skb)
> +static inline bool mptcp_handle_join6(struct sk_buff *skb)
> {
> return mptcp_handle_join(skb, AF_INET6);
> }
> @@ -308,7 +309,7 @@ static inline int mptcpv6_init(void) { return 0; }
> static inline int mptcpv6_init_net(struct net *net) { return 0; }
> static inline void mptcpv6_exit_net(struct net *net) { }
> static inline void mptcpv6_handle_mapped(struct sock *sk, bool mapped) { }
> -static inline struct sock *mptcp_handle_join6(struct sk_buff *skb) { return NULL; }
> +static inline bool mptcp_handle_join6(struct sk_buff *skb) { return true; }
> #endif
>
> #endif /* __NET_MPTCP_H */
> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
> index da9ed7b0b7f5..ba685f18a767 100644
> --- a/net/ipv4/tcp_ipv4.c
> +++ b/net/ipv4/tcp_ipv4.c
> @@ -1949,12 +1949,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
>
> th = (const struct tcphdr *)skb->data;
> iph = ip_hdr(skb);
> +
> + if (!mptcp_handle_join4(skb))
> + goto no_tcp_socket;
> +
> lookup:
> sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
> th->dest, sdif, &refcounted);
> if (!sk)
> goto no_tcp_socket;
> -
> process:
> if (sk->sk_state == TCP_TIME_WAIT)
> goto do_time_wait;
> @@ -2087,10 +2090,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
> if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
> goto discard_it;
>
> - sk = mptcp_handle_join4(skb);
> - if (sk)
> - goto process;
> -
> tcp_v4_fill_cb(skb, iph, th);
>
> if (tcp_checksum_complete(skb)) {
> @@ -2137,9 +2136,6 @@ int tcp_v4_rcv(struct sk_buff *skb)
> iph->daddr, th->dest,
> inet_iif(skb),
> sdif);
> - if (!sk2)
> - sk2 = mptcp_handle_join4(skb);
> -
> if (sk2) {
> inet_twsk_deschedule_put(inet_twsk(sk));
> sk = sk2;
> diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
> index a8aebfbb531e..39184d7654b1 100644
> --- a/net/ipv6/tcp_ipv6.c
> +++ b/net/ipv6/tcp_ipv6.c
> @@ -1615,6 +1615,8 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> th = (const struct tcphdr *)skb->data;
> hdr = ipv6_hdr(skb);
>
> + if (!mptcp_handle_join6(skb))
> + goto no_tcp_socket;
> lookup:
> sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
> th->source, th->dest, inet6_iif(skb), sdif,
> @@ -1746,10 +1748,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb))
> goto discard_it;
>
> - sk = mptcp_handle_join6(skb);
> - if (sk)
> - goto process;
> -
> tcp_v6_fill_cb(skb, hdr, th);
>
> if (tcp_checksum_complete(skb)) {
> @@ -1799,9 +1797,6 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb)
> ntohs(th->dest),
> tcp_v6_iif_l3_slave(skb),
> sdif);
> - if (!sk2)
> - sk2 = mptcp_handle_join6(skb);
> -
> if (sk2) {
> struct inet_timewait_sock *tw = inet_twsk(sk);
> inet_twsk_deschedule_put(tw);
> diff --git a/net/mptcp/ctrl.c b/net/mptcp/ctrl.c
> index c7370c5147df..bf079bb50177 100644
> --- a/net/mptcp/ctrl.c
> +++ b/net/mptcp/ctrl.c
> @@ -10,6 +10,8 @@
>
> #include <net/net_namespace.h>
> #include <net/netns/generic.h>
> +#include <net/inet_hashtables.h>
> +#include <net/inet6_hashtables.h>
>
> #include "protocol.h"
> #include "mib.h"
> @@ -214,30 +216,54 @@ static void add_mptcp_rst(struct sk_buff *skb)
> }
> }
>
> -struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> +/* return false if tcp must pretend no socket was found. */
> +bool __mptcp_handle_join(int af, struct sk_buff *skb)
> {
> struct mptcp_options_received mp_opt;
> + const struct ipv6hdr *ip6h;
> + const struct iphdr *iph;
> + struct sock *lsk = NULL;
> + const struct tcphdr *th;
> struct mptcp_pernet *pernet;
> struct mptcp_sock *msk;
> struct socket *ssock;
> - struct sock *lsk;
> struct net *net;
>
> /* paranoia check: don't allow 0 destination port,
> * else __inet_inherit_port will insert the child socket
> * into the phony hash slot of the pernet listener.
> */
> - if (tcp_hdr(skb)->dest == 0)
> - return NULL;
> + if (tcp_hdr(skb)->dest == 0 || skb->sk)
> + return true;
>
> mptcp_get_options(skb, &mp_opt);
>
> if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ))
> - return NULL;
> + return true;
>
> net = dev_net(skb_dst(skb)->dev);
> if (!mptcp_is_enabled(net))
> - return NULL;
> + return true;
> +
> + lsk = NULL;
> + th = tcp_hdr(skb);
> + switch (af) {
> + case AF_INET:
> + iph = ip_hdr(skb);
> + lsk = inet_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), iph->saddr,
> + th->source, iph->daddr, th->dest, inet_iif(skb), inet_sdif(skb));
> + break;
> +#if IS_ENABLED(CONFIG_MPTCP_IPV6)
> + case AF_INET6:
> + ip6h = ipv6_hdr(skb);
> + lsk = inet6_lookup_listener(net, &tcp_hashinfo, skb, __tcp_hdrlen(th), &ip6h->saddr,
> + th->source, &ip6h->daddr, th->dest, inet6_iif(skb), inet6_sdif(skb));
> + break;
> +#endif
> + }
> +
> + if (lsk && sk_is_mptcp(lsk))
> + goto assign_sk;
>
> /* RFC8684: If the token is unknown [..], the receiver will send
> * back a reset (RST) signal, analogous to an unknown port in TCP,
> @@ -246,14 +272,14 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> msk = mptcp_token_get_sock(net, mp_opt.token);
> if (!msk) {
> add_mptcp_rst(skb);
> - return NULL;
> + return false; /* suppress 4-tuple based lookups */
> }
>
> if (!mptcp_pm_sport_in_anno_list(msk, af, skb)) {
> sock_put((struct sock *)msk);
> MPTCP_INC_STATS(net, MPTCP_MIB_MISMATCHPORTSYNRX);
> add_mptcp_rst(skb);
> - return NULL;
> + return false;
> }
>
> sock_put((struct sock *)msk);
> @@ -270,14 +296,21 @@ struct sock *__mptcp_handle_join(int af, struct sk_buff *skb)
> #endif
> default:
> WARN_ON_ONCE(1);
> - return NULL;
> + return true;
> }
>
> ssock = __mptcp_nmpc_socket(mptcp_sk(lsk));
> if (WARN_ON(!ssock))
> return NULL;
>
> - return ssock->sk;
> + lsk = ssock->sk;
> +
> +assign_sk:
> + WARN_ON_ONCE(sk_is_refcounted(lsk));
> +
> + skb->sk = lsk;
> + skb->destructor = sock_pfree;
> + return true;
> }
>
> static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
> @@ -297,6 +330,7 @@ static struct socket *mptcp_create_join_listen_socket(struct net *net, int af)
>
> ssock->sk->sk_max_ack_backlog = SOMAXCONN;
> inet_sk_state_store(ssock->sk, TCP_LISTEN);
> + sock_set_flag(ssock->sk, SOCK_RCU_FREE);
>
> s->sk->sk_max_ack_backlog = SOMAXCONN;
> inet_sk_state_store(s->sk, TCP_LISTEN);
> --
> 2.34.1
>
>
>
--
Mat Martineau
Intel
Mat Martineau <mathew.j.martineau@linux.intel.com> wrote: > > This means subflow to TCP-only listener will fail to establish, but that > > could be resolved by having a 'reserve the port' approach (mptcpd?) in > > the tooling that adds the required netlink commands. > > > > I'm not sure what this would look like? Seems like a bit of kernel work in > itself. Not sure if you want to explain here on this list or cover it in the > meeting. I meant something really crude^W simple, i.e. socket+bind before telling kernel that we can receive subflows on x:y.
© 2016 - 2026 Red Hat, Inc.