When ADD_ADDR announcements use the port associated with an
active subflow, this change ensures that a listening socket is
bound to the announced address and port for subsequently
receiving MP_JOINs from the remote end. In case there's
a recorded lsk bound to that address+port, it is reused.
But if a listening socket for this address is already held by the
application then no further action is taken.
When a listening socket is created, it is stored in
struct mptcp_pm_add_entry and released accordingly.
Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/203
v2: fixed formatting
Signed-off-by: Kishen Maloor <kishen.maloor@intel.com>
---
net/mptcp/pm_netlink.c | 56 ++++++++++++++++++++++++++++++++++++++++--
1 file changed, 54 insertions(+), 2 deletions(-)
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index 779ec9d375f0..e2211f3b8c8c 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -43,6 +43,7 @@ struct mptcp_pm_add_entry {
struct mptcp_addr_info addr;
struct timer_list add_timer;
struct mptcp_sock *sock;
+ struct mptcp_local_lsk *lsk_ref;
u8 retrans_times;
};
@@ -66,6 +67,10 @@ struct pm_nl_pernet {
#define MPTCP_PM_ADDR_MAX 8
#define ADD_ADDR_RETRANS_MAX 3
+static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
+ struct mptcp_pm_addr_entry *entry,
+ struct socket **lsk);
+
static bool addresses_equal(const struct mptcp_addr_info *a,
const struct mptcp_addr_info *b, bool use_port)
{
@@ -438,7 +443,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk,
}
static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
- struct mptcp_pm_addr_entry *entry)
+ struct mptcp_pm_addr_entry *entry,
+ struct mptcp_local_lsk *lsk_ref)
{
struct mptcp_pm_add_entry *add_entry = NULL;
struct sock *sk = (struct sock *)msk;
@@ -458,6 +464,10 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk,
add_entry->addr = entry->addr;
add_entry->sock = msk;
add_entry->retrans_times = 0;
+ add_entry->lsk_ref = lsk_ref;
+
+ if (lsk_ref)
+ lsk_list_add_ref(lsk_ref);
timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0);
sk_reset_timer(sk, &add_entry->add_timer,
@@ -470,8 +480,11 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
{
struct mptcp_pm_add_entry *entry, *tmp;
struct sock *sk = (struct sock *)msk;
+ struct pm_nl_pernet *pernet;
LIST_HEAD(free_list);
+ pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
+
pr_debug("msk=%p", msk);
spin_lock_bh(&msk->pm.lock);
@@ -480,6 +493,8 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk)
list_for_each_entry_safe(entry, tmp, &free_list, list) {
sk_stop_timer_sync(sk, &entry->add_timer);
+ if (entry->lsk_ref)
+ lsk_list_release(pernet, entry->lsk_ref);
kfree(entry);
}
}
@@ -570,13 +585,16 @@ lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *add
}
static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
+ __must_hold(&msk->pm.lock)
{
+ struct mptcp_local_lsk *lsk_ref = NULL;
struct sock *sk = (struct sock *)msk;
struct mptcp_pm_addr_entry *local;
unsigned int add_addr_signal_max;
unsigned int local_addr_max;
struct pm_nl_pernet *pernet;
unsigned int subflows_max;
+ struct socket *lsk;
pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
@@ -607,12 +625,39 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk)
local = select_signal_address(pernet, msk);
if (local) {
- if (mptcp_pm_alloc_anno_list(msk, local)) {
+ if (!local->addr.port) {
+ local->addr.port =
+ ((struct inet_sock *)inet_sk
+ ((struct sock *)msk))->inet_sport;
+
+ lsk_ref = lsk_list_find(pernet, &local->addr);
+
+ if (!lsk_ref) {
+ spin_unlock_bh(&msk->pm.lock);
+
+ mptcp_pm_nl_create_listen_socket(sk, local, &lsk);
+
+ spin_lock_bh(&msk->pm.lock);
+
+ if (lsk)
+ lsk_ref = lsk_list_add(pernet, &local->addr, lsk);
+
+ if (lsk && !lsk_ref)
+ sock_release(lsk);
+ }
+
+ local->addr.port = 0;
+ }
+
+ if (mptcp_pm_alloc_anno_list(msk, local, lsk_ref)) {
__clear_bit(local->addr.id, msk->pm.id_avail_bitmap);
msk->pm.add_addr_signaled++;
mptcp_pm_announce_addr(msk, &local->addr, false);
mptcp_pm_nl_addr_send_ack(msk);
}
+
+ if (lsk_ref)
+ lsk_list_release(pernet, lsk_ref);
}
}
@@ -704,6 +749,7 @@ static unsigned int fill_local_addresses_vec(struct mptcp_sock *msk,
}
static void mptcp_pm_nl_add_addr_received(struct mptcp_sock *msk)
+ __must_hold(&msk->pm.lock)
{
struct mptcp_addr_info addrs[MPTCP_PM_ADDR_MAX];
struct sock *sk = (struct sock *)msk;
@@ -1352,11 +1398,17 @@ int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id,
static bool remove_anno_list_by_saddr(struct mptcp_sock *msk,
struct mptcp_addr_info *addr)
{
+ struct sock *sk = (struct sock *)msk;
struct mptcp_pm_add_entry *entry;
+ struct pm_nl_pernet *pernet;
+
+ pernet = net_generic(sock_net(sk), pm_nl_pernet_id);
entry = mptcp_pm_del_add_timer(msk, addr, false);
if (entry) {
list_del(&entry->list);
+ if (entry->lsk_ref)
+ lsk_list_release(pernet, entry->lsk_ref);
kfree(entry);
return true;
}
--
2.31.1
Hello, On Wed, 2022-01-12 at 17:15 -0500, Kishen Maloor wrote: > When ADD_ADDR announcements use the port associated with an > active subflow, this change ensures that a listening socket is > bound to the announced address and port for subsequently > receiving MP_JOINs from the remote end. In case there's > a recorded lsk bound to that address+port, it is reused. > But if a listening socket for this address is already held by the > application then no further action is taken. > > When a listening socket is created, it is stored in > struct mptcp_pm_add_entry and released accordingly. > > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/203 > > v2: fixed formatting > > Signed-off-by: Kishen Maloor <kishen.maloor@intel.com> should be either: """ <changelog> <tags> """ or: """ <tags> --- <changelog> """ we usually keep the changelog outside the commit message for development history before landing on netdev, that is: """ Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/203 Signed-off-by: Kishen Maloor <kishen.maloor@intel.com> --- v2: fixed formatting """ > --- > net/mptcp/pm_netlink.c | 56 ++++++++++++++++++++++++++++++++++++++++-- > 1 file changed, 54 insertions(+), 2 deletions(-) > > diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c > index 779ec9d375f0..e2211f3b8c8c 100644 > --- a/net/mptcp/pm_netlink.c > +++ b/net/mptcp/pm_netlink.c > @@ -43,6 +43,7 @@ struct mptcp_pm_add_entry { > struct mptcp_addr_info addr; > struct timer_list add_timer; > struct mptcp_sock *sock; > + struct mptcp_local_lsk *lsk_ref; > u8 retrans_times; > }; > > @@ -66,6 +67,10 @@ struct pm_nl_pernet { > #define MPTCP_PM_ADDR_MAX 8 > #define ADD_ADDR_RETRANS_MAX 3 > > +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, > + struct mptcp_pm_addr_entry *entry, > + struct socket **lsk); > + > static bool addresses_equal(const struct mptcp_addr_info *a, > const struct mptcp_addr_info *b, bool use_port) > { > @@ -438,7 +443,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, > } > > static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, > - struct mptcp_pm_addr_entry *entry) > + struct mptcp_pm_addr_entry *entry, > + struct mptcp_local_lsk *lsk_ref) > { > struct mptcp_pm_add_entry *add_entry = NULL; > struct sock *sk = (struct sock *)msk; > @@ -458,6 +464,10 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, > add_entry->addr = entry->addr; > add_entry->sock = msk; > add_entry->retrans_times = 0; > + add_entry->lsk_ref = lsk_ref; > + > + if (lsk_ref) > + lsk_list_add_ref(lsk_ref); > > timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); > sk_reset_timer(sk, &add_entry->add_timer, > @@ -470,8 +480,11 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) > { > struct mptcp_pm_add_entry *entry, *tmp; > struct sock *sk = (struct sock *)msk; > + struct pm_nl_pernet *pernet; > LIST_HEAD(free_list); > > + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); > + > pr_debug("msk=%p", msk); > > spin_lock_bh(&msk->pm.lock); > @@ -480,6 +493,8 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) > > list_for_each_entry_safe(entry, tmp, &free_list, list) { > sk_stop_timer_sync(sk, &entry->add_timer); > + if (entry->lsk_ref) > + lsk_list_release(pernet, entry->lsk_ref); > kfree(entry); > } > } > @@ -570,13 +585,16 @@ lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *add > } > > static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) > + __must_hold(&msk->pm.lock) > { > + struct mptcp_local_lsk *lsk_ref = NULL; > struct sock *sk = (struct sock *)msk; > struct mptcp_pm_addr_entry *local; > unsigned int add_addr_signal_max; > unsigned int local_addr_max; > struct pm_nl_pernet *pernet; > unsigned int subflows_max; > + struct socket *lsk; > > pernet = net_generic(sock_net(sk), pm_nl_pernet_id); > > @@ -607,12 +625,39 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) > local = select_signal_address(pernet, msk); > > if (local) { > - if (mptcp_pm_alloc_anno_list(msk, local)) { > + if (!local->addr.port) { > + local->addr.port = > + ((struct inet_sock *)inet_sk > + ((struct sock *)msk))->inet_sport; > + > + lsk_ref = lsk_list_find(pernet, &local->addr); > + > + if (!lsk_ref) { > + spin_unlock_bh(&msk->pm.lock); > + > + mptcp_pm_nl_create_listen_socket(sk, local, &lsk); > + > + spin_lock_bh(&msk->pm.lock); > + > + if (lsk) > + lsk_ref = lsk_list_add(pernet, &local->addr, lsk); > + > + if (lsk && !lsk_ref) > + sock_release(lsk); Let's suppose an user-space application listen on 2 different address (A, B) and does: """ s1 = socket() bind(s1, A) listen(s1) // at this point incoming MPTCP connection can be established on s1 // and ADD_ADDR sub-options could be sent back s2 = socket() bind(s2, B) listen(s2) """ If there is a signal endpoint on B, the above listen can race with the mptcp_pm_nl_create_listen_socket() call, leading to hard to track startup issues for user-space application. I really think we want at list a configuration option, off by default, for this feature. Some specific self-test would be a plus. It will help reviewing, splitting this series in at least 2 chunks: * pre-reqs up to ~this patch * user-space PM specific stuff Side note: it would be nice reducing the level of intentation, e.g. factoring-out part of the inner code in some helper. > + } > + > + local->addr.port = 0; > + } > + > + if (mptcp_pm_alloc_anno_list(msk, local, lsk_ref)) { > __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); > msk->pm.add_addr_signaled++; > mptcp_pm_announce_addr(msk, &local->addr, false); > mptcp_pm_nl_addr_send_ack(msk); > } > + > + if (lsk_ref) > + lsk_list_release(pernet, lsk_ref); Probaly not very relevant, but something alike: rcu_read_lock() lsk_ref = __lsk_list_find(); if (lst_ref) if (mptcp_pm_alloc_anno_list(...) rcu_read_unlock() would save a pair of possibly contended atomic operations in the common case. Thanks! Paolo
Hi Kishen, Paolo, On 14/01/2022 16:54, Paolo Abeni wrote: > I really think we want at list a configuration option, off by default, > for this feature. Some specific self-test would be a plus. > > It will help reviewing, splitting this series in at least 2 chunks: > * pre-reqs up to ~this patch > * user-space PM specific stuff Good idea, I think it will help indeed. And it will be required when we will send these patches to netdev. For me, it makes sense to have more features only for the userspace PM and of course more flexibility. But we still need to make sure the in-kernel PM can be controlled to avoid some behaviours. For example, when sending the command to emit an ADD_ADDR, a parameter could be used to (try to) create a listening socket as well. I think an in-kernel PM makes sense for a typical server mainly having to send ADD_ADDRs and accept subflows. Having a userspace PM daemon can be costly for such "basic task". Also a server could also send an ADD_ADDR for an IP that it doesn't directly "own" and we might save some resources by not creating or trying to create listening sockets. We could also have a new flag with 'ip mptcp' to create listening sockets for the IP we want to "signal". I mean: even if it is now easier to create listening sockets, we might not want to do it all the time. Cheers, Matt -- Tessares | Belgium | Hybrid Access Solutions www.tessares.net
Hi Paolo, Matthieu, On 1/14/22 7:54 AM, Paolo Abeni wrote: > Hello, > > On Wed, 2022-01-12 at 17:15 -0500, Kishen Maloor wrote: >> When ADD_ADDR announcements use the port associated with an >> active subflow, this change ensures that a listening socket is >> bound to the announced address and port for subsequently >> receiving MP_JOINs from the remote end. In case there's >> a recorded lsk bound to that address+port, it is reused. >> But if a listening socket for this address is already held by the >> application then no further action is taken. >> >> When a listening socket is created, it is stored in >> struct mptcp_pm_add_entry and released accordingly. >> >> Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/203 >> >> v2: fixed formatting >> >> Signed-off-by: Kishen Maloor <kishen.maloor@intel.com> > > should be either: > > """ > <changelog> > > <tags> > """ > > or: > > """ > <tags> > --- > <changelog> > """ > > we usually keep the changelog outside the commit message for > development history before landing on netdev, that is: Thanks! I shall reflect this change in the related commit messages. > > """ > Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/203 > Signed-off-by: Kishen Maloor <kishen.maloor@intel.com> > --- > v2: fixed formatting > """ > >> --- >> net/mptcp/pm_netlink.c | 56 ++++++++++++++++++++++++++++++++++++++++-- >> 1 file changed, 54 insertions(+), 2 deletions(-) >> >> diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c >> index 779ec9d375f0..e2211f3b8c8c 100644 >> --- a/net/mptcp/pm_netlink.c >> +++ b/net/mptcp/pm_netlink.c >> @@ -43,6 +43,7 @@ struct mptcp_pm_add_entry { >> struct mptcp_addr_info addr; >> struct timer_list add_timer; >> struct mptcp_sock *sock; >> + struct mptcp_local_lsk *lsk_ref; >> u8 retrans_times; >> }; >> >> @@ -66,6 +67,10 @@ struct pm_nl_pernet { >> #define MPTCP_PM_ADDR_MAX 8 >> #define ADD_ADDR_RETRANS_MAX 3 >> >> +static int mptcp_pm_nl_create_listen_socket(struct sock *sk, >> + struct mptcp_pm_addr_entry *entry, >> + struct socket **lsk); >> + >> static bool addresses_equal(const struct mptcp_addr_info *a, >> const struct mptcp_addr_info *b, bool use_port) >> { >> @@ -438,7 +443,8 @@ mptcp_pm_del_add_timer(struct mptcp_sock *msk, >> } >> >> static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, >> - struct mptcp_pm_addr_entry *entry) >> + struct mptcp_pm_addr_entry *entry, >> + struct mptcp_local_lsk *lsk_ref) >> { >> struct mptcp_pm_add_entry *add_entry = NULL; >> struct sock *sk = (struct sock *)msk; >> @@ -458,6 +464,10 @@ static bool mptcp_pm_alloc_anno_list(struct mptcp_sock *msk, >> add_entry->addr = entry->addr; >> add_entry->sock = msk; >> add_entry->retrans_times = 0; >> + add_entry->lsk_ref = lsk_ref; >> + >> + if (lsk_ref) >> + lsk_list_add_ref(lsk_ref); >> >> timer_setup(&add_entry->add_timer, mptcp_pm_add_timer, 0); >> sk_reset_timer(sk, &add_entry->add_timer, >> @@ -470,8 +480,11 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) >> { >> struct mptcp_pm_add_entry *entry, *tmp; >> struct sock *sk = (struct sock *)msk; >> + struct pm_nl_pernet *pernet; >> LIST_HEAD(free_list); >> >> + pernet = net_generic(sock_net(sk), pm_nl_pernet_id); >> + >> pr_debug("msk=%p", msk); >> >> spin_lock_bh(&msk->pm.lock); >> @@ -480,6 +493,8 @@ void mptcp_pm_free_anno_list(struct mptcp_sock *msk) >> >> list_for_each_entry_safe(entry, tmp, &free_list, list) { >> sk_stop_timer_sync(sk, &entry->add_timer); >> + if (entry->lsk_ref) >> + lsk_list_release(pernet, entry->lsk_ref); >> kfree(entry); >> } >> } >> @@ -570,13 +585,16 @@ lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *add >> } >> >> static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) >> + __must_hold(&msk->pm.lock) >> { >> + struct mptcp_local_lsk *lsk_ref = NULL; >> struct sock *sk = (struct sock *)msk; >> struct mptcp_pm_addr_entry *local; >> unsigned int add_addr_signal_max; >> unsigned int local_addr_max; >> struct pm_nl_pernet *pernet; >> unsigned int subflows_max; >> + struct socket *lsk; >> >> pernet = net_generic(sock_net(sk), pm_nl_pernet_id); >> >> @@ -607,12 +625,39 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) >> local = select_signal_address(pernet, msk); >> >> if (local) { >> - if (mptcp_pm_alloc_anno_list(msk, local)) { >> + if (!local->addr.port) { >> + local->addr.port = >> + ((struct inet_sock *)inet_sk >> + ((struct sock *)msk))->inet_sport; >> + >> + lsk_ref = lsk_list_find(pernet, &local->addr); >> + >> + if (!lsk_ref) { >> + spin_unlock_bh(&msk->pm.lock); >> + >> + mptcp_pm_nl_create_listen_socket(sk, local, &lsk); >> + >> + spin_lock_bh(&msk->pm.lock); >> + >> + if (lsk) >> + lsk_ref = lsk_list_add(pernet, &local->addr, lsk); >> + >> + if (lsk && !lsk_ref) >> + sock_release(lsk); > > Let's suppose an user-space application listen on 2 different address > (A, B) and does: > > """ > s1 = socket() > bind(s1, A) > listen(s1) > // at this point incoming MPTCP connection can be established on s1 > // and ADD_ADDR sub-options could be sent back > > s2 = socket() > bind(s2, B) > listen(s2) > """ > > If there is a signal endpoint on B, the above listen can race with the > mptcp_pm_nl_create_listen_socket() call, leading to hard to track > startup issues for user-space application. > > I really think we want at list a configuration option, off by default, > for this feature. Some specific self-test would be a plus. Looking at your example above, assuming both A and B are bound to the same port then yes, a race such as you suggest could occur. But if you consider bug #203, it arose only because there was no listener in the application (and unexpectedly so). So if the path manager creates a listener (at the time of the address advertisement) to facilitate MPJs then that would have the usual side effects of creating listeners (in general). For e.g,. I think this clash could also occur with the existing code in the kernel PM and using a port-based endpoint when the app happens to bind a socket to that specific addr+port. The other scenario where the path manager needs to always establish a listener is when running alongside an MPTCP client. We could certainly add an "attempt to create lsk" option to the ADD_ADDR netlink commands, as I believe you've both suggested, but perhaps we should think further about the guidance regarding usage of this option. For e.g., if creating lsks is not the default behavior, then bug #203 would persist unless the entity that issues the ADD_ADDR command exercises this option. > > It will help reviewing, splitting this series in at least 2 chunks: > * pre-reqs up to ~this patch > * user-space PM specific stuff > > Side note: it would be nice reducing the level of intentation, e.g. > factoring-out part of the inner code in some helper. > >> + } >> + >> + local->addr.port = 0; >> + } >> + >> + if (mptcp_pm_alloc_anno_list(msk, local, lsk_ref)) { >> __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); >> msk->pm.add_addr_signaled++; >> mptcp_pm_announce_addr(msk, &local->addr, false); >> mptcp_pm_nl_addr_send_ack(msk); >> } >> + >> + if (lsk_ref) >> + lsk_list_release(pernet, lsk_ref); > > Probaly not very relevant, but something alike: > > rcu_read_lock() > lsk_ref = __lsk_list_find(); > if (lst_ref) > if (mptcp_pm_alloc_anno_list(...) > rcu_read_unlock() > > would save a pair of possibly contended atomic operations in the common > case. > > Thanks! > > Paolo >
On Fri, 14 Jan 2022, Paolo Abeni wrote: ... > It will help reviewing, splitting this series in at least 2 chunks: > * pre-reqs up to ~this patch > * user-space PM specific stuff > Seems like the way to go here is to split patches 1-6 into another series, and have patch 7 as a standalone (or separate series w/ selftest) for mptcp-net. -- Mat Martineau Intel
© 2016 - 2022 Red Hat, Inc.