[v5] udp: Add 4-tuple hash for connected sockets

[PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Philo Lu 1 year, 3 months ago

Currently, the udp_table has two hash tables, the port hash and portaddr
hash. Usually for UDP servers, all sockets have the same local port and
addr, so they are all on the same hash slot within a reuseport group.

In some applications, UDP servers use connect() to manage clients. In
particular, when first receiving from an unseen 4-tuple, a new socket
is created and connect()ed to the remote addr:port, and then the fd is
used exclusively for that client.

Once there are connected sks in a reuseport group, udp has to score all
sks in the same hash2 slot to find the best match. This could be
inefficient with a large number of connections, resulting in high
softirq overhead.

To solve the problem, this patch implements a 4-tuple hash for connected
udp sockets. During connect(), the hash4 slot is updated, as well as a
corresponding counter, hash4_cnt, in hslot2. In __udp4_lib_lookup(),
hslot4 will be searched first if the counter is non-zero. Otherwise,
hslot2 is used as before. Note that only connected sockets enter this
hash4 path, while un-connected ones are not affected.

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Cambda Zhu <cambda@linux.alibaba.com>
Signed-off-by: Fred Chen <fred.cc@alibaba-inc.com>
Signed-off-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
---
 include/net/udp.h |   3 +-
 net/ipv4/udp.c    | 171 +++++++++++++++++++++++++++++++++++++++++++++-
 net/ipv6/udp.c    |   2 +-
 3 files changed, 171 insertions(+), 5 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 8aefdc404362..97c5ae83723c 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -293,7 +293,7 @@ static inline int udp_lib_hash(struct sock *sk)
 }
 
 void udp_lib_unhash(struct sock *sk);
-void udp_lib_rehash(struct sock *sk, u16 new_hash);
+void udp_lib_rehash(struct sock *sk, u16 new_hash, u16 new_hash4);
 
 static inline void udp_lib_close(struct sock *sk, long timeout)
 {
@@ -386,6 +386,7 @@ int udp_rcv(struct sk_buff *skb);
 int udp_ioctl(struct sock *sk, int cmd, int *karg);
 int udp_init_sock(struct sock *sk);
 int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
+int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
 int __udp_disconnect(struct sock *sk, int flags);
 int udp_disconnect(struct sock *sk, int flags);
 __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 74bfab0f44f8..5d944cec7a27 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -478,6 +478,134 @@ static struct sock *udp4_lib_lookup2(const struct net *net,
 	return result;
 }
 
+#if IS_ENABLED(CONFIG_BASE_SMALL)
+static struct sock *udp4_lib_lookup4(const struct net *net,
+				     __be32 saddr, __be16 sport,
+				     __be32 daddr, unsigned int hnum,
+				     int dif, int sdif,
+				     struct udp_table *udptable)
+{
+	return NULL;
+}
+
+static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
+{
+}
+
+static void udp4_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+}
+
+static void udp4_hash4(struct sock *sk)
+{
+}
+#else /* !CONFIG_BASE_SMALL */
+static struct sock *udp4_lib_lookup4(const struct net *net,
+				     __be32 saddr, __be16 sport,
+				     __be32 daddr, unsigned int hnum,
+				     int dif, int sdif,
+				     struct udp_table *udptable)
+{
+	unsigned int hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport);
+	const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
+	struct udp_hslot *hslot4 = udp_hashslot4(udptable, hash4);
+	struct udp_sock *up;
+	struct sock *sk;
+
+	INET_ADDR_COOKIE(acookie, saddr, daddr);
+	udp_lrpa_for_each_entry_rcu(up, &hslot4->head) {
+		sk = (struct sock *)up;
+		if (inet_match(net, sk, acookie, ports, dif, sdif))
+			return sk;
+	}
+	return NULL;
+}
+
+/* In hash4, rehash can also happen in connect(), where hash4_cnt keeps unchanged. */
+static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
+{
+	struct udp_hslot *hslot4, *nhslot4;
+
+	hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+	nhslot4 = udp_hashslot4(udptable, newhash4);
+	udp_sk(sk)->udp_lrpa_hash = newhash4;
+
+	if (hslot4 != nhslot4) {
+		spin_lock_bh(&hslot4->lock);
+		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+		hslot4->count--;
+		spin_unlock_bh(&hslot4->lock);
+
+		synchronize_rcu();
+
+		spin_lock_bh(&nhslot4->lock);
+		hlist_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->head);
+		nhslot4->count++;
+		spin_unlock_bh(&nhslot4->lock);
+	}
+}
+
+static void udp4_unhash4(struct udp_table *udptable, struct sock *sk)
+{
+	struct udp_hslot *hslot2, *hslot4;
+
+	if (udp_hashed4(sk)) {
+		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+		hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
+
+		spin_lock(&hslot4->lock);
+		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
+		hslot4->count--;
+		spin_unlock(&hslot4->lock);
+
+		spin_lock(&hslot2->lock);
+		udp_hash4_dec(hslot2);
+		spin_unlock(&hslot2->lock);
+	}
+}
+
+/* call with sock lock */
+static void udp4_hash4(struct sock *sk)
+{
+	struct udp_hslot *hslot, *hslot2, *hslot4;
+	struct net *net = sock_net(sk);
+	struct udp_table *udptable;
+	unsigned int hash;
+
+	if (sk_unhashed(sk) || inet_sk(sk)->inet_rcv_saddr == htonl(INADDR_ANY))
+		return;
+
+	hash = udp_ehashfn(net, inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
+			   inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
+
+	udptable = net->ipv4.udp_table;
+	if (udp_hashed4(sk)) {
+		udp4_rehash4(udptable, sk, hash);
+		return;
+	}
+
+	hslot = udp_hashslot(udptable, net, udp_sk(sk)->udp_port_hash);
+	hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
+	hslot4 = udp_hashslot4(udptable, hash);
+	udp_sk(sk)->udp_lrpa_hash = hash;
+
+	spin_lock_bh(&hslot->lock);
+	if (rcu_access_pointer(sk->sk_reuseport_cb))
+		reuseport_detach_sock(sk);
+
+	spin_lock(&hslot4->lock);
+	hlist_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &hslot4->head);
+	hslot4->count++;
+	spin_unlock(&hslot4->lock);
+
+	spin_lock(&hslot2->lock);
+	udp_hash4_inc(hslot2);
+	spin_unlock(&hslot2->lock);
+
+	spin_unlock_bh(&hslot->lock);
+}
+#endif /* CONFIG_BASE_SMALL */
+
 /* UDP is nearly always wildcards out the wazoo, it makes no sense to try
  * harder than this. -DaveM
  */
@@ -493,6 +621,12 @@ struct sock *__udp4_lib_lookup(const struct net *net, __be32 saddr,
 	hash2 = ipv4_portaddr_hash(net, daddr, hnum);
 	hslot2 = udp_hashslot2(udptable, hash2);
 
+	if (udp_has_hash4(hslot2)) {
+		result = udp4_lib_lookup4(net, saddr, sport, daddr, hnum, dif, sdif, udptable);
+		if (result) /* udp4_lib_lookup4 return sk or NULL */
+			return result;
+	}
+
 	/* Lookup connected or non-wildcard socket */
 	result = udp4_lib_lookup2(net, saddr, sport,
 				  daddr, hnum, dif, sdif,
@@ -1931,6 +2065,19 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 }
 EXPORT_SYMBOL(udp_pre_connect);
 
+int udp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+{
+	int res;
+
+	lock_sock(sk);
+	res = __ip4_datagram_connect(sk, uaddr, addr_len);
+	if (!res)
+		udp4_hash4(sk);
+	release_sock(sk);
+	return res;
+}
+EXPORT_SYMBOL(udp_connect);
+
 int __udp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
@@ -1990,6 +2137,8 @@ void udp_lib_unhash(struct sock *sk)
 			hlist_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
 			hslot2->count--;
 			spin_unlock(&hslot2->lock);
+
+			udp4_unhash4(udptable, sk);
 		}
 		spin_unlock_bh(&hslot->lock);
 	}
@@ -1999,7 +2148,7 @@ EXPORT_SYMBOL(udp_lib_unhash);
 /*
  * inet_rcv_saddr was changed, we must rehash secondary hash
  */
-void udp_lib_rehash(struct sock *sk, u16 newhash)
+void udp_lib_rehash(struct sock *sk, u16 newhash, u16 newhash4)
 {
 	if (sk_hashed(sk)) {
 		struct udp_table *udptable = udp_get_table_prot(sk);
@@ -2031,6 +2180,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
 				spin_unlock(&nhslot2->lock);
 			}
 
+			if (udp_hashed4(sk)) {
+				udp4_rehash4(udptable, sk, newhash4);
+
+				if (hslot2 != nhslot2) {
+					spin_lock(&hslot2->lock);
+					udp_hash4_dec(hslot2);
+					spin_unlock(&hslot2->lock);
+
+					spin_lock(&nhslot2->lock);
+					udp_hash4_inc(nhslot2);
+					spin_unlock(&nhslot2->lock);
+				}
+			}
 			spin_unlock_bh(&hslot->lock);
 		}
 	}
@@ -2042,7 +2204,10 @@ void udp_v4_rehash(struct sock *sk)
 	u16 new_hash = ipv4_portaddr_hash(sock_net(sk),
 					  inet_sk(sk)->inet_rcv_saddr,
 					  inet_sk(sk)->inet_num);
-	udp_lib_rehash(sk, new_hash);
+	u16 new_hash4 = udp_ehashfn(sock_net(sk),
+				    inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
+				    inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
+	udp_lib_rehash(sk, new_hash, new_hash4);
 }
 
 static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
@@ -2935,7 +3100,7 @@ struct proto udp_prot = {
 	.owner			= THIS_MODULE,
 	.close			= udp_lib_close,
 	.pre_connect		= udp_pre_connect,
-	.connect		= ip4_datagram_connect,
+	.connect		= udp_connect,
 	.disconnect		= udp_disconnect,
 	.ioctl			= udp_ioctl,
 	.init			= udp_init_sock,
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index bbf3352213c4..4d3dfcb48a39 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -111,7 +111,7 @@ void udp_v6_rehash(struct sock *sk)
 					  &sk->sk_v6_rcv_saddr,
 					  inet_sk(sk)->inet_num);
 
-	udp_lib_rehash(sk, new_hash);
+	udp_lib_rehash(sk, new_hash, 0); /* 4-tuple hash not implemented */
 }
 
 static int compute_score(struct sock *sk, const struct net *net,
-- 
2.32.0.3.g01195cf9f

Re: [PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Paolo Abeni 1 year, 3 months ago

On 10/18/24 13:45, Philo Lu wrote:
[...]
> +/* In hash4, rehash can also happen in connect(), where hash4_cnt keeps unchanged. */
> +static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
> +{
> +	struct udp_hslot *hslot4, *nhslot4;
> +
> +	hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
> +	nhslot4 = udp_hashslot4(udptable, newhash4);
> +	udp_sk(sk)->udp_lrpa_hash = newhash4;
> +
> +	if (hslot4 != nhslot4) {
> +		spin_lock_bh(&hslot4->lock);
> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
> +		hslot4->count--;
> +		spin_unlock_bh(&hslot4->lock);
> +
> +		synchronize_rcu();

This deserves a comment explaining why it's needed. I had to dig into past
revisions to understand it.

> +
> +		spin_lock_bh(&nhslot4->lock);
> +		hlist_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->head);
> +		nhslot4->count++;
> +		spin_unlock_bh(&nhslot4->lock);
> +	}
> +}
> +
> +static void udp4_unhash4(struct udp_table *udptable, struct sock *sk)
> +{
> +	struct udp_hslot *hslot2, *hslot4;
> +
> +	if (udp_hashed4(sk)) {
> +		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
> +		hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
> +
> +		spin_lock(&hslot4->lock);
> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
> +		hslot4->count--;
> +		spin_unlock(&hslot4->lock);
> +
> +		spin_lock(&hslot2->lock);
> +		udp_hash4_dec(hslot2);
> +		spin_unlock(&hslot2->lock);
> +	}
> +}
> +
> +/* call with sock lock */
> +static void udp4_hash4(struct sock *sk)
> +{
> +	struct udp_hslot *hslot, *hslot2, *hslot4;
> +	struct net *net = sock_net(sk);
> +	struct udp_table *udptable;
> +	unsigned int hash;
> +
> +	if (sk_unhashed(sk) || inet_sk(sk)->inet_rcv_saddr == htonl(INADDR_ANY))
> +		return;
> +
> +	hash = udp_ehashfn(net, inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
> +			   inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
> +
> +	udptable = net->ipv4.udp_table;
> +	if (udp_hashed4(sk)) {
> +		udp4_rehash4(udptable, sk, hash);

It's unclear to me how we can enter this branch. Also it's unclear why
here you don't need to call udp_hash4_inc()/udp_hash4_dec(), too. Why
can't such accounting be placed in udp4_rehash4()?

[...]
> @@ -2031,6 +2180,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
>  				spin_unlock(&nhslot2->lock);
>  			}
>  
> +			if (udp_hashed4(sk)) {
> +				udp4_rehash4(udptable, sk, newhash4);
> +
> +				if (hslot2 != nhslot2) {
> +					spin_lock(&hslot2->lock);
> +					udp_hash4_dec(hslot2);
> +					spin_unlock(&hslot2->lock);
> +
> +					spin_lock(&nhslot2->lock);
> +					udp_hash4_inc(nhslot2);
> +					spin_unlock(&nhslot2->lock);
> +				}
> +			}
>  			spin_unlock_bh(&hslot->lock);

The udp4_rehash4() call above is in atomic context and could end up
calling synchronize_rcu(), which is a blocking function. You must avoid that.

Cheers,

Paolo

Re: [PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Philo Lu 1 year, 3 months ago


On 2024/10/24 23:01, Paolo Abeni wrote:
> On 10/18/24 13:45, Philo Lu wrote:
> [...]
>> +/* In hash4, rehash can also happen in connect(), where hash4_cnt keeps unchanged. */
>> +static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
>> +{
>> +	struct udp_hslot *hslot4, *nhslot4;
>> +
>> +	hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
>> +	nhslot4 = udp_hashslot4(udptable, newhash4);
>> +	udp_sk(sk)->udp_lrpa_hash = newhash4;
>> +
>> +	if (hslot4 != nhslot4) {
>> +		spin_lock_bh(&hslot4->lock);
>> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
>> +		hslot4->count--;
>> +		spin_unlock_bh(&hslot4->lock);
>> +
>> +		synchronize_rcu();
> 
> This deserve a comment explaining why it's needed. I had to dig in past
> revision to understand it.
> 

Got it. And a short explanation here (see [1] for detail):

Here, we move a node from a hlist to another new one, i.e., update 
node->next from the old hlist to the new hlist. For readers traversing 
the old hlist, if we update node->next just when readers move onto the 
moved node, then the readers also move to the new hlist. This is unexpected.

     Reader(lookup)     Writer(rehash)
     -----------------  ---------------
1. rcu_read_lock()
2. pos = sk;
3.                     hlist_del_init_rcu(sk, old_slot)
4.                     hlist_add_head_rcu(sk, new_slot)
5. pos = pos->next; <=
6. rcu_read_unlock()

[1]
https://lore.kernel.org/all/0fb425e0-5482-4cdf-9dc1-3906751f8f81@linux.alibaba.com/

>> +
>> +		spin_lock_bh(&nhslot4->lock);
>> +		hlist_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->head);
>> +		nhslot4->count++;
>> +		spin_unlock_bh(&nhslot4->lock);
>> +	}
>> +}
>> +
>> +static void udp4_unhash4(struct udp_table *udptable, struct sock *sk)
>> +{
>> +	struct udp_hslot *hslot2, *hslot4;
>> +
>> +	if (udp_hashed4(sk)) {
>> +		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
>> +		hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
>> +
>> +		spin_lock(&hslot4->lock);
>> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
>> +		hslot4->count--;
>> +		spin_unlock(&hslot4->lock);
>> +
>> +		spin_lock(&hslot2->lock);
>> +		udp_hash4_dec(hslot2);
>> +		spin_unlock(&hslot2->lock);
>> +	}
>> +}
>> +
>> +/* call with sock lock */
>> +static void udp4_hash4(struct sock *sk)
>> +{
>> +	struct udp_hslot *hslot, *hslot2, *hslot4;
>> +	struct net *net = sock_net(sk);
>> +	struct udp_table *udptable;
>> +	unsigned int hash;
>> +
>> +	if (sk_unhashed(sk) || inet_sk(sk)->inet_rcv_saddr == htonl(INADDR_ANY))
>> +		return;
>> +
>> +	hash = udp_ehashfn(net, inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
>> +			   inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
>> +
>> +	udptable = net->ipv4.udp_table;
>> +	if (udp_hashed4(sk)) {
>> +		udp4_rehash4(udptable, sk, hash);
> 
> It's unclear to me how we can enter this branch. Also it's unclear why
> here you don't need to call udp_hash4_inc()udp_hash4_dec, too. Why such
> accounting can't be placed in udp4_rehash4()?
> 

It's possible that a connected udp socket _re-connects_ to another remote
address. Then, because the local address is not changed, hash2 and its
hash4_cnt remain unchanged. But rehash4 needs to be done.

I'll also add a comment here.

> [...]
>> @@ -2031,6 +2180,19 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
>>   				spin_unlock(&nhslot2->lock);
>>   			}
>>   
>> +			if (udp_hashed4(sk)) {
>> +				udp4_rehash4(udptable, sk, newhash4);
>> +
>> +				if (hslot2 != nhslot2) {
>> +					spin_lock(&hslot2->lock);
>> +					udp_hash4_dec(hslot2);
>> +					spin_unlock(&hslot2->lock);
>> +
>> +					spin_lock(&nhslot2->lock);
>> +					udp_hash4_inc(nhslot2);
>> +					spin_unlock(&nhslot2->lock);
>> +				}
>> +			}
>>   			spin_unlock_bh(&hslot->lock);
> 
> The udp4_rehash4() call above is in atomic context and could end-up
> calling synchronize_rcu() which is a blocking function. You must avoid that.
> 

I see, synchronize_rcu() cannot be used while holding a spinlock. However,
I don't have a good idea of how to solve it right now. Do you have any
thoughts or suggestions?

> Cheers,
> 
> Paolo

Thanks for your review, Paolo. I'll address all your concerns in the
next version.

-- 
Philo

Re: [PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Paolo Abeni 1 year, 3 months ago

On 10/25/24 05:50, Philo Lu wrote:
> On 2024/10/24 23:01, Paolo Abeni wrote:
>> On 10/18/24 13:45, Philo Lu wrote:
>> [...]
>>> +/* In hash4, rehash can also happen in connect(), where hash4_cnt keeps unchanged. */
>>> +static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
>>> +{
>>> +	struct udp_hslot *hslot4, *nhslot4;
>>> +
>>> +	hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
>>> +	nhslot4 = udp_hashslot4(udptable, newhash4);
>>> +	udp_sk(sk)->udp_lrpa_hash = newhash4;
>>> +
>>> +	if (hslot4 != nhslot4) {
>>> +		spin_lock_bh(&hslot4->lock);
>>> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
>>> +		hslot4->count--;
>>> +		spin_unlock_bh(&hslot4->lock);
>>> +
>>> +		synchronize_rcu();
>>
>> This deserve a comment explaining why it's needed. I had to dig in past
>> revision to understand it.
>>
> 
> Got it. And a short explanation here (see [1] for detail):
> 
> Here, we move a node from a hlist to another new one, i.e., update 
> node->next from the old hlist to the new hlist. For readers traversing 
> the old hlist, if we update node->next just when readers move onto the 
> moved node, then the readers also move to the new hlist. This is unexpected.
> 
>      Reader(lookup)     Writer(rehash)
>      -----------------  ---------------
> 1. rcu_read_lock()
> 2. pos = sk;
> 3.                     hlist_del_init_rcu(sk, old_slot)
> 4.                     hlist_add_head_rcu(sk, new_slot)
> 5. pos = pos->next; <=
> 6. rcu_read_unlock()
> 
> [1]
> https://lore.kernel.org/all/0fb425e0-5482-4cdf-9dc1-3906751f8f81@linux.alibaba.com/

Thanks. AFAICS the problem that such a thing could cause is a lookup
failure for a socket positioned later in the same chain when a previous
entry is moved to a different slot during a concurrent lookup.

I think that could be solved the same way TCP handles such a scenario:
using an hlist_nulls RCU list for the hash4 bucket, checking that a failed
lookup ends in the same bucket where it started and, if not, restarting
the lookup from the original bucket.

Have a look at __inet_lookup_established() for a more descriptive
reference, especially:

https://elixir.bootlin.com/linux/v6.12-rc4/source/net/ipv4/inet_hashtables.c#L528

>>> +
>>> +		spin_lock_bh(&nhslot4->lock);
>>> +		hlist_add_head_rcu(&udp_sk(sk)->udp_lrpa_node, &nhslot4->head);
>>> +		nhslot4->count++;
>>> +		spin_unlock_bh(&nhslot4->lock);
>>> +	}
>>> +}
>>> +
>>> +static void udp4_unhash4(struct udp_table *udptable, struct sock *sk)
>>> +{
>>> +	struct udp_hslot *hslot2, *hslot4;
>>> +
>>> +	if (udp_hashed4(sk)) {
>>> +		hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
>>> +		hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
>>> +
>>> +		spin_lock(&hslot4->lock);
>>> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
>>> +		hslot4->count--;
>>> +		spin_unlock(&hslot4->lock);
>>> +
>>> +		spin_lock(&hslot2->lock);
>>> +		udp_hash4_dec(hslot2);
>>> +		spin_unlock(&hslot2->lock);
>>> +	}
>>> +}
>>> +
>>> +/* call with sock lock */
>>> +static void udp4_hash4(struct sock *sk)
>>> +{
>>> +	struct udp_hslot *hslot, *hslot2, *hslot4;
>>> +	struct net *net = sock_net(sk);
>>> +	struct udp_table *udptable;
>>> +	unsigned int hash;
>>> +
>>> +	if (sk_unhashed(sk) || inet_sk(sk)->inet_rcv_saddr == htonl(INADDR_ANY))
>>> +		return;
>>> +
>>> +	hash = udp_ehashfn(net, inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
>>> +			   inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
>>> +
>>> +	udptable = net->ipv4.udp_table;
>>> +	if (udp_hashed4(sk)) {
>>> +		udp4_rehash4(udptable, sk, hash);
>>
>> It's unclear to me how we can enter this branch. Also it's unclear why
>> here you don't need to call udp_hash4_inc()udp_hash4_dec, too. Why such
>> accounting can't be placed in udp4_rehash4()?
>>
> 
> It's possible that a connected udp socket _re-connect_ to another remote 
> address. Then, because the local address is not changed, hash2 and its 
> hash4_cnt keep unchanged. But rehash4 need to be done.
> I'll also add a comment here.

Right, UDP socket could actually connect() successfully twice in a row
without a disconnect in between...

I almost missed the point that the ipv6 implementation is planned to
land afterwards.

I'm sorry, but I think that would be problematic - i.e. if ipv4 support
will land in 6.13, but ipv6 will not make it - due to time constraints -
we will have (at least) a release with inconsistent behavior between ipv4
and ipv6. I think it will be better to bundle such changes together.

Thanks,

Paolo

Re: [PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Philo Lu 1 year, 3 months ago


On 2024/10/25 17:02, Paolo Abeni wrote:
> On 10/25/24 05:50, Philo Lu wrote:
>> On 2024/10/24 23:01, Paolo Abeni wrote:
>>> On 10/18/24 13:45, Philo Lu wrote:
>>> [...]
>>>> +/* In hash4, rehash can also happen in connect(), where hash4_cnt keeps unchanged. */
>>>> +static void udp4_rehash4(struct udp_table *udptable, struct sock *sk, u16 newhash4)
>>>> +{
>>>> +	struct udp_hslot *hslot4, *nhslot4;
>>>> +
>>>> +	hslot4 = udp_hashslot4(udptable, udp_sk(sk)->udp_lrpa_hash);
>>>> +	nhslot4 = udp_hashslot4(udptable, newhash4);
>>>> +	udp_sk(sk)->udp_lrpa_hash = newhash4;
>>>> +
>>>> +	if (hslot4 != nhslot4) {
>>>> +		spin_lock_bh(&hslot4->lock);
>>>> +		hlist_del_init_rcu(&udp_sk(sk)->udp_lrpa_node);
>>>> +		hslot4->count--;
>>>> +		spin_unlock_bh(&hslot4->lock);
>>>> +
>>>> +		synchronize_rcu();
>>>
>>> This deserve a comment explaining why it's needed. I had to dig in past
>>> revision to understand it.
>>>
>>
>> Got it. And a short explanation here (see [1] for detail):
>>
>> Here, we move a node from a hlist to another new one, i.e., update
>> node->next from the old hlist to the new hlist. For readers traversing
>> the old hlist, if we update node->next just when readers move onto the
>> moved node, then the readers also move to the new hlist. This is unexpected.
>>
>>       Reader(lookup)     Writer(rehash)
>>       -----------------  ---------------
>> 1. rcu_read_lock()
>> 2. pos = sk;
>> 3.                     hlist_del_init_rcu(sk, old_slot)
>> 4.                     hlist_add_head_rcu(sk, new_slot)
>> 5. pos = pos->next; <=
>> 6. rcu_read_unlock()
>>
>> [1]
>> https://lore.kernel.org/all/0fb425e0-5482-4cdf-9dc1-3906751f8f81@linux.alibaba.com/
> 
> Thanks. AFAICS the problem that such thing could cause is a lookup
> failure for a socket positioned later in the same chain when a previous
> entry is moved on a different slot during a concurrent lookup.
> 

Yes, you're right.

> I think that could be solved the same way TCP is handling such scenario:
> using hlist_null RCU list for the hash4 bucket, checking that a failed
> lookup ends in the same bucket where it started and eventually
> reiterating from the original bucket.
> 
> Have a look at __inet_lookup_established() for a more descriptive
> reference, especially:
> 
> https://elixir.bootlin.com/linux/v6.12-rc4/source/net/ipv4/inet_hashtables.c#L528
> 

Thank you! I'll try it in the next version.

>>>> +
...
>>>> +
>>>> +/* call with sock lock */
>>>> +static void udp4_hash4(struct sock *sk)
>>>> +{
>>>> +	struct udp_hslot *hslot, *hslot2, *hslot4;
>>>> +	struct net *net = sock_net(sk);
>>>> +	struct udp_table *udptable;
>>>> +	unsigned int hash;
>>>> +
>>>> +	if (sk_unhashed(sk) || inet_sk(sk)->inet_rcv_saddr == htonl(INADDR_ANY))
>>>> +		return;
>>>> +
>>>> +	hash = udp_ehashfn(net, inet_sk(sk)->inet_rcv_saddr, inet_sk(sk)->inet_num,
>>>> +			   inet_sk(sk)->inet_daddr, inet_sk(sk)->inet_dport);
>>>> +
>>>> +	udptable = net->ipv4.udp_table;
>>>> +	if (udp_hashed4(sk)) {
>>>> +		udp4_rehash4(udptable, sk, hash);
>>>
>>> It's unclear to me how we can enter this branch. Also it's unclear why
>>> here you don't need to call udp_hash4_inc()udp_hash4_dec, too. Why such
>>> accounting can't be placed in udp4_rehash4()?
>>>
>>
>> It's possible that a connected udp socket _re-connect_ to another remote
>> address. Then, because the local address is not changed, hash2 and its
>> hash4_cnt keep unchanged. But rehash4 need to be done.
>> I'll also add a comment here.
> 
> Right, UDP socket could actually connect() successfully twice in a row
> without a disconnect in between...
> 
> I almost missed the point that the ipv6 implementation is planned to
> land afterwards.
> 
> I'm sorry, but I think that would be problematic - i.e. if ipv4 support
> will land in 6.13, but ipv6 will not make it - due to time constraints -
> we will have (at least a release with inconsistent behavior between ipv4
> and ipv6. I think it will be better bundle such changes together.
> 

No problem. I can add ipv6 support in the next version too.

Thanks.
-- 
Philo

Re: [PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket

Posted by Paolo Abeni 1 year, 3 months ago

On 10/24/24 17:01, Paolo Abeni wrote:
> The udp4_rehash4() call above is in atomic context and could end-up
> calling synchronize_rcu() which is a blocking function. You must avoid that.

I almost forgot: please include in this commit message or in the cover
letter, the performance figures for unconnected sockets before and after
this series and for a stress test with a lot of connected sockets,
before and after this series.

Thanks,

Paolo

[PATCH v5 net-next 1/3] net/udp: Add a new struct for hash2 slot
[PATCH v5 net-next 2/3] net/udp: Add 4-tuple hash list basis
[PATCH v5 net-next 3/3] ipv4/udp: Add 4-tuple hash for connected socket