[PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis

Posted by Philo Lu 1 month, 2 weeks ago
Add a new hash list, hash4, to the udp table. It will be used to
implement a 4-tuple hash for connected UDP sockets. This patch adds the
hlist to the table and implements the helpers and initialization; the
4-tuple hash itself is implemented in the following patch.
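
For illustration only (not part of this patch), a connected-socket
lookup over hash4 could look roughly like the sketch below. Only
udp_hashslot4(), udp_hashed4() and udp_lrpa_for_each_entry_rcu() come
from this patch; the existing udp_ehashfn() is reused here purely as a
stand-in for the 4-tuple hash, and netns/bound-device checks are
omitted:

```
/*
 * Sketch only: expected use of the hash4 infrastructure for a
 * connected-socket lookup. The real lookup lands in the next patch.
 * Caller holds rcu_read_lock().
 */
static struct sock *udp4_lookup_hash4_sketch(struct net *net,
					     struct udp_table *udptable,
					     __be32 saddr, __be16 sport,
					     __be32 daddr, unsigned int hnum)
{
	unsigned int hash4 = udp_ehashfn(net, daddr, hnum, saddr, sport);
	struct udp_hslot *hslot4 = udp_hashslot4(udptable, hash4);
	struct udp_sock *up;
	struct sock *sk;

	udp_lrpa_for_each_entry_rcu(up, &hslot4->head) {
		sk = (struct sock *)up;	/* struct sock is the first member */
		if (sk->sk_rcv_saddr == daddr && sk->sk_daddr == saddr &&
		    sk->sk_num == hnum && sk->sk_dport == sport)
			return sk;
	}
	return NULL;
}
```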

Signed-off-by: Philo Lu <lulie@linux.alibaba.com>
Signed-off-by: Cambda Zhu <cambda@linux.alibaba.com>
Signed-off-by: Fred Chen <fred.cc@alibaba-inc.com>
Signed-off-by: Yubing Qiu <yubing.qiuyubing@alibaba-inc.com>
---
 include/linux/udp.h |  7 +++++++
 include/net/udp.h   | 16 +++++++++++++++-
 net/ipv4/udp.c      | 15 +++++++++++++--
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/include/linux/udp.h b/include/linux/udp.h
index 3eb3f2b9a2a0..c04808360a05 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -56,6 +56,10 @@ struct udp_sock {
 	int		 pending;	/* Any pending frames ? */
 	__u8		 encap_type;	/* Is this an Encapsulation socket? */
 
+	/* For UDP 4-tuple hash */
+	__u16 udp_lrpa_hash;
+	struct hlist_node udp_lrpa_node;
+
 	/*
 	 * Following member retains the information to create a UDP header
 	 * when the socket is uncorked.
@@ -206,6 +210,9 @@ static inline void udp_allow_gso(struct sock *sk)
 #define udp_portaddr_for_each_entry_rcu(__sk, list) \
 	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
+#define udp_lrpa_for_each_entry_rcu(__up, list) \
+	hlist_for_each_entry_rcu(__up, list, udp_lrpa_node)
+
 #define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE)
 
 #endif	/* _LINUX_UDP_H */
diff --git a/include/net/udp.h b/include/net/udp.h
index 595364729138..80f9622d0db3 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -50,7 +50,7 @@ struct udp_skb_cb {
 #define UDP_SKB_CB(__skb)	((struct udp_skb_cb *)((__skb)->cb))
 
 /**
- *	struct udp_hslot - UDP hash slot used by udp_table.hash
+ *	struct udp_hslot - UDP hash slot used by udp_table.hash/hash4
  *
  *	@head:	head of list of sockets
  *	@count:	number of sockets in 'head' list
@@ -79,12 +79,15 @@ struct udp_hslot_main {
  *
  *	@hash:	hash table, sockets are hashed on (local port)
  *	@hash2:	hash table, sockets are hashed on (local port, local address)
+ *	@hash4:	hash table, connected sockets are hashed on
+ *		(local port, local address, remote port, remote address)
  *	@mask:	number of slots in hash tables, minus 1
  *	@log:	log2(number of slots in hash table)
  */
 struct udp_table {
 	struct udp_hslot	*hash;
 	struct udp_hslot_main	*hash2;
+	struct udp_hslot	*hash4;
 	unsigned int		mask;
 	unsigned int		log;
 };
@@ -113,6 +116,17 @@ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table,
 	return &table->hash2[hash & table->mask].hslot;
 }
 
+static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
+					      unsigned int hash)
+{
+	return &table->hash4[hash & table->mask];
+}
+
+static inline bool udp_hashed4(const struct sock *sk)
+{
+	return !hlist_unhashed(&udp_sk(sk)->udp_lrpa_node);
+}
+
 extern struct proto udp_prot;
 
 extern atomic_long_t udp_memory_allocated;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3a31e7d6d0dd..1498ccb79c58 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -3425,7 +3425,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 {
 	unsigned int i, slot_size;
 
-	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
+	slot_size = 2 * sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
 	table->hash = alloc_large_system_hash(name,
 					      slot_size,
 					      uhash_entries,
@@ -3437,6 +3437,7 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 					      UDP_HTABLE_SIZE_MAX);
 
 	table->hash2 = (void *)(table->hash + (table->mask + 1));
+	table->hash4 = (void *)(table->hash2 + (table->mask + 1));
 	for (i = 0; i <= table->mask; i++) {
 		INIT_HLIST_HEAD(&table->hash[i].head);
 		table->hash[i].count = 0;
@@ -3448,6 +3449,11 @@ void __init udp_table_init(struct udp_table *table, const char *name)
 		spin_lock_init(&table->hash2[i].hslot.lock);
 		table->hash2[i].hash4_cnt = 0;
 	}
+	for (i = 0; i <= table->mask; i++) {
+		INIT_HLIST_HEAD(&table->hash4[i].head);
+		table->hash4[i].count = 0;
+		spin_lock_init(&table->hash4[i].lock);
+	}
 }
 
 u32 udp_flow_hashrnd(void)
@@ -3480,13 +3486,14 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
 	if (!udptable)
 		goto out;
 
-	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
+	slot_size = 2 * sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
 	udptable->hash = vmalloc_huge(hash_entries * slot_size,
 				      GFP_KERNEL_ACCOUNT);
 	if (!udptable->hash)
 		goto free_table;
 
 	udptable->hash2 = (void *)(udptable->hash + hash_entries);
+	udptable->hash4 = (void *)(udptable->hash2 + hash_entries);
 	udptable->mask = hash_entries - 1;
 	udptable->log = ilog2(hash_entries);
 
@@ -3499,6 +3506,10 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
 		udptable->hash2[i].hslot.count = 0;
 		spin_lock_init(&udptable->hash2[i].hslot.lock);
 		udptable->hash2[i].hash4_cnt = 0;
+
+		INIT_HLIST_HEAD(&udptable->hash4[i].head);
+		udptable->hash4[i].count = 0;
+		spin_lock_init(&udptable->hash4[i].lock);
 	}
 
 	return udptable;
-- 
2.32.0.3.g01195cf9f
Re: [PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis
Posted by Paolo Abeni 1 month, 1 week ago
Hi,

On 10/12/24 03:29, Philo Lu wrote:
> @@ -3480,13 +3486,14 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
>   	if (!udptable)
>   		goto out;
>   
> -	slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
> +	slot_size = 2 * sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
>   	udptable->hash = vmalloc_huge(hash_entries * slot_size,
>   				      GFP_KERNEL_ACCOUNT);

I'm sorry for the late feedback.

I think it would be better to make the hash4 infra a no-op (no lookup,
no additional memory used) for CONFIG_BASE_SMALL=y builds.

It would be great if you could please share some benchmarks showing the
raw max receive PPS performance for unconnected sockets, with and
without this series applied, to ensure this does not cause any real
regression for such workloads.

Thanks,

Paolo
Re: [PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis
Posted by Philo Lu 1 month, 1 week ago

On 2024/10/14 18:07, Paolo Abeni wrote:
> Hi,
> 
> On 10/12/24 03:29, Philo Lu wrote:
>> @@ -3480,13 +3486,14 @@ static struct udp_table __net_init *udp_pernet_table_alloc(unsigned int hash_ent
>>       if (!udptable)
>>           goto out;
>> -    slot_size = sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
>> +    slot_size = 2 * sizeof(struct udp_hslot) + sizeof(struct udp_hslot_main);
>>       udptable->hash = vmalloc_huge(hash_entries * slot_size,
>>                         GFP_KERNEL_ACCOUNT);
> 
> I'm sorry for the late feedback.
> 
> I think it would be better to make the hash4 infra a no op (no lookup, 
> no additional memory used) for CONFIG_BASE_SMALL=y builds.
> 

Got it. There are 2 affected structs, udp_hslot_main and udp_sock. They
(as well as related helpers like udp4_hash4) can be guarded by
CONFIG_BASE_SMALL, so that CONFIG_BASE_SMALL=y builds avoid the
additional overhead of hash4:

```
+struct udp_hslot_main {
+	struct udp_hslot	hslot; /* must be the first member */
+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+	u32			hash4_cnt;
+#endif
+} __aligned(2 * sizeof(long));


@@ -56,6 +56,12 @@ struct udp_sock {
  	int		 pending;	/* Any pending frames ? */
  	__u8		 encap_type;	/* Is this an Encapsulation socket? */

+#if !IS_ENABLED(CONFIG_BASE_SMALL)
+	/* For UDP 4-tuple hash */
+	__u16 udp_lrpa_hash;
+	struct hlist_node udp_lrpa_node;
+#endif
+
```
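
The helpers could be stubbed out along the same lines; a rough sketch
only, with the exact form to be settled in the next revision:

```
/* Sketch: possible CONFIG_BASE_SMALL=y stubs for the hash4 helpers. */
#if IS_ENABLED(CONFIG_BASE_SMALL)
static inline bool udp_hashed4(const struct sock *sk)
{
	return false;			/* never take the 4-tuple path */
}

static inline struct udp_hslot *udp_hashslot4(struct udp_table *table,
					      unsigned int hash)
{
	BUILD_BUG();			/* all callers must be compiled out */
	return NULL;
}
#else
/* ... definitions as added in this patch ... */
#endif
```

The extra sizeof(struct udp_hslot) in slot_size (udp_table_init() and
udp_pernet_table_alloc()) would be dropped for BASE_SMALL builds as
well, so no additional memory is allocated for hash4.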

> It would be great if you could please share some benchmark showing the 
> raw max receive PPS performances for unconnected sockets, with and 
> without this series applied, to ensure this does not cause any real 
> regression for such workloads.
> 

Tested using sockperf tp with default msgsize (14B), 3 times for w/ and 
w/o the patch set, and results show no obvious difference:

[msg/sec]  test1    test2    test3    mean
w/o patch  514,664  519,040  527,115  520.3k
w/  patch  516,863  526,337  527,195  523.5k (+0.6%)

Thank you for review, Paolo.
-- 
Philo

Re: [PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis
Posted by Paolo Abeni 1 month, 1 week ago
On 10/16/24 08:30, Philo Lu wrote:
> On 2024/10/14 18:07, Paolo Abeni wrote:
>> It would be great if you could please share some benchmark showing the
>> raw max receive PPS performances for unconnected sockets, with and
>> without this series applied, to ensure this does not cause any real
>> regression for such workloads.
>>
> 
> Tested using sockperf tp with default msgsize (14B), 3 times for w/ and
> w/o the patch set, and results show no obvious difference:
> 
> [msg/sec]  test1    test2    test3    mean
> w/o patch  514,664  519,040  527,115  520.3k
> w/  patch  516,863  526,337  527,195  523.5k (+0.6%)
> 
> Thank you for review, Paolo.

Are the values in packets per second, or bytes per second? Are you doing
a loopback test or over the wire? The most important question is: is the
receiver side keeping (at least) 1 CPU fully busy? Otherwise the test is
not very relevant.

It looks like you have some setup issue, or you are using a relatively 
low end H/W: the expected packet rate for reasonable server H/W is well 
above 1M (possibly much more than that, but I can't put my hands on 
recent H/W, so I can't provide a more accurate figure).

A single socket, user-space, UDP sender is usually unable to reach such 
tput without USO, and even with USO you likely need to do an 
over-the-wire test to really be able to keep the receiver fully busy. 
AFAICS sockperf does not support USO for the sender.

You could use the udpgso_bench_tx/udpgso_bench_rx pair from the net 
selftests directory instead.

Or you could use pktgen as traffic generator.

Thanks,

Paolo
Re: [PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis
Posted by Philo Lu 1 month, 1 week ago

On 2024/10/16 15:45, Paolo Abeni wrote:
> On 10/16/24 08:30, Philo Lu wrote:
>> On 2024/10/14 18:07, Paolo Abeni wrote:
>>> It would be great if you could please share some benchmark showing the
>>> raw max receive PPS performances for unconnected sockets, with and
>>> without this series applied, to ensure this does not cause any real
>>> regression for such workloads.
>>>
>>
>> Tested using sockperf tp with default msgsize (14B), 3 times for w/ and
>> w/o the patch set, and results show no obvious difference:
>>
>> [msg/sec]  test1    test2    test3    mean
>> w/o patch  514,664  519,040  527,115  520.3k
>> w/  patch  516,863  526,337  527,195  523.5k (+0.6%)
>>
>> Thank you for review, Paolo.
> 
> Are the values in packets per second, or bytes per second? Are you doing
> a loopback test or over the wire? The most important question is: is the
> receiver side keeping (at least) 1 CPU fully busy? Otherwise the test is
> not very relevant.
> 
> It looks like you have some setup issue, or you are using a relatively 
> low end H/W: the expected packet rate for reasonable server H/W is well 
> above 1M (possibly much more than that, but I can't put my hands on 
> recent H/W, so I can't provide a more accurate figure).
> 
> A single socket, user-space, UDP sender is usually unable to reach such 
> tput without USO, and even with USO you likely need to do an over-the- 
> wire test to really be able to keep the receiver fully busy. AFAICS 
> sockperf does not support USO for the sender.
> 
> You could use the udpgso_bench_tx/udpgso_bench_rx pair from the net 
> selftests directory instead.
> 
> Or you could use pktgen as traffic generator.
> 

I tested it again with udpgso_bench_tx/udpgso_bench_rx. On the server, 2
CPUs are involved, one for udpgso_bench_rx and the other for the NIC rx
queue, so that softirq (si) on the NIC rx CPU is at 100%. udpgso_bench_tx
runs with payload size 20, and the tx pps is higher than the rx pps,
ensuring rx is the bottleneck.

The outputs of udpgso_bench_rx:
[without patchset]
udp rx:     20 MB/s  1092546 calls/s
udp rx:     20 MB/s  1095051 calls/s
udp rx:     20 MB/s  1094136 calls/s
udp rx:     20 MB/s  1098860 calls/s
udp rx:     20 MB/s  1097963 calls/s
udp rx:     20 MB/s  1097460 calls/s
udp rx:     20 MB/s  1098370 calls/s
udp rx:     20 MB/s  1098089 calls/s
udp rx:     20 MB/s  1095330 calls/s
udp rx:     20 MB/s  1095486 calls/s

[with patchset]
udp rx:     21 MB/s  1105533 calls/s
udp rx:     21 MB/s  1105475 calls/s
udp rx:     21 MB/s  1104244 calls/s
udp rx:     21 MB/s  1105600 calls/s
udp rx:     21 MB/s  1108019 calls/s
udp rx:     21 MB/s  1101971 calls/s
udp rx:     21 MB/s  1104147 calls/s
udp rx:     21 MB/s  1104874 calls/s
udp rx:     21 MB/s  1101987 calls/s
udp rx:     21 MB/s  1105500 calls/s

The averages w/ and w/o the patchset are 1104735 and 1096329 calls/s, a
gap of 0.8%, which I think is negligible.

In addition, perf shows ~0.6% higher CPU consumption in __udp4_lib_lookup()
with this patchset (increasing from 5.7% to 6.3%).

Thanks.
-- 
Philo

Re: [PATCH v4 net-next 2/3] net/udp: Add 4-tuple hash list basis
Posted by Philo Lu 1 month, 1 week ago

On 2024/10/16 15:45, Paolo Abeni wrote:
> On 10/16/24 08:30, Philo Lu wrote:
>> On 2024/10/14 18:07, Paolo Abeni wrote:
>>> It would be great if you could please share some benchmark showing the
>>> raw max receive PPS performances for unconnected sockets, with and
>>> without this series applied, to ensure this does not cause any real
>>> regression for such workloads.
>>>
>>
>> Tested using sockperf tp with default msgsize (14B), 3 times for w/ and
>> w/o the patch set, and results show no obvious difference:
>>
>> [msg/sec]  test1    test2    test3    mean
>> w/o patch  514,664  519,040  527,115  520.3k
>> w/  patch  516,863  526,337  527,195  523.5k (+0.6%)
>>
>> Thank you for review, Paolo.
> 
> Are the values in packets per second, or bytes per second? Are you doing
> a loopback test or over the wire? The most important question is: is the
> receiver side keeping (at least) 1 CPU fully busy? Otherwise the test is
> not very relevant.
> 

It's in packets per second (msg/sec). I make the CPU fully busy by
binding the NIC IRQ to the same CPU as the server socket. The CPU
consumption looks like:

%Cpu0:
3.0 us,  35.0 sy,  0.0 ni,  0.0 id,  0.0 wa,  7.0 hi,  55.0 si,  0.0 st

> It looks like you have some setup issue, or you are using a relatively 
> low end H/W: the expected packet rate for reasonable server H/W is well 
> above 1M (possibly much more than that, but I can't put my hands on 
> recent H/W, so I can't provide a more accurate figure).
> 
> A single socket, user-space, UDP sender is usually unable to reach such 
> tput without USO, and even with USO you likely need to do an over-the- 
> wire test to really be able to keep the receiver fully busy. AFAICS 
> sockperf does not support USO for the sender.
> 
> You could use the udpgso_bench_tx/udpgso_bench_rx pair from the net 
> selftests directory instead.
> 

Thank you for your suggestion. I'll try it to see if I can get higher pps.
-- 
Philo