We create a lock-less linked list for the currently
idle, reusable smc_buf_desc entries.

When the 'used' field is set to 0, the descriptor is
added to the lock-less linked list.

When a new connection is established, a suitable
element is obtained directly from this list, which
eliminates the traversal and search, and requires
no lock.
A lock-less linked list is a linked list that uses
atomic operations to optimize the producer-consumer model.
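Concretely, the pattern is the one provided by <linux/llist.h>. A minimal sketch of how this patch uses the API (a sketch only, not the patch itself):

```
#include <linux/llist.h>

/* Producer: releasing a buffer pushes its descriptor onto the
 * per-size free list; llist_add() is a single atomic cmpxchg.
 */
llist_add(&buf_desc->llist, &lgr->rmbs_free[buf_desc->bufsiz_comp]);

/* Consumer: a new connection pops a free descriptor directly,
 * with no walk over the in-use descriptors.
 */
llnode = llist_del_first(&lgr->rmbs_free[bufsize_comp]);
buf_desc = llnode ? llist_entry(llnode, struct smc_buf_desc, llist) : NULL;
```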
I didn't find a suitable public benchmark, so I measured
the time spent in this function under multiple
connections with redis-benchmark (tested in smc loopback-ism mode):
1. On the current version:
[x.832733] smc_buf_get_slot cost:602 ns, walk 10 buf_descs
[x.832860] smc_buf_get_slot cost:329 ns, walk 12 buf_descs
[x.832999] smc_buf_get_slot cost:479 ns, walk 17 buf_descs
[x.833157] smc_buf_get_slot cost:679 ns, walk 13 buf_descs
...
[x.045240] smc_buf_get_slot cost:5528 ns, walk 196 buf_descs
[x.045389] smc_buf_get_slot cost:4721 ns, walk 197 buf_descs
[x.045537] smc_buf_get_slot cost:4075 ns, walk 198 buf_descs
[x.046010] smc_buf_get_slot cost:6476 ns, walk 199 buf_descs
2. Apply this patch:
[x.180857] smc_buf_get_slot_free cost:75 ns
[x.181001] smc_buf_get_slot_free cost:147 ns
[x.181128] smc_buf_get_slot_free cost:97 ns
[x.181282] smc_buf_get_slot_free cost:132 ns
[x.181451] smc_buf_get_slot_free cost:74 ns
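The 'cost' lines above come from ad-hoc instrumentation. Presumably it had roughly this shape; ktime_get_ns() as the timer and the 'walked' counter are assumptions, not the author's actual probe:

```
u64 t0 = ktime_get_ns();
int walked = 0;	/* hypothetical counter bumped inside the list walk */

buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
pr_info("smc_buf_get_slot cost:%llu ns, walk %d buf_descs\n",
	ktime_get_ns() - t0, walked);
```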
The data shows that walking ~200 buf_descs takes about 5~6us,
while the lock-less list algorithm is O(1).

Also, my test process is only single-threaded. If multiple threads
establish SMC connections in parallel, the lock will also become a
bottleneck; a lock-less list avoids that problem as well.

So I expect this patch to be beneficial in scenarios where a
large number of short-lived connections run in parallel.
Signed-off-by: liqiang <liqiang64@huawei.com>
---
net/smc/smc_core.c | 57 ++++++++++++++++++++++++++++++----------------
net/smc/smc_core.h | 4 ++++
2 files changed, 41 insertions(+), 20 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 500952c2e67b..85dbb366274a 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,7 @@
#include <linux/wait.h>
#include <linux/reboot.h>
#include <linux/mutex.h>
+#include <linux/llist.h>
#include <linux/list.h>
#include <linux/smc.h>
#include <net/tcp.h>
@@ -909,6 +910,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
+ init_llist_head(&lgr->rmbs_free[i]);
+ init_llist_head(&lgr->sndbufs_free[i]);
}
lgr->next_link_id = 0;
smc_lgr_list.num += SMC_LGR_NUM_INCR;
@@ -1183,6 +1186,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
/* memzero_explicit provides potential memory barrier semantics */
memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
WRITE_ONCE(buf_desc->used, 0);
+ if (is_rmb)
+ llist_add(&buf_desc->llist, &lgr->rmbs_free[buf_desc->bufsiz_comp]);
+ else
+ llist_add(&buf_desc->llist, &lgr->sndbufs_free[buf_desc->bufsiz_comp]);
}
}
@@ -1214,6 +1221,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
} else {
memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->sndbuf_desc->used, 0);
+ llist_add(&conn->sndbuf_desc->llist,
+ &lgr->sndbufs_free[conn->sndbuf_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize);
}
@@ -1225,6 +1234,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
bufsize += sizeof(struct smcd_cdc_msg);
memzero_explicit(conn->rmb_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->rmb_desc->used, 0);
+ llist_add(&conn->rmb_desc->llist,
+ &lgr->rmbs_free[conn->rmb_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize);
}
@@ -1413,13 +1424,20 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf_desc;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int i;
for (i = 0; i < SMC_RMBE_SIZES; i++) {
- if (is_rmb)
+ if (is_rmb) {
buf_list = &lgr->rmbs[i];
- else
+ buf_llist = &lgr->rmbs_free[i];
+ } else {
buf_list = &lgr->sndbufs[i];
+ buf_llist = &lgr->sndbufs_free[i];
+ }
+ // just invalid this list first, and then free the memory
+ // in the following loop
+ llist_del_all(buf_llist);
list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
list) {
smc_lgr_buf_list_del(lgr, is_rmb, buf_desc);
@@ -2087,24 +2105,19 @@ int smc_uncompress_bufsize(u8 compressed)
return (int)size;
}
-/* try to reuse a sndbuf or rmb description slot for a certain
- * buffer size; if not available, return NULL
- */
-static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
- struct rw_semaphore *lock,
- struct list_head *buf_list)
+/* use lock less list to save and find reuse buf desc */
+static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
{
- struct smc_buf_desc *buf_slot;
+ struct smc_buf_desc *buf_free;
+ struct llist_node *llnode;
- down_read(lock);
- list_for_each_entry(buf_slot, buf_list, list) {
- if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
- up_read(lock);
- return buf_slot;
- }
- }
- up_read(lock);
- return NULL;
+ if (llist_empty(buf_llist))
+ return NULL;
+ // lock-less link list don't need an lock
+ llnode = llist_del_first(buf_llist);
+ buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
+ WRITE_ONCE(buf_free->used, 1);
+ return buf_free;
}
/* one of the conditions for announcing a receiver's current window size is
@@ -2409,6 +2422,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int bufsize, bufsize_comp;
struct rw_semaphore *lock; /* lock buffer list */
bool is_dgraded = false;
@@ -2424,15 +2438,17 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
bufsize_comp >= 0; bufsize_comp--) {
if (is_rmb) {
lock = &lgr->rmbs_lock;
+ buf_llist = &lgr->rmbs_free[bufsize_comp];
buf_list = &lgr->rmbs[bufsize_comp];
} else {
lock = &lgr->sndbufs_lock;
+ buf_llist = &lgr->sndbufs_free[bufsize_comp];
buf_list = &lgr->sndbufs[bufsize_comp];
}
bufsize = smc_uncompress_bufsize(bufsize_comp);
/* check for reusable slot in the link group */
- buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
+ buf_desc = smc_buf_get_slot_free(buf_llist);
if (buf_desc) {
buf_desc->is_dma_need_sync = 0;
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
@@ -2457,7 +2473,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
- buf_desc->used = 1;
+ WRITE_ONCE(buf_desc->used, 1);
+ WRITE_ONCE(buf_desc->bufsiz_comp, bufsize_comp);
down_write(lock);
smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc);
up_write(lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 69b54ecd6503..076ee15f5c10 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -188,10 +188,12 @@ struct smc_link {
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
+ struct llist_node llist;
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
+ int bufsiz_comp;
union {
struct { /* SMC-R */
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
@@ -278,8 +280,10 @@ struct smc_link_group {
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ struct llist_head sndbufs_free[SMC_RMBE_SIZES]; /* tx buffer free list */
struct rw_semaphore sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ struct llist_head rmbs_free[SMC_RMBE_SIZES]; /* rx buffer free list */
struct rw_semaphore rmbs_lock; /* protects rx buffers */
u64 alloc_sndbufs; /* stats of tx buffers */
u64 alloc_rmbs; /* stats of rx buffers */
--
2.43.0
On 2024-11-01 16:23:42, liqiang wrote:
>We create a lock-less linked list for the currently
>idle, reusable smc_buf_desc entries.
>
>[...]
>
>I didn't find a suitable public benchmark, so I measured
>the time spent in this function under multiple
>connections with redis-benchmark (tested in smc loopback-ism mode):

I think you can run a wrk/nginx test with short-lived connections.
For example:

```
# client
wrk -H "Connection: close" http://$serverIp

# server
nginx
```

> 1. On the current version:
> [x.832733] smc_buf_get_slot cost:602 ns, walk 10 buf_descs
> ...
> [x.046010] smc_buf_get_slot cost:6476 ns, walk 199 buf_descs
>
> 2. Apply this patch:
> [x.180857] smc_buf_get_slot_free cost:75 ns
> ...
> [x.181451] smc_buf_get_slot_free cost:74 ns
>
>[...]
>
>So I expect this patch to be beneficial in scenarios where a
>large number of short-lived connections run in parallel.

Based on your data, I'm afraid the short-lived connection
test won't show much benefit, since the time to complete an
SMC-R connection should be several orders of magnitude larger
than 100ns.

Best regards,
Dust
On 2024/11/1 18:52, Dust Li wrote:
> On 2024-11-01 16:23:42, liqiang wrote:
>> connections based on redis-benchmark (test in smc loopback-ism mode):
>
> I think you can run a wrk/nginx test with short-lived connections.
> [...]

I tested with nginx; the test commands are:

# server
smc_run nginx

# client
smc_run wrk -t <2,4,8,16,32,64> -c 200 -H "Connection: close" http://127.0.0.1

Requests/sec
--------+---------------+---------------+
req/s   | without patch | apply patch   |
--------+---------------+---------------+
-t 2    |6924.18        |7456.54        |
--------+---------------+---------------+
-t 4    |8731.68        |9660.33        |
--------+---------------+---------------+
-t 8    |11363.22       |13802.08       |
--------+---------------+---------------+
-t 16   |12040.12       |18666.69       |
--------+---------------+---------------+
-t 32   |11460.82       |17017.28       |
--------+---------------+---------------+
-t 64   |11018.65       |14974.80       |
--------+---------------+---------------+

Transfer/sec
--------+---------------+---------------+
trans/s | without patch | apply patch   |
--------+---------------+---------------+
-t 2    |24.72MB        |26.62MB        |
--------+---------------+---------------+
-t 4    |31.18MB        |34.49MB        |
--------+---------------+---------------+
-t 8    |40.57MB        |49.28MB        |
--------+---------------+---------------+
-t 16   |42.99MB        |66.65MB        |
--------+---------------+---------------+
-t 32   |40.92MB        |60.76MB        |
--------+---------------+---------------+
-t 64   |39.34MB        |53.47MB        |
--------+---------------+---------------+

> Based on your data, I'm afraid the short-lived connection
> test won't show much benefit, since the time to complete an
> SMC-R connection should be several orders of magnitude larger
> than 100ns.

Sorry, I didn't explain my test data well before.

The main function optimized by this patch is:

```
struct smc_buf_desc *smc_buf_get_slot(...)
{
	struct smc_buf_desc *buf_slot;

	down_read(lock);
	list_for_each_entry(buf_slot, buf_list, list) {
		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
			up_read(lock);
			return buf_slot;
		}
	}
	up_read(lock);
	return NULL;
}
```

The timing data above is for this function. If the system has 200
active links, then while establishing a new SMC connection this
function may have to traverse all 200 buf_descs, which takes 5~6us.
With 1,000 active links it takes about 30us.

After optimization, this function takes <100ns, independent of the
number of active links.

Moreover, the lock has been removed, which is friendly to
multi-threaded parallel scenarios.

The optimized code is as follows:

```
static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
{
	struct smc_buf_desc *buf_free;
	struct llist_node *llnode;

	if (llist_empty(buf_llist))
		return NULL;
	// lock-less link list don't need an lock
	llnode = llist_del_first(buf_llist);
	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
	WRITE_ONCE(buf_free->used, 1);
	return buf_free;
}
```

--
Cheers,
Li Qiang
On 2024-11-02 14:43:52, Li Qiang wrote:
>On 2024/11/1 18:52, Dust Li wrote:
>> I think you can run a wrk/nginx test with short-lived connections.
>> [...]
>
>I tested with nginx; the test commands are:
>
># server
>smc_run nginx
>
># client
>smc_run wrk -t <2,4,8,16,32,64> -c 200 -H "Connection: close" http://127.0.0.1
>
>[...]
>
>After optimization, this function takes <100ns, independent of the
>number of active links.
>
>Moreover, the lock has been removed, which is friendly to
>multi-threaded parallel scenarios.
>
>The optimized code is as follows:
>
>```
>static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
>{
>	struct smc_buf_desc *buf_free;
>	struct llist_node *llnode;
>
>	if (llist_empty(buf_llist))
>		return NULL;
>	// lock-less link list don't need an lock
         ^^^ kernel uses /* */ for comments

>	llnode = llist_del_first(buf_llist);
>	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);

If 2 CPUs both passed the llist_empty() check, only 1 CPU can get
llnode; the other one would get NULL?

>	WRITE_ONCE(buf_free->used, 1);
>	return buf_free;
>}
>```
>
>--
>Cheers,
>Li Qiang
On 2024/11/4 16:13, Dust Li wrote:
> On 2024-11-02 14:43:52, Li Qiang wrote:
>> [...]
>>
>> The optimized code is as follows:
>>
>> ```
>> static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
>> {
>> 	struct smc_buf_desc *buf_free;
>> 	struct llist_node *llnode;
>>
>> 	if (llist_empty(buf_llist))
>> 		return NULL;
>> 	// lock-less link list don't need an lock
>          ^^^ kernel uses /* */ for comments

OK, I will change it. :-)

>> 	llnode = llist_del_first(buf_llist);
>> 	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
>
> If 2 CPUs both passed the llist_empty() check, only 1 CPU can get
> llnode; the other one would get NULL?

Well, what you said makes sense. I think the preceding llist_empty()
check is useless and can be deleted. This function should be changed
to:

```
static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
{
	struct smc_buf_desc *buf_free;
	struct llist_node *llnode;

	/* lock-less link list don't need an lock */
	llnode = llist_del_first(buf_llist);
	if (llnode == NULL)
		return NULL;
	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
	WRITE_ONCE(buf_free->used, 1);
	return buf_free;
}
```

If there is only one node left in the list, multiple CPUs will
compete via CAS instructions inside llist_del_first(). In the end,
only one consumer gets the node, and the other consumers get a NULL
pointer.

Thank you!

--
Best regards,
Li Qiang
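For reference, llist_del_first() resolves that competition with a cmpxchg retry loop. A simplified sketch modeled on lib/llist.c (the in-tree code may differ in detail):

```
struct llist_node *llist_del_first(struct llist_head *head)
{
	struct llist_node *entry, *next;

	entry = smp_load_acquire(&head->first);
	do {
		if (entry == NULL)
			return NULL;	/* losers of the race see an empty list */
		next = READ_ONCE(entry->next);
		/* only one CPU succeeds in swinging head->first past entry */
	} while (!try_cmpxchg(&head->first, &entry, next));

	return entry;
}
```

Note that include/linux/llist.h additionally requires concurrent llist_del_first() callers to be serialized with a lock: a del_first/add/add sequence on another CPU can recycle 'entry' and let the cmpxchg succeed spuriously (the ABA problem).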
We create a lock-less linked list for the currently
idle, reusable smc_buf_desc entries.

When the 'used' field is set to 0, the descriptor is
added to the lock-less linked list.

When a new connection is established, a suitable
element is obtained directly from this list, which
eliminates the traversal and search, and requires
no lock.
A lock-free linked list is a linked list that uses
atomic operations to optimize the producer-consumer model.
I measured the time spent in this function under
multiple connections with redis-benchmark
(tested in smc loopback-ism mode):
The function 'smc_buf_get_slot' takes less time when a
new SMC link is established:
1. 5us->100ns (when there are 200 active links);
2. 30us->100ns (when there are 1000 active links).
Test data with wrk+nginx command:
On server:
smc_run nginx
On client:
smc_run wrk -t <2~64> -c 200 -H "Connection: close" http://127.0.0.1
Requests/sec
--------+---------------+---------------+
req/s | without patch | apply patch |
--------+---------------+---------------+
-t 2 |6924.18 |7456.54 |
--------+---------------+---------------+
-t 4 |8731.68 |9660.33 |
--------+---------------+---------------+
-t 8 |11363.22 |13802.08 |
--------+---------------+---------------+
-t 16 |12040.12 |18666.69 |
--------+---------------+---------------+
-t 32 |11460.82 |17017.28 |
--------+---------------+---------------+
-t 64 |11018.65 |14974.80 |
--------+---------------+---------------+
Transfer/sec
--------+---------------+---------------+
trans/s | without patch | apply patch |
--------+---------------+---------------+
-t 2 |24.72MB |26.62MB |
--------+---------------+---------------+
-t 4 |31.18MB |34.49MB |
--------+---------------+---------------+
-t 8 |40.57MB |49.28MB |
--------+---------------+---------------+
-t 16 |42.99MB |66.65MB |
--------+---------------+---------------+
-t 32 |40.92MB |60.76MB |
--------+---------------+---------------+
-t 64 |39.34MB |53.47MB |
--------+---------------+---------------+
Signed-off-by: liqiang <liqiang64@huawei.com>
---
v2:
- Correct the acquisition logic of the lock-less linked list. (Dust Li)
- Fix comment style: '//' -> '/* */'. (Dust Li)
v1: https://lore.kernel.org/all/20241101082342.1254-1-liqiang64@huawei.com/
net/smc/smc_core.c | 58 ++++++++++++++++++++++++++++++----------------
net/smc/smc_core.h | 4 ++++
2 files changed, 42 insertions(+), 20 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 500952c2e67b..6f26e70c7c4d 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,7 @@
#include <linux/wait.h>
#include <linux/reboot.h>
#include <linux/mutex.h>
+#include <linux/llist.h>
#include <linux/list.h>
#include <linux/smc.h>
#include <net/tcp.h>
@@ -909,6 +910,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
+ init_llist_head(&lgr->rmbs_free[i]);
+ init_llist_head(&lgr->sndbufs_free[i]);
}
lgr->next_link_id = 0;
smc_lgr_list.num += SMC_LGR_NUM_INCR;
@@ -1183,6 +1186,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
/* memzero_explicit provides potential memory barrier semantics */
memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
WRITE_ONCE(buf_desc->used, 0);
+ if (is_rmb)
+ llist_add(&buf_desc->llist, &lgr->rmbs_free[buf_desc->bufsiz_comp]);
+ else
+ llist_add(&buf_desc->llist, &lgr->sndbufs_free[buf_desc->bufsiz_comp]);
}
}
@@ -1214,6 +1221,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
} else {
memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->sndbuf_desc->used, 0);
+ llist_add(&conn->sndbuf_desc->llist,
+ &lgr->sndbufs_free[conn->sndbuf_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize);
}
@@ -1225,6 +1234,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
bufsize += sizeof(struct smcd_cdc_msg);
memzero_explicit(conn->rmb_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->rmb_desc->used, 0);
+ llist_add(&conn->rmb_desc->llist,
+ &lgr->rmbs_free[conn->rmb_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize);
}
@@ -1413,13 +1424,21 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf_desc;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int i;
for (i = 0; i < SMC_RMBE_SIZES; i++) {
- if (is_rmb)
+ if (is_rmb) {
buf_list = &lgr->rmbs[i];
- else
+ buf_llist = &lgr->rmbs_free[i];
+ } else {
buf_list = &lgr->sndbufs[i];
+ buf_llist = &lgr->sndbufs_free[i];
+ }
+ /* just invalid this list first, and then free the memory
+ * in the following loop
+ */
+ llist_del_all(buf_llist);
list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
list) {
smc_lgr_buf_list_del(lgr, is_rmb, buf_desc);
@@ -2087,24 +2106,19 @@ int smc_uncompress_bufsize(u8 compressed)
return (int)size;
}
-/* try to reuse a sndbuf or rmb description slot for a certain
- * buffer size; if not available, return NULL
- */
-static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
- struct rw_semaphore *lock,
- struct list_head *buf_list)
+/* use lock less list to save and find reuse buf desc */
+static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
{
- struct smc_buf_desc *buf_slot;
+ struct smc_buf_desc *buf_free;
+ struct llist_node *llnode;
- down_read(lock);
- list_for_each_entry(buf_slot, buf_list, list) {
- if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
- up_read(lock);
- return buf_slot;
- }
- }
- up_read(lock);
- return NULL;
+ /* lock-less link list don't need an lock */
+ llnode = llist_del_first(buf_llist);
+ if (!llnode)
+ return NULL;
+ buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
+ WRITE_ONCE(buf_free->used, 1);
+ return buf_free;
}
/* one of the conditions for announcing a receiver's current window size is
@@ -2409,6 +2423,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int bufsize, bufsize_comp;
struct rw_semaphore *lock; /* lock buffer list */
bool is_dgraded = false;
@@ -2424,15 +2439,17 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
bufsize_comp >= 0; bufsize_comp--) {
if (is_rmb) {
lock = &lgr->rmbs_lock;
+ buf_llist = &lgr->rmbs_free[bufsize_comp];
buf_list = &lgr->rmbs[bufsize_comp];
} else {
lock = &lgr->sndbufs_lock;
+ buf_llist = &lgr->sndbufs_free[bufsize_comp];
buf_list = &lgr->sndbufs[bufsize_comp];
}
bufsize = smc_uncompress_bufsize(bufsize_comp);
/* check for reusable slot in the link group */
- buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
+ buf_desc = smc_buf_get_slot_free(buf_llist);
if (buf_desc) {
buf_desc->is_dma_need_sync = 0;
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
@@ -2457,7 +2474,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
- buf_desc->used = 1;
+ WRITE_ONCE(buf_desc->used, 1);
+ WRITE_ONCE(buf_desc->bufsiz_comp, bufsize_comp);
down_write(lock);
smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc);
up_write(lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 69b54ecd6503..076ee15f5c10 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -188,10 +188,12 @@ struct smc_link {
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
+ struct llist_node llist;
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
+ int bufsiz_comp;
union {
struct { /* SMC-R */
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
@@ -278,8 +280,10 @@ struct smc_link_group {
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ struct llist_head sndbufs_free[SMC_RMBE_SIZES]; /* tx buffer free list */
struct rw_semaphore sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ struct llist_head rmbs_free[SMC_RMBE_SIZES]; /* rx buffer free list */
struct rw_semaphore rmbs_lock; /* protects rx buffers */
u64 alloc_sndbufs; /* stats of tx buffers */
u64 alloc_rmbs; /* stats of rx buffers */
--
2.43.0
On 2024-11-05 11:19:38, liqiang wrote:
>We create a lock-less linked list for the currently
>idle, reusable smc_buf_desc entries.
>
>[...]
>
>+/* use lock less list to save and find reuse buf desc */
>+static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist)
> {
>-	struct smc_buf_desc *buf_slot;
>+	struct smc_buf_desc *buf_free;
>+	struct llist_node *llnode;
>
>-	down_read(lock);
>-	list_for_each_entry(buf_slot, buf_list, list) {
>-		if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
>-			up_read(lock);
>-			return buf_slot;
>-		}
>-	}
>-	up_read(lock);
>-	return NULL;
>+	/* lock-less link list don't need an lock */
>+	llnode = llist_del_first(buf_llist);
>+	if (!llnode)
>+		return NULL;
>+	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
>+	WRITE_ONCE(buf_free->used, 1);
>+	return buf_free;

Sorry for the late reply.

It looks like this is not right here.

The rw_semaphore here is not used to protect adding/deleting on the
buf_list, since we don't even add/remove elements on the buf_list in
this function. The cmpxchg already makes sure only one caller gets an
unused smc_buf_desc.

Rather, removing the down_read()/up_read() would let mapping/unmapping
a link on the link group race against the buf_slot alloc/free here.
For example, _smcr_buf_map_lgr() takes the write lock of the
rw_semaphore.

But I agree the lgr->rmbs_lock/sndbufs_lock should be improved. Would
you like to dig into it and improve the usage of the lock here?

Best regards,
Dust
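For context, the mapping pass referred to above has roughly this shape. A condensed sketch of _smcr_buf_map_lgr(); the exact code lives in net/smc/smc_core.c and may differ in detail:

```
static int _smcr_buf_map_lgr(struct smc_link *lnk, struct rw_semaphore *lock,
			     struct list_head *lst, bool is_rmb)
{
	struct smc_buf_desc *buf_desc, *bf;
	int rc = 0;

	down_write(lock);	/* excludes smc_buf_get_slot()'s down_read() */
	list_for_each_entry_safe(buf_desc, bf, lst, list) {
		if (!buf_desc->used)	/* relies on 'used' being stable here */
			continue;
		rc = smcr_buf_map_link(buf_desc, is_rmb, lnk);
		if (rc)
			goto out;
	}
out:
	up_write(lock);
	return rc;
}
```

Without the read lock in the allocation path, a concurrent allocator could flip 'used' from 0 to 1 in the middle of this walk, leaving an in-use buffer unmapped on the new link.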
On 2024/11/5 22:44, Dust Li wrote:
> On 2024-11-05 11:19:38, liqiang wrote:
>> [...]
>>
>> The function 'smc_buf_get_slot' takes less time when a
>> new SMC link is established:
>> 1. 5us->100ns (when there are 200 active links);
>> 2. 30us->100ns (when there are 1000 active links).
>>
>> [...]
>
> Sorry for the late reply.
>
> It looks like this is not right here.
>
> The rw_semaphore here is not used to protect adding/deleting on the
> buf_list, since we don't even add/remove elements on the buf_list in
> this function. The cmpxchg already makes sure only one caller gets an
> unused smc_buf_desc.

I first came up with the idea of optimizing this because the function
needs to traverse all rmbs/sndbufs, including all the active ones,
which is wasted and unnecessary work. Changing to an llist ensures
that a free buf_slot whose 'used' mark is 0 can be obtained directly
every time, without traversing from the first element of the
rmbs/sndbufs list.

> Rather, removing the down_read()/up_read() would let mapping/unmapping
> a link on the link group race against the buf_slot alloc/free here.
> For example, _smcr_buf_map_lgr() takes the write lock of the
> rw_semaphore.

Reading the relevant code: under the down_read()/up_read() here we
only look up a buf_slot whose 'used' is 0 and set it to 1, while
'used' is read in the down_write()/up_write() sections elsewhere.
So I have two questions:

1. Is the read lock of the rw_semaphore necessary here?
   (The read lock here is mutually exclusive with the write lock
   elsewhere; what it should guarantee is that, within the write
   lock's critical section, no 'smc_buf_desc->used' state in
   rmbs/sndbufs changes.)

2. If it is necessary, can we add it to the new implementation in
   this patch, like this?

```
{
	struct smc_buf_desc *buf_free;
	struct llist_node *llnode;

	/* lock-less link list don't need an lock */
	llnode = llist_del_first(buf_llist);
	if (!llnode)
		return NULL;
	buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
	down_read(lock);
	WRITE_ONCE(buf_free->used, 1);
	up_read(lock);
	return buf_free;
}
```

I think this can also ensure that all 'used' marks remain unchanged
during the write-lock critical section.

Anyway, using an llist to manage free rmbs/sndbufs is a better choice
than traversal, and it doesn't conflict with using or not using the
rw_semaphore. :)

> But I agree the lgr->rmbs_lock/sndbufs_lock should be improved. Would
> you like to dig into it and improve the usage of the lock here?

Maybe I can try it. I need to spend some time reading in detail the
code at every place this lock is used.

Thanks for taking the time to read this email. ;-)

--
Best regards,
Li Qiang
We create a lock-less linked list for the currently
idle, reusable smc_buf_desc entries.

When the 'used' field is set to 0, the descriptor is
added to the lock-less linked list.

When a new connection is established, a suitable
element is obtained directly from this list, which
eliminates the traversal and search, and requires
no lock.

Through my testing, this patch can significantly improve
the link establishment speed of SMC, especially in
multi-threaded short-connection benchmarks.
I measured the time spent in this function under
multiple connections with redis-benchmark
(tested in smc loopback-ism mode):
The function 'smc_buf_get_slot' takes less time when a
new SMC link is established:
1. 5us->100ns (when there are 200 active links);
2. 30us->100ns (when there are 1000 active links).
Test data with wrk+nginx command:
On server:
smc_run nginx
On client:
smc_run wrk -t <2~64> -c 200 -H "Connection: close" http://127.0.0.1
Requests/sec
--------+---------------+---------------+
req/s | without patch | apply patch |
--------+---------------+---------------+
-t 2 |6924.18 |7456.54 |
--------+---------------+---------------+
-t 4 |8731.68 |9660.33 |
--------+---------------+---------------+
-t 8 |11363.22 |13802.08 |
--------+---------------+---------------+
-t 16 |12040.12 |18666.69 |
--------+---------------+---------------+
-t 32 |11460.82 |17017.28 |
--------+---------------+---------------+
-t 64 |11018.65 |14974.80 |
--------+---------------+---------------+
Transfer/sec
--------+---------------+---------------+
trans/s | without patch | apply patch |
--------+---------------+---------------+
-t 2 |24.72MB |26.62MB |
--------+---------------+---------------+
-t 4 |31.18MB |34.49MB |
--------+---------------+---------------+
-t 8 |40.57MB |49.28MB |
--------+---------------+---------------+
-t 16 |42.99MB |66.65MB |
--------+---------------+---------------+
-t 32 |40.92MB |60.76MB |
--------+---------------+---------------+
-t 64 |39.34MB |53.47MB |
--------+---------------+---------------+
Test environment:
QEMU emulator version 1.5.3 @ Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GHz
Signed-off-by: liqiang <liqiang64@huawei.com>
---
v3:
- Add lock protection around llist_del_first(), as the llist module
  description requires (see the excerpt quoted after this changelog).
- Restore the read-write semaphore around setting the 'used' mark.
v2: https://lore.kernel.org/all/20241105031938.1319-1-liqiang64@huawei.com/
- Correct the acquisition logic of the lock-less linked list. (Dust Li)
- Fix comment style: '//' -> '/* */'. (Dust Li)
v1: https://lore.kernel.org/all/20241101082342.1254-1-liqiang64@huawei.com/
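The llist module description referred to above is the locking table in include/linux/llist.h (abridged here), which is why llist_del_first() now runs under a spinlock:

```
 *           |   add    | del_first |  del_all
 * add       |    -     |     -     |     -
 * del_first |          |     L     |     L
 * del_all   |          |           |     -
 *
 * Where "-" stands for no lock is needed, while "L" stands for lock
 * is needed: concurrent del_first vs. del_first and del_first vs.
 * del_all require locking.
```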
net/smc/smc_core.c | 65 +++++++++++++++++++++++++++++++++-------------
net/smc/smc_core.h | 6 +++++
2 files changed, 53 insertions(+), 18 deletions(-)
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 500952c2e67b..238eb61ac653 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -16,6 +16,7 @@
#include <linux/wait.h>
#include <linux/reboot.h>
#include <linux/mutex.h>
+#include <linux/llist.h>
#include <linux/list.h>
#include <linux/smc.h>
#include <net/tcp.h>
@@ -906,9 +907,13 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
init_rwsem(&lgr->sndbufs_lock);
init_rwsem(&lgr->rmbs_lock);
rwlock_init(&lgr->conns_lock);
+ spin_lock_init(&lgr->sndbufs_free_lock);
+ spin_lock_init(&lgr->rmbs_free_lock);
for (i = 0; i < SMC_RMBE_SIZES; i++) {
INIT_LIST_HEAD(&lgr->sndbufs[i]);
INIT_LIST_HEAD(&lgr->rmbs[i]);
+ init_llist_head(&lgr->rmbs_free[i]);
+ init_llist_head(&lgr->sndbufs_free[i]);
}
lgr->next_link_id = 0;
smc_lgr_list.num += SMC_LGR_NUM_INCR;
@@ -1183,6 +1188,10 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
/* memzero_explicit provides potential memory barrier semantics */
memzero_explicit(buf_desc->cpu_addr, buf_desc->len);
WRITE_ONCE(buf_desc->used, 0);
+ if (is_rmb)
+ llist_add(&buf_desc->llist, &lgr->rmbs_free[buf_desc->bufsiz_comp]);
+ else
+ llist_add(&buf_desc->llist, &lgr->sndbufs_free[buf_desc->bufsiz_comp]);
}
}
@@ -1214,6 +1223,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
} else {
memzero_explicit(conn->sndbuf_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->sndbuf_desc->used, 0);
+ llist_add(&conn->sndbuf_desc->llist,
+ &lgr->sndbufs_free[conn->sndbuf_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, false, false, bufsize);
}
@@ -1225,6 +1236,8 @@ static void smc_buf_unuse(struct smc_connection *conn,
bufsize += sizeof(struct smcd_cdc_msg);
memzero_explicit(conn->rmb_desc->cpu_addr, bufsize);
WRITE_ONCE(conn->rmb_desc->used, 0);
+ llist_add(&conn->rmb_desc->llist,
+ &lgr->rmbs_free[conn->rmb_desc->bufsiz_comp]);
}
SMC_STAT_RMB_SIZE(smc, is_smcd, true, false, bufsize);
}
@@ -1413,13 +1426,21 @@ static void __smc_lgr_free_bufs(struct smc_link_group *lgr, bool is_rmb)
{
struct smc_buf_desc *buf_desc, *bf_desc;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int i;
for (i = 0; i < SMC_RMBE_SIZES; i++) {
- if (is_rmb)
+ if (is_rmb) {
buf_list = &lgr->rmbs[i];
- else
+ buf_llist = &lgr->rmbs_free[i];
+ } else {
buf_list = &lgr->sndbufs[i];
+ buf_llist = &lgr->sndbufs_free[i];
+ }
+ /* just invalid this list first, and then free the memory
+ * in the following loop
+ */
+ llist_del_all(buf_llist);
list_for_each_entry_safe(buf_desc, bf_desc, buf_list,
list) {
smc_lgr_buf_list_del(lgr, is_rmb, buf_desc);
@@ -2087,24 +2108,25 @@ int smc_uncompress_bufsize(u8 compressed)
return (int)size;
}
-/* try to reuse a sndbuf or rmb description slot for a certain
- * buffer size; if not available, return NULL
- */
-static struct smc_buf_desc *smc_buf_get_slot(int compressed_bufsize,
- struct rw_semaphore *lock,
- struct list_head *buf_list)
+/* use lock less list to save and find reuse buf desc */
+static struct smc_buf_desc *smc_buf_get_slot_free(struct llist_head *buf_llist,
+ spinlock_t *llock, struct rw_semaphore *lock)
{
- struct smc_buf_desc *buf_slot;
+ struct smc_buf_desc *buf_free;
+ struct llist_node *llnode;
+
+	/* llist_del_first() callers must be serialized (see llist.h) */
+ spin_lock(llock);
+ llnode = llist_del_first(buf_llist);
+ spin_unlock(llock);
+ if (!llnode)
+ return NULL;
+ buf_free = llist_entry(llnode, struct smc_buf_desc, llist);
down_read(lock);
- list_for_each_entry(buf_slot, buf_list, list) {
- if (cmpxchg(&buf_slot->used, 0, 1) == 0) {
- up_read(lock);
- return buf_slot;
- }
- }
+ WRITE_ONCE(buf_free->used, 1);
up_read(lock);
- return NULL;
+ return buf_free;
}
/* one of the conditions for announcing a receiver's current window size is
@@ -2409,8 +2431,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
struct smc_connection *conn = &smc->conn;
struct smc_link_group *lgr = conn->lgr;
struct list_head *buf_list;
+ struct llist_head *buf_llist;
int bufsize, bufsize_comp;
struct rw_semaphore *lock; /* lock buffer list */
+ spinlock_t *llock;
bool is_dgraded = false;
if (is_rmb)
@@ -2424,15 +2448,19 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
bufsize_comp >= 0; bufsize_comp--) {
if (is_rmb) {
lock = &lgr->rmbs_lock;
+ llock = &lgr->rmbs_free_lock;
+ buf_llist = &lgr->rmbs_free[bufsize_comp];
buf_list = &lgr->rmbs[bufsize_comp];
} else {
lock = &lgr->sndbufs_lock;
+ llock = &lgr->sndbufs_free_lock;
+ buf_llist = &lgr->sndbufs_free[bufsize_comp];
buf_list = &lgr->sndbufs[bufsize_comp];
}
bufsize = smc_uncompress_bufsize(bufsize_comp);
/* check for reusable slot in the link group */
- buf_desc = smc_buf_get_slot(bufsize_comp, lock, buf_list);
+ buf_desc = smc_buf_get_slot_free(buf_llist, llock, lock);
if (buf_desc) {
buf_desc->is_dma_need_sync = 0;
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
@@ -2457,7 +2485,8 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb)
SMC_STAT_RMB_ALLOC(smc, is_smcd, is_rmb);
SMC_STAT_RMB_SIZE(smc, is_smcd, is_rmb, true, bufsize);
- buf_desc->used = 1;
+ WRITE_ONCE(buf_desc->used, 1);
+ WRITE_ONCE(buf_desc->bufsiz_comp, bufsize_comp);
down_write(lock);
smc_lgr_buf_list_add(lgr, is_rmb, buf_list, buf_desc);
up_write(lock);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 69b54ecd6503..2f45a21796ae 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -188,10 +188,12 @@ struct smc_link {
/* tx/rx buffer list element for sndbufs list and rmbs list of a lgr */
struct smc_buf_desc {
struct list_head list;
+ struct llist_node llist;
void *cpu_addr; /* virtual address of buffer */
struct page *pages;
int len; /* length of buffer */
u32 used; /* currently used / unused */
+ int bufsiz_comp;
union {
struct { /* SMC-R */
struct sg_table sgt[SMC_LINKS_PER_LGR_MAX];
@@ -278,8 +280,12 @@ struct smc_link_group {
unsigned short vlan_id; /* vlan id of link group */
struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */
+ struct llist_head sndbufs_free[SMC_RMBE_SIZES]; /* tx buffer free list */
+ spinlock_t sndbufs_free_lock;
struct rw_semaphore sndbufs_lock; /* protects tx buffers */
struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */
+ struct llist_head rmbs_free[SMC_RMBE_SIZES]; /* rx buffer free list */
+ spinlock_t rmbs_free_lock;
struct rw_semaphore rmbs_lock; /* protects rx buffers */
u64 alloc_sndbufs; /* stats of tx buffers */
u64 alloc_rmbs; /* stats of rx buffers */
--
2.43.0
On Tue, 12 Nov 2024 17:22:16 +0800 liqiang wrote:
> We create a lock-less linked list for the currently
> idle, reusable smc_buf_desc entries.
>
> [...]
>
> The function 'smc_buf_get_slot' takes less time when a
> new SMC link is established:
> 1. 5us->100ns (when there are 200 active links);
> 2. 30us->100ns (when there are 1000 active links).

We're closing net-next and didn't get any review tags on this patch,
either from IBM or Alibaba folks. Please try again after the merge
window (after Dec 2nd, once v6.13-rc1 is tagged).
--
pw-bot: defer