[PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter

Posted by Jiebin Sun 3 years, 6 months ago
The msg_bytes and msg_hdrs atomic counters are frequently
updated when the IPC msg queue is in heavy use, causing heavy
cache bouncing and overhead. Changing them to percpu_counter
greatly improves the performance. Since there is one percpu
struct per namespace, the additional memory cost is minimal.
Reading of the count is done in the msgctl call, which is
infrequent, so summing up the per-CPU counts is also infrequent.

Applying the patch and testing with pts/stress-ng-1.4.0
-- system v message passing (160 threads) gives:

Score gain: 3.99x

CPU: ICX 8380 x 2 sockets
Core number: 40 x 2 physical cores
Benchmark: pts/stress-ng-1.4.0
-- system v message passing (160 threads)

Signed-off-by: Jiebin Sun <jiebin.sun@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 include/linux/ipc_namespace.h |  5 ++--
 ipc/msg.c                     | 44 ++++++++++++++++++++++++-----------
 ipc/namespace.c               |  5 +++-
 ipc/util.h                    |  4 ++--
 4 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h
index e3e8c8662b49..e8240cf2611a 100644
--- a/include/linux/ipc_namespace.h
+++ b/include/linux/ipc_namespace.h
@@ -11,6 +11,7 @@
 #include <linux/refcount.h>
 #include <linux/rhashtable-types.h>
 #include <linux/sysctl.h>
+#include <linux/percpu_counter.h>
 
 struct user_namespace;
 
@@ -36,8 +37,8 @@ struct ipc_namespace {
 	unsigned int	msg_ctlmax;
 	unsigned int	msg_ctlmnb;
 	unsigned int	msg_ctlmni;
-	atomic_t	msg_bytes;
-	atomic_t	msg_hdrs;
+	struct percpu_counter percpu_msg_bytes;
+	struct percpu_counter percpu_msg_hdrs;
 
 	size_t		shm_ctlmax;
 	size_t		shm_ctlall;
diff --git a/ipc/msg.c b/ipc/msg.c
index a0d05775af2c..f2bb4c193ecf 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -39,6 +39,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
 #include <linux/rhashtable.h>
+#include <linux/percpu_counter.h>
 
 #include <asm/current.h>
 #include <linux/uaccess.h>
@@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp)
 	rcu_read_unlock();
 
 	list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) {
-		atomic_dec(&ns->msg_hdrs);
+		percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
 		free_msg(msg);
 	}
-	atomic_sub(msq->q_cbytes, &ns->msg_bytes);
+	percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes);
 	ipc_update_pid(&msq->q_lspid, NULL);
 	ipc_update_pid(&msq->q_lrpid, NULL);
 	ipc_rcu_putref(&msq->q_perm, msg_rcu_free);
@@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
 	msginfo->msgssz = MSGSSZ;
 	msginfo->msgseg = MSGSEG;
 	down_read(&msg_ids(ns).rwsem);
-	if (cmd == MSG_INFO) {
+	if (cmd == MSG_INFO)
 		msginfo->msgpool = msg_ids(ns).in_use;
-		msginfo->msgmap = atomic_read(&ns->msg_hdrs);
-		msginfo->msgtql = atomic_read(&ns->msg_bytes);
+	max_idx = ipc_get_maxidx(&msg_ids(ns));
+	up_read(&msg_ids(ns).rwsem);
+	if (cmd == MSG_INFO) {
+		msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
+		msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
 	} else {
 		msginfo->msgmap = MSGMAP;
 		msginfo->msgpool = MSGPOOL;
 		msginfo->msgtql = MSGTQL;
 	}
-	max_idx = ipc_get_maxidx(&msg_ids(ns));
-	up_read(&msg_ids(ns).rwsem);
 	return (max_idx < 0) ? 0 : max_idx;
 }
 
@@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __user *mtext,
 		list_add_tail(&msg->m_list, &msq->q_messages);
 		msq->q_cbytes += msgsz;
 		msq->q_qnum++;
-		atomic_add(msgsz, &ns->msg_bytes);
-		atomic_inc(&ns->msg_hdrs);
+		percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz);
+		percpu_counter_add_local(&ns->percpu_msg_hdrs, 1);
 	}
 
 	err = 0;
@@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, in
 			msq->q_rtime = ktime_get_real_seconds();
 			ipc_update_pid(&msq->q_lrpid, task_tgid(current));
 			msq->q_cbytes -= msg->m_ts;
-			atomic_sub(msg->m_ts, &ns->msg_bytes);
-			atomic_dec(&ns->msg_hdrs);
+			percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts);
+			percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1);
 			ss_wakeup(msq, &wake_q, false);
 
 			goto out_unlock0;
@@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_uptr_t, msgp,
 }
 #endif
 
-void msg_init_ns(struct ipc_namespace *ns)
+int msg_init_ns(struct ipc_namespace *ns)
 {
+	int ret;
+
 	ns->msg_ctlmax = MSGMAX;
 	ns->msg_ctlmnb = MSGMNB;
 	ns->msg_ctlmni = MSGMNI;
 
-	atomic_set(&ns->msg_bytes, 0);
-	atomic_set(&ns->msg_hdrs, 0);
+	ret = percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL);
+	if (ret)
+		goto fail_msg_bytes;
+	ret = percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL);
+	if (ret)
+		goto fail_msg_hdrs;
 	ipc_init_ids(&ns->ids[IPC_MSG_IDS]);
+	return 0;
+
+	fail_msg_hdrs:
+		percpu_counter_destroy(&ns->percpu_msg_bytes);
+	fail_msg_bytes:
+		return ret;
 }
 
 #ifdef CONFIG_IPC_NS
 void msg_exit_ns(struct ipc_namespace *ns)
 {
+	percpu_counter_destroy(&ns->percpu_msg_bytes);
+	percpu_counter_destroy(&ns->percpu_msg_hdrs);
 	free_ipcs(ns, &msg_ids(ns), freeque);
 	idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr);
 	rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht);
diff --git a/ipc/namespace.c b/ipc/namespace.c
index e1fcaedba4fa..8316ea585733 100644
--- a/ipc/namespace.c
+++ b/ipc/namespace.c
@@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_namespace *user_ns,
 	if (!setup_ipc_sysctls(ns))
 		goto fail_mq;
 
+	err = msg_init_ns(ns);
+	if (err)
+		goto fail_put;
+
 	sem_init_ns(ns);
-	msg_init_ns(ns);
 	shm_init_ns(ns);
 
 	return ns;
diff --git a/ipc/util.h b/ipc/util.h
index 2dd7ce0416d8..1b0086c6346f 100644
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) { }
 
 #ifdef CONFIG_SYSVIPC
 void sem_init_ns(struct ipc_namespace *ns);
-void msg_init_ns(struct ipc_namespace *ns);
+int msg_init_ns(struct ipc_namespace *ns);
 void shm_init_ns(struct ipc_namespace *ns);
 
 void sem_exit_ns(struct ipc_namespace *ns);
@@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns);
 void shm_exit_ns(struct ipc_namespace *ns);
 #else
 static inline void sem_init_ns(struct ipc_namespace *ns) { }
-static inline void msg_init_ns(struct ipc_namespace *ns) { }
+static inline int msg_init_ns(struct ipc_namespace *ns) { return 0;}
 static inline void shm_init_ns(struct ipc_namespace *ns) { }
 
 static inline void sem_exit_ns(struct ipc_namespace *ns) { }
-- 
2.31.1
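
For context, the contended path is the msgsnd()/msgrcv() pair: each send
adds the message size to msg_bytes and increments msg_hdrs, and each
receive subtracts them again. Below is a minimal user-space sketch of the
pattern the stress-ng test drives (illustrative only, not taken from
stress-ng; message size and iteration count are arbitrary):

#include <stdio.h>
#include <string.h>
#include <sys/ipc.h>
#include <sys/msg.h>

struct msg { long mtype; char mtext[64]; };

int main(void)
{
	/* private queue, removed again at the end */
	int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
	if (qid < 0) { perror("msgget"); return 1; }

	for (int i = 0; i < 1000000; i++) {
		struct msg m = { .mtype = 1 };
		strcpy(m.mtext, "ping");
		/* msgsnd(): kernel adds msgsz to msg_bytes, increments msg_hdrs */
		if (msgsnd(qid, &m, sizeof(m.mtext), 0) < 0) { perror("msgsnd"); break; }
		/* msgrcv(): kernel subtracts the same amounts again */
		if (msgrcv(qid, &m, sizeof(m.mtext), 1, 0) < 0) { perror("msgrcv"); break; }
	}

	msgctl(qid, IPC_RMID, NULL);
	return 0;
}

With 160 such workers running concurrently, every CPU updates the same two
shared counters on every syscall, which is the cache bouncing that the
per-CPU counters spread out.
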
Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter
Posted by Manfred Spraul 3 years, 6 months ago
Hi Jiebin,

On 9/13/22 21:25, Jiebin Sun wrote:
> The msg_bytes and msg_hdrs atomic counters are frequently
> updated when the IPC msg queue is in heavy use, causing heavy
> cache bouncing and overhead. Changing them to percpu_counter
> greatly improves the performance. Since there is one percpu
> struct per namespace, the additional memory cost is minimal.
> Reading of the count is done in the msgctl call, which is
> infrequent, so summing up the per-CPU counts is also infrequent.
>
> Applying the patch and testing with pts/stress-ng-1.4.0
> -- system v message passing (160 threads) gives:
>
> Score gain: 3.99x
>
> CPU: ICX 8380 x 2 sockets
> Core number: 40 x 2 physical cores
> Benchmark: pts/stress-ng-1.4.0
> -- system v message passing (160 threads)
>
> Signed-off-by: Jiebin Sun <jiebin.sun@intel.com>
> Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Manfred Spraul <manfred@colorfullif.com>
> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
>   	msginfo->msgssz = MSGSSZ;
>   	msginfo->msgseg = MSGSEG;
>   	down_read(&msg_ids(ns).rwsem);
> -	if (cmd == MSG_INFO) {
> +	if (cmd == MSG_INFO)
>   		msginfo->msgpool = msg_ids(ns).in_use;
> -		msginfo->msgmap = atomic_read(&ns->msg_hdrs);
> -		msginfo->msgtql = atomic_read(&ns->msg_bytes);
> +	max_idx = ipc_get_maxidx(&msg_ids(ns));
> +	up_read(&msg_ids(ns).rwsem);
> +	if (cmd == MSG_INFO) {
> +		msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
> +		msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);

Not caused by your change, it just now becomes obvious:

msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and the 
actual counters are 64-bit.
This can overflow - and I think the code should handle this. Just clamp 
the values to INT_MAX.
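
A self-contained illustration of that narrowing, with a made-up value
(plain user-space C, not kernel code):

#include <stdio.h>

int main(void)
{
	long long sum = 3LL << 30;  /* e.g. ~3 GiB of queued bytes, as percpu_counter_sum() might return */
	int msgtql = (int)sum;      /* the unclamped assignment into the 32-bit msginfo field */

	printf("%lld -> %d\n", sum, msgtql);  /* prints 3221225472 -> -1073741824 */
	return 0;
}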

-- 

     Manfred


Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter
Posted by Sun, Jiebin 3 years, 6 months ago
On 9/18/2022 8:53 PM, Manfred Spraul wrote:
> Hi Jiebin,
>
> On 9/13/22 21:25, Jiebin Sun wrote:
>> [...]
> Reviewed-by: Manfred Spraul <manfred@colorfullif.com>
>> @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace 
>> *ns, int msqid,
>>       msginfo->msgssz = MSGSSZ;
>>       msginfo->msgseg = MSGSEG;
>>       down_read(&msg_ids(ns).rwsem);
>> -    if (cmd == MSG_INFO) {
>> +    if (cmd == MSG_INFO)
>>           msginfo->msgpool = msg_ids(ns).in_use;
>> -        msginfo->msgmap = atomic_read(&ns->msg_hdrs);
>> -        msginfo->msgtql = atomic_read(&ns->msg_bytes);
>> +    max_idx = ipc_get_maxidx(&msg_ids(ns));
>> +    up_read(&msg_ids(ns).rwsem);
>> +    if (cmd == MSG_INFO) {
>> +        msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
>> +        msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
>
> Not caused by your change, it just now becomes obvious:
>
> msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and the 
> actual counters are 64-bit.
> This can overflow - and I think the code should handle this. Just 
> clamp the values to INT_MAX.
>
Hi Manfred,

Thanks for your advice. But I'm not sure clamp(val, low, INT_MAX) fully
fixes the overflow issue in ipc/msg: if the value exceeds what an s32 can
hold, we avoid the sign reversal, but we still cannot report the accurate
value.

Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter
Posted by Manfred Spraul 3 years, 6 months ago
On 9/20/22 04:36, Sun, Jiebin wrote:
>
> On 9/18/2022 8:53 PM, Manfred Spraul wrote:
>> Hi Jiebin,
>>
>> On 9/13/22 21:25, Jiebin Sun wrote:
>>> [...]
>>
>> Not caused by your change, it just now becomes obvious:
>>
>> msginfo->msgmap and ->msgtql are type int, i.e. signed 32-bit, and 
>> the actual counters are 64-bit.
>> This can overflow - and I think the code should handle this. Just 
>> clamp the values to INT_MAX.
>>
> Hi Manfred,
>
> Thanks for your advice. But I'm not sure clamp(val, low, INT_MAX) fully
> fixes the overflow issue in ipc/msg: if the value exceeds what an s32
> can hold, we avoid the sign reversal, but we still cannot report the
> accurate value.

I think just clamping it to INT_MAX is the best approach.
Reporting negative values is worse than clamping. If (and only if) there
are real users that need to know the total amount of memory allocated
for message queues in one namespace, then we could add a MSG_INFO64 with
long values. But I would not add that right now; I do not see a real use
case where the value would be needed.

Any other opinions?
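
As a sketch of what the clamping could look like, keeping the comparison
in the counter's 64-bit type before narrowing so a large sum cannot wrap
(the helper name is invented here; this is not the posted follow-up patch):

#include <limits.h>

/* percpu_counter_sum() returns s64; compare in 64 bits, then narrow */
static inline int ipc_clamp_to_int(long long sum)
{
	return (sum > INT_MAX) ? INT_MAX : (int)sum;
}

/* usage in msgctl_info(), roughly:
 *   msginfo->msgmap = ipc_clamp_to_int(percpu_counter_sum(&ns->percpu_msg_hdrs));
 *   msginfo->msgtql = ipc_clamp_to_int(percpu_counter_sum(&ns->percpu_msg_bytes));
 */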

--

     Manfred

Re: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter
Posted by Sun, Jiebin 3 years, 6 months ago
On 9/20/2022 12:53 PM, Manfred Spraul wrote:
> On 9/20/22 04:36, Sun, Jiebin wrote:
>> [...]
>
> I think just clamping it to INT_MAX is the best approach.
> Reporting negative values is worse than clamping. If (and only if) there
> are real users that need to know the total amount of memory allocated
> for message queues in one namespace, then we could add a MSG_INFO64 with
> long values. But I would not add that right now; I do not see a real use
> case where the value would be needed.
>
> Any other opinions?
>
> -- 
>
>     Manfred
>
>
OK. I will work on it and send it out for review.
[PATCH] ipc/msg: avoid negative value by overflow in msginfo
Posted by Jiebin Sun 3 years, 6 months ago
The 32-bit fields in the msginfo struct can become negative when they
are assigned from the signed 64-bit percpu counter sums. Clamping the
values to INT_MAX avoids the sign overflow.

Signed-off-by: Jiebin Sun <jiebin.sun@intel.com>
Reviewed-by: Manfred Spraul <manfred@colorfullif.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
---
 ipc/msg.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ipc/msg.c b/ipc/msg.c
index f2bb4c193ecf..65f437e28c9b 100644
--- a/ipc/msg.c
+++ b/ipc/msg.c
@@ -501,8 +501,8 @@ static int msgctl_info(struct ipc_namespace *ns, int msqid,
 	max_idx = ipc_get_maxidx(&msg_ids(ns));
 	up_read(&msg_ids(ns).rwsem);
 	if (cmd == MSG_INFO) {
-		msginfo->msgmap = percpu_counter_sum(&ns->percpu_msg_hdrs);
-		msginfo->msgtql = percpu_counter_sum(&ns->percpu_msg_bytes);
+		msginfo->msgmap = min(percpu_counter_sum(&ns->percpu_msg_hdrs), INT_MAX);
+		msginfo->msgtql = min(percpu_counter_sum(&ns->percpu_msg_bytes), INT_MAX);
 	} else {
 		msginfo->msgmap = MSGMAP;
 		msginfo->msgpool = MSGPOOL;
-- 
2.31.1