From nobody Sun Apr 5 22:50:21 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id E144BC6FA82 for ; Tue, 13 Sep 2022 11:07:18 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231889AbiIMLHR (ORCPT ); Tue, 13 Sep 2022 07:07:17 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:51886 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231886AbiIMLHG (ORCPT ); Tue, 13 Sep 2022 07:07:06 -0400 Received: from mga02.intel.com (mga02.intel.com [134.134.136.20]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 662C05F9A3 for ; Tue, 13 Sep 2022 04:06:56 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1663067216; x=1694603216; h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=MiEDfww/B/2De+3MVM2YwC08RnmErrMIpkB17KjlA6Y=; b=Y5yjog+L8vTo3owYBw8Eob7QTbPIHQpdBw6ao1ZMOL0pybaL359rl5mR FRFPH1FHrMwQFQ9lUYZIixboV5U0LolUDo/GL7XqnHfBh4ujsOHcylsRb qAMekPV3Qp8ADJ2tSnvaYcoc1YmxSFsMgFdrq1+D7Rg9B7H6IShq1SPoK r72WTmKI6LBapT3us6qbTzkaGj5UWcf3SaPIfOjeQkQ3qlDkhFR6PxJvH 2E/dYB/PodtVRMFkEPHCtgQdQHznNvTt3XJTjikRuhoTzmn/Xhdd3cQQW oOfYqOPyHC/Jx6C3vSBnNdvaM3M/03RopBajF0kcY1/NkfU4ncPoHuzFq g==; X-IronPort-AV: E=McAfee;i="6500,9779,10468"; a="285130178" X-IronPort-AV: E=Sophos;i="5.93,312,1654585200"; d="scan'208";a="285130178" Received: from fmsmga008.fm.intel.com ([10.253.24.58]) by orsmga101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 13 Sep 2022 04:06:55 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.93,312,1654585200"; d="scan'208";a="678522192" Received: from linux-pnp-server-13.sh.intel.com ([10.239.176.176]) by fmsmga008.fm.intel.com with ESMTP; 13 Sep 2022 04:06:50 -0700 From: Jiebin Sun 
To: akpm@linux-foundation.org, vasily.averin@linux.dev, shakeelb@google.com, dennis@kernel.org, tj@kernel.org, cl@linux.com, ebiederm@xmission.com, legion@kernel.org, manfred@colorfullife.com, alexander.mikhalitsyn@virtuozzo.com, linux-mm@kvack.org, linux-kernel@vger.kernel.org Cc: tim.c.chen@intel.com, feng.tang@intel.com, ying.huang@intel.com, tianyou.li@intel.com, wangyang.guo@intel.com, jiebin.sun@intel.com, Tim Chen , kernel test robot Subject: [PATCH v6 1/2] percpu: Add percpu_counter_add_local and percpu_counter_sub_local Date: Wed, 14 Sep 2022 03:25:37 +0800 Message-Id: <20220913192538.3023708-2-jiebin.sun@intel.com> X-Mailer: git-send-email 2.31.1 In-Reply-To: <20220913192538.3023708-1-jiebin.sun@intel.com> References: <20220902152243.479592-1-jiebin.sun@intel.com> <20220913192538.3023708-1-jiebin.sun@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" The batch size in percpu_counter_add_batch should be very large in the heavy-writing, rare-reading case. Add the "_local" version, which mostly does local adding, reduces global updating and mitigates lock contention in writing. 
Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Reported-by: kernel test robot --- include/linux/percpu_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 01861eebed79..8ed5fba6d156 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -15,6 +15,9 @@ #include #include =20 +/* percpu_counter batch for local add or sub */ +#define PERCPU_COUNTER_LOCAL_BATCH INT_MAX + #ifdef CONFIG_SMP =20 struct percpu_counter { @@ -56,6 +59,22 @@ static inline void percpu_counter_add(struct percpu_coun= ter *fbc, s64 amount) percpu_counter_add_batch(fbc, amount, percpu_counter_batch); } =20 +/* + * With percpu_counter_add_local() and percpu_counter_sub_local(), counts + * are accumulated in local per cpu counter and not in fbc->count until + * local count overflows PERCPU_COUNTER_LOCAL_BATCH. This makes counter + * write efficient. + * But percpu_counter_sum(), instead of percpu_counter_read(), needs to be + * used to add up the counts from each CPU to account for all the local + * counts. So percpu_counter_add_local() and percpu_counter_sub_local() + * should be used when a counter is updated frequently and read rarely. 
+ */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_batch(fbc, amount, PERCPU_COUNTER_LOCAL_BATCH); +} + static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { s64 ret =3D __percpu_counter_sum(fbc); @@ -138,6 +157,13 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amo= unt) preempt_enable(); } =20 +/* non-SMP percpu_counter_add_local is the same with percpu_counter_add */ +static inline void +percpu_counter_add_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add(fbc, amount); +} + static inline void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { @@ -193,4 +219,10 @@ static inline void percpu_counter_sub(struct percpu_co= unter *fbc, s64 amount) percpu_counter_add(fbc, -amount); } =20 +static inline void +percpu_counter_sub_local(struct percpu_counter *fbc, s64 amount) +{ + percpu_counter_add_local(fbc, -amount); +} + #endif /* _LINUX_PERCPU_COUNTER_H */ --=20 2.31.1 From nobody Sun Apr 5 22:50:21 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id DC32EC54EE9 for ; Tue, 13 Sep 2022 11:07:24 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230494AbiIMLHW (ORCPT ); Tue, 13 Sep 2022 07:07:22 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:51858 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231867AbiIMLHJ (ORCPT ); Tue, 13 Sep 2022 07:07:09 -0400 Received: from mga17.intel.com (mga17.intel.com [192.55.52.151]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id BE2035FAEE for ; Tue, 13 Sep 2022 04:07:03 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1663067223; x=1694603223; 
h=from:to:cc:subject:date:message-id:in-reply-to: references:mime-version:content-transfer-encoding; bh=5knzSwtfIr70idY7Uq53KmegV/XH0F4ijHoYuGt0KYw=; b=bTJ9Q3zQVEbOzeTBl3k8328pQ2Wf8NTs0NiuvLqr4TeJ62siCE3kbQF+ lv42mKe73hhNANZAeLy7o67+CRbrQskbMJKGBz0FnoROhJvQd2fqg0lUj WpILD7HlBYyVVt43YnwU0OxGLr9lFT6W1f7BeIDiNGopO5GkJkdHm3Cw7 pE2bkQRURJtplufq+k99fNwtwjZ0MDQVtmwNfzWeJTU8Rw+FLg0mT0k1D eZwoZw6odGerO5NoumiZ2KAFlPunHbl7CgMusjJ1kX0GocFPAKpVIpfuH ezmSbCfh+BHVdmFh0X0rpS/ugW3paGK0FTFGPI4XF+P6KQR62JF0nths7 A==; X-IronPort-AV: E=McAfee;i="6500,9779,10468"; a="278501233" X-IronPort-AV: E=Sophos;i="5.93,312,1654585200"; d="scan'208";a="278501233" Received: from fmsmga008.fm.intel.com ([10.253.24.58]) by fmsmga107.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 13 Sep 2022 04:07:02 -0700 X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="5.93,312,1654585200"; d="scan'208";a="678522235" Received: from linux-pnp-server-13.sh.intel.com ([10.239.176.176]) by fmsmga008.fm.intel.com with ESMTP; 13 Sep 2022 04:06:57 -0700 From: Jiebin Sun To: akpm@linux-foundation.org, vasily.averin@linux.dev, shakeelb@google.com, dennis@kernel.org, tj@kernel.org, cl@linux.com, ebiederm@xmission.com, legion@kernel.org, manfred@colorfullife.com, alexander.mikhalitsyn@virtuozzo.com, linux-mm@kvack.org, linux-kernel@vger.kernel.org Cc: tim.c.chen@intel.com, feng.tang@intel.com, ying.huang@intel.com, tianyou.li@intel.com, wangyang.guo@intel.com, jiebin.sun@intel.com, Tim Chen Subject: [PATCH v6 2/2] ipc/msg: mitigate the lock contention with percpu counter Date: Wed, 14 Sep 2022 03:25:38 +0800 Message-Id: <20220913192538.3023708-3-jiebin.sun@intel.com> X-Mailer: git-send-email 2.31.1 In-Reply-To: <20220913192538.3023708-1-jiebin.sun@intel.com> References: <20220902152243.479592-1-jiebin.sun@intel.com> <20220913192538.3023708-1-jiebin.sun@intel.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; 
charset="utf-8" The msg_bytes and msg_hdrs atomic counters are frequently updated when IPC msg queue is in heavy use, causing heavy cache bounce and overhead. Changing them to percpu_counter greatly improves the performance. Since there is one percpu struct per namespace, additional memory cost is minimal. Reading of the count is done in the msgctl call, which is infrequent. So the need to sum up the counts in each CPU is infrequent. Apply the patch and test the pts/stress-ng-1.4.0 -- system v message passing (160 threads). Score gain: 3.99x CPU: ICX 8380 x 2 sockets Core number: 40 x 2 physical cores Benchmark: pts/stress-ng-1.4.0 -- system v message passing (160 threads) Signed-off-by: Jiebin Sun Reviewed-by: Tim Chen Reviewed-by: Manfred Spraul --- include/linux/ipc_namespace.h | 5 ++-- ipc/msg.c | 44 ++++++++++++++++++++++++----------- ipc/namespace.c | 5 +++- ipc/util.h | 4 ++-- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index e3e8c8662b49..e8240cf2611a 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -11,6 +11,7 @@ #include #include #include +#include =20 struct user_namespace; =20 @@ -36,8 +37,8 @@ struct ipc_namespace { unsigned int msg_ctlmax; unsigned int msg_ctlmnb; unsigned int msg_ctlmni; - atomic_t msg_bytes; - atomic_t msg_hdrs; + struct percpu_counter percpu_msg_bytes; + struct percpu_counter percpu_msg_hdrs; =20 size_t shm_ctlmax; size_t shm_ctlall; diff --git a/ipc/msg.c b/ipc/msg.c index a0d05775af2c..f2bb4c193ecf 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -39,6 +39,7 @@ #include #include #include +#include =20 #include #include @@ -285,10 +286,10 @@ static void freeque(struct ipc_namespace *ns, struct = kern_ipc_perm *ipcp) rcu_read_unlock(); =20 list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); free_msg(msg); } - atomic_sub(msq->q_cbytes, 
&ns->msg_bytes); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); ipc_update_pid(&msq->q_lrpid, NULL); ipc_rcu_putref(&msq->q_perm, msg_rcu_free); @@ -495,17 +496,18 @@ static int msgctl_info(struct ipc_namespace *ns, int = msqid, msginfo->msgssz =3D MSGSSZ; msginfo->msgseg =3D MSGSEG; down_read(&msg_ids(ns).rwsem); - if (cmd =3D=3D MSG_INFO) { + if (cmd =3D=3D MSG_INFO) msginfo->msgpool =3D msg_ids(ns).in_use; - msginfo->msgmap =3D atomic_read(&ns->msg_hdrs); - msginfo->msgtql =3D atomic_read(&ns->msg_bytes); + max_idx =3D ipc_get_maxidx(&msg_ids(ns)); + up_read(&msg_ids(ns).rwsem); + if (cmd =3D=3D MSG_INFO) { + msginfo->msgmap =3D percpu_counter_sum(&ns->percpu_msg_hdrs); + msginfo->msgtql =3D percpu_counter_sum(&ns->percpu_msg_bytes); } else { msginfo->msgmap =3D MSGMAP; msginfo->msgpool =3D MSGPOOL; msginfo->msgtql =3D MSGTQL; } - max_idx =3D ipc_get_maxidx(&msg_ids(ns)); - up_read(&msg_ids(ns).rwsem); return (max_idx < 0) ? 0 : max_idx; } =20 @@ -935,8 +937,8 @@ static long do_msgsnd(int msqid, long mtype, void __use= r *mtext, list_add_tail(&msg->m_list, &msq->q_messages); msq->q_cbytes +=3D msgsz; msq->q_qnum++; - atomic_add(msgsz, &ns->msg_bytes); - atomic_inc(&ns->msg_hdrs); + percpu_counter_add_local(&ns->percpu_msg_bytes, msgsz); + percpu_counter_add_local(&ns->percpu_msg_hdrs, 1); } =20 err =3D 0; @@ -1159,8 +1161,8 @@ static long do_msgrcv(int msqid, void __user *buf, si= ze_t bufsz, long msgtyp, in msq->q_rtime =3D ktime_get_real_seconds(); ipc_update_pid(&msq->q_lrpid, task_tgid(current)); msq->q_cbytes -=3D msg->m_ts; - atomic_sub(msg->m_ts, &ns->msg_bytes); - atomic_dec(&ns->msg_hdrs); + percpu_counter_sub_local(&ns->percpu_msg_bytes, msg->m_ts); + percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); ss_wakeup(msq, &wake_q, false); =20 goto out_unlock0; @@ -1297,20 +1299,34 @@ COMPAT_SYSCALL_DEFINE5(msgrcv, int, msqid, compat_u= ptr_t, msgp, } #endif =20 -void msg_init_ns(struct ipc_namespace 
*ns) +int msg_init_ns(struct ipc_namespace *ns) { + int ret; + ns->msg_ctlmax =3D MSGMAX; ns->msg_ctlmnb =3D MSGMNB; ns->msg_ctlmni =3D MSGMNI; =20 - atomic_set(&ns->msg_bytes, 0); - atomic_set(&ns->msg_hdrs, 0); + ret =3D percpu_counter_init(&ns->percpu_msg_bytes, 0, GFP_KERNEL); + if (ret) + goto fail_msg_bytes; + ret =3D percpu_counter_init(&ns->percpu_msg_hdrs, 0, GFP_KERNEL); + if (ret) + goto fail_msg_hdrs; ipc_init_ids(&ns->ids[IPC_MSG_IDS]); + return 0; + + fail_msg_hdrs: + percpu_counter_destroy(&ns->percpu_msg_bytes); + fail_msg_bytes: + return ret; } =20 #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); diff --git a/ipc/namespace.c b/ipc/namespace.c index e1fcaedba4fa..8316ea585733 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -66,8 +66,11 @@ static struct ipc_namespace *create_ipc_ns(struct user_n= amespace *user_ns, if (!setup_ipc_sysctls(ns)) goto fail_mq; =20 + err =3D msg_init_ns(ns); + if (err) + goto fail_put; + sem_init_ns(ns); - msg_init_ns(ns); shm_init_ns(ns); =20 return ns; diff --git a/ipc/util.h b/ipc/util.h index 2dd7ce0416d8..1b0086c6346f 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -64,7 +64,7 @@ static inline void mq_put_mnt(struct ipc_namespace *ns) {= } =20 #ifdef CONFIG_SYSVIPC void sem_init_ns(struct ipc_namespace *ns); -void msg_init_ns(struct ipc_namespace *ns); +int msg_init_ns(struct ipc_namespace *ns); void shm_init_ns(struct ipc_namespace *ns); =20 void sem_exit_ns(struct ipc_namespace *ns); @@ -72,7 +72,7 @@ void msg_exit_ns(struct ipc_namespace *ns); void shm_exit_ns(struct ipc_namespace *ns); #else static inline void sem_init_ns(struct ipc_namespace *ns) { } -static inline void msg_init_ns(struct ipc_namespace *ns) { } +static inline int msg_init_ns(struct ipc_namespace *ns) 
{ return 0;} static inline void shm_init_ns(struct ipc_namespace *ns) { } =20 static inline void sem_exit_ns(struct ipc_namespace *ns) { } --=20 2.31.1