From nobody Wed Sep 17 07:50:33 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1913AC4332F for ; Tue, 20 Dec 2022 18:49:33 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S230075AbiLTStb (ORCPT ); Tue, 20 Dec 2022 13:49:31 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:40022 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S234183AbiLTSs5 (ORCPT ); Tue, 20 Dec 2022 13:48:57 -0500 Received: from out2.migadu.com (out2.migadu.com [IPv6:2001:41d0:2:aacc::]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id C216C1AD85 for ; Tue, 20 Dec 2022 10:48:43 -0800 (PST) X-Report-Abuse: Please report any abuse attempt to abuse@migadu.com and include these headers. DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linux.dev; s=key1; t=1671562122; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version: content-transfer-encoding:content-transfer-encoding; bh=/dnH9RYPPLTjIw2WCUFM/32RZYGdnaOT5UpnmSHjE2I=; b=rCkLvIPOdnRkbm0tFZupJXUJI8sv+NBbuWllfWz8+rNqUpkljcDxqur1kaMOev6cPnazSk fDT+xPF21vfQy/j2+NPOhG/0Yr3VG387Ua09ZRVvhcUsQPFSwkhkd7hH7ZNKucP+0lcbm3 Xa4MbUYLMxe7awX+pT6KdT0gmZvRGgE= From: Roman Gushchin To: linux-mm@kvack.org Cc: linux-kernel@vger.kernel.org, Shakeel Butt , Johannes Weiner , Michal Hocko , Muchun Song , Andrew Morton , Waiman Long , Roman Gushchin , Sven Luther Subject: [PATCH RFC] ipc/mqueue: introduce msg cache Date: Tue, 20 Dec 2022 10:48:13 -0800 Message-Id: <20221220184813.1908318-1-roman.gushchin@linux.dev> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable X-Migadu-Flow: FLOW_OUT Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" Sven Luther reported a 
regression in the posix message queues performance caused by switching to the per-object tracking of slab objects introduced by patch series ending with the commit 10befea91b61 ("mm: memcg/slab: use a single set of kmem_caches for a= ll allocations"). To mitigate the regression, cache allocated mqueue messages on a small percpu cache instead of releasing and re-allocating them every time. This change brings the performance measured by a benchmark kindly provided by Sven [1] almost back to the original (or even better) numbers. Measurement results are also provided by Sven. +------------+---------------------------------+---------------------------= -----+ | kernel | mqueue_rcv (ns) variation | mqueue_send (ns) variati= on | | version | min avg max min avg | min avg max min = avg | +------------+--------------------------+----------------------------------= -----+ | 4.18.45 | 351 382 17533 base base | 383 410 13178 base = base | | 5.8-good | 380 392 7156 -7,63% -2,55% | 376 384 6225 1,86% 6= ,77% | | 5.8-bad | 524 530 5310 -33,02% -27,92% | 512 519 8775 -25,20% -21= ,00% | | 5.10 | 520 533 4078 -32,20% -28,33% | 518 534 8108 -26,06% -23= ,22% | | 5.15 | 431 444 8440 -18,56% -13,96% | 425 437 6170 -9,88% -6= ,18% | | 6.0.3 | 474 614 3881 -25,95% -37,79% | 482 693 931 -20,54% -40= ,84% | | 6.1-rc8 | 496 509 8804 -29,23% -24,95% | 493 512 5748 -22,31% -19= ,92% | | 6.1-rc8+p | 392 397 5479 -10,46% -3,78% | 364 369 10776 5,22% 11= ,11% | +------------+---------------------------------+---------------------------= -----+ The last line reflects the result with this patch ("6.1-rc8+p"). 
[1]: https://lore.kernel.org/linux-mm/Y46lqCToUa%2FBgt%2Fc@P9FQF9L96D/T/ Reported-by: Sven Luther Tested-by: Sven Luther Signed-off-by: Roman Gushchin --- include/linux/memcontrol.h | 12 +++++ ipc/mqueue.c | 20 ++++++-- ipc/msg.c | 12 ++--- ipc/msgutil.c | 101 +++++++++++++++++++++++++++++++++---- ipc/util.h | 8 ++- mm/memcontrol.c | 62 +++++++++++++++++++++++ 6 files changed, 194 insertions(+), 21 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e1644a24009c..8b7f00d562a2 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1734,8 +1734,10 @@ bool mem_cgroup_kmem_disabled(void); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); =20 +struct obj_cgroup *obj_cgroup_from_current(void); struct obj_cgroup *get_obj_cgroup_from_current(void); struct obj_cgroup *get_obj_cgroup_from_page(struct page *page); +struct obj_cgroup *get_obj_cgroup_from_slab_obj(void *p); =20 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); @@ -1813,11 +1815,21 @@ static inline void __memcg_kmem_uncharge_page(struc= t page *page, int order) { } =20 +static inline struct obj_cgroup *obj_cgroup_from_current(void) +{ + return NULL; +} + static inline struct obj_cgroup *get_obj_cgroup_from_page(struct page *pag= e) { return NULL; } =20 +static inline struct obj_cgroup *get_obj_cgroup_from_slab_obj(void *p) +{ + return NULL; +} + static inline bool memcg_kmem_enabled(void) { return false; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 467a194b8a2e..75bf83a8a36f 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -131,6 +131,11 @@ struct ext_wait_queue { /* queue of sleeping tasks */ int state; /* one of STATE_* values */ }; =20 +struct pcpu_msg_cache; +struct msg_cache { + struct pcpu_msg_cache __percpu *pcpu_cache; +}; + struct mqueue_inode_info { spinlock_t lock; struct 
inode vfs_inode; @@ -152,6 +157,8 @@ struct mqueue_inode_info { /* for tasks waiting for free space and messages, respectively */ struct ext_wait_queue e_wait_q[2]; =20 + struct msg_cache msg_cache; + unsigned long qsize; /* size of queue in memory (sum of all msgs) */ }; =20 @@ -368,6 +375,9 @@ static struct inode *mqueue_get_inode(struct super_bloc= k *sb, mq_bytes =3D info->attr.mq_maxmsg * info->attr.mq_msgsize; if (mq_bytes + mq_treesize < mq_bytes) goto out_inode; + ret =3D init_msg_cache(&info->msg_cache); + if (ret) + goto out_inode; mq_bytes +=3D mq_treesize; info->ucounts =3D get_ucounts(current_ucounts()); if (info->ucounts) { @@ -531,9 +541,11 @@ static void mqueue_evict_inode(struct inode *inode) =20 list_for_each_entry_safe(msg, nmsg, &tmp_msg, m_list) { list_del(&msg->m_list); - free_msg(msg); + free_msg(msg, NULL); } =20 + free_msg_cache(&info->msg_cache); + if (info->ucounts) { unsigned long mq_bytes, mq_treesize; =20 @@ -1108,7 +1120,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __= user *u_msg_ptr, =20 /* First try to allocate memory, before doing anything with * existing queues. 
*/ - msg_ptr =3D load_msg(u_msg_ptr, msg_len); + msg_ptr =3D load_msg(u_msg_ptr, msg_len, &info->msg_cache); if (IS_ERR(msg_ptr)) { ret =3D PTR_ERR(msg_ptr); goto out_fput; @@ -1170,7 +1182,7 @@ static int do_mq_timedsend(mqd_t mqdes, const char __= user *u_msg_ptr, wake_up_q(&wake_q); out_free: if (ret) - free_msg(msg_ptr); + free_msg(msg_ptr, &info->msg_cache); out_fput: fdput(f); out: @@ -1273,7 +1285,7 @@ static int do_mq_timedreceive(mqd_t mqdes, char __use= r *u_msg_ptr, store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { ret =3D -EFAULT; } - free_msg(msg_ptr); + free_msg(msg_ptr, &info->msg_cache); } out_fput: fdput(f); diff --git a/ipc/msg.c b/ipc/msg.c index fd08b3cb36d7..fcc09f848490 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -287,7 +287,7 @@ static void freeque(struct ipc_namespace *ns, struct ke= rn_ipc_perm *ipcp) =20 list_for_each_entry_safe(msg, t, &msq->q_messages, m_list) { percpu_counter_sub_local(&ns->percpu_msg_hdrs, 1); - free_msg(msg); + free_msg(msg, NULL); } percpu_counter_sub_local(&ns->percpu_msg_bytes, msq->q_cbytes); ipc_update_pid(&msq->q_lspid, NULL); @@ -861,7 +861,7 @@ static long do_msgsnd(int msqid, long mtype, void __use= r *mtext, if (mtype < 1) return -EINVAL; =20 - msg =3D load_msg(mtext, msgsz); + msg =3D load_msg(mtext, msgsz, NULL); if (IS_ERR(msg)) return PTR_ERR(msg); =20 @@ -954,7 +954,7 @@ static long do_msgsnd(int msqid, long mtype, void __use= r *mtext, out_unlock1: rcu_read_unlock(); if (msg !=3D NULL) - free_msg(msg); + free_msg(msg, NULL); return err; } =20 @@ -1049,7 +1049,7 @@ static inline struct msg_msg *prepare_copy(void __use= r *buf, size_t bufsz) /* * Create dummy message to copy real message to. 
*/ - copy =3D load_msg(buf, bufsz); + copy =3D load_msg(buf, bufsz, NULL); if (!IS_ERR(copy)) copy->m_ts =3D bufsz; return copy; @@ -1058,7 +1058,7 @@ static inline struct msg_msg *prepare_copy(void __use= r *buf, size_t bufsz) static inline void free_copy(struct msg_msg *copy) { if (copy) - free_msg(copy); + free_msg(copy, NULL); } #else static inline struct msg_msg *prepare_copy(void __user *buf, size_t bufsz) @@ -1256,7 +1256,7 @@ static long do_msgrcv(int msqid, void __user *buf, si= ze_t bufsz, long msgtyp, in } =20 bufsz =3D msg_handler(buf, msg, bufsz); - free_msg(msg); + free_msg(msg, NULL); =20 return bufsz; } diff --git a/ipc/msgutil.c b/ipc/msgutil.c index d0a0e877cadd..8667708fc00a 100644 --- a/ipc/msgutil.c +++ b/ipc/msgutil.c @@ -15,6 +15,7 @@ #include #include #include +#include =20 #include "util.h" =20 @@ -39,16 +40,76 @@ struct msg_msgseg { /* the next part of the message follows immediately */ }; =20 +struct pcpu_msg_cache { + struct msg_msg *msg; + struct obj_cgroup *objcg; + size_t len; +}; + +struct msg_cache { + struct pcpu_msg_cache __percpu *pcpu_cache; +}; + #define DATALEN_MSG ((size_t)PAGE_SIZE-sizeof(struct msg_msg)) #define DATALEN_SEG ((size_t)PAGE_SIZE-sizeof(struct msg_msgseg)) =20 +int init_msg_cache(struct msg_cache *cache) +{ + cache->pcpu_cache =3D alloc_percpu(struct pcpu_msg_cache); + if (!cache->pcpu_cache) + return -ENOMEM; + + return 0; +} =20 -static struct msg_msg *alloc_msg(size_t len) +void free_msg_cache(struct msg_cache *cache) +{ + int cpu; + + if (!cache->pcpu_cache) + return; + + for_each_possible_cpu(cpu) { + struct pcpu_msg_cache *pc =3D per_cpu_ptr(cache->pcpu_cache, cpu); + + if (pc->msg) { + if (pc->objcg) + obj_cgroup_put(pc->objcg); + free_msg(pc->msg, NULL); + } + } + + free_percpu(cache->pcpu_cache); +} + +static struct msg_msg *alloc_msg(size_t len, struct msg_cache *cache) { struct msg_msg *msg; struct msg_msgseg **pseg; size_t alen; =20 + if (cache) { + struct pcpu_msg_cache *pc; + + msg =3D NULL; + pc 
=3D get_cpu_ptr(cache->pcpu_cache); + if (pc->msg && pc->len =3D=3D len) { + struct obj_cgroup *objcg; + + rcu_read_lock(); + objcg =3D obj_cgroup_from_current(); + if (objcg =3D=3D pc->objcg) { + msg =3D pc->msg; + pc->msg =3D NULL; + obj_cgroup_put(pc->objcg); + } + rcu_read_unlock(); + } + put_cpu_ptr(cache->pcpu_cache); + if (msg) + return msg; + } + alen =3D min(len, DATALEN_MSG); msg =3D kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT); if (msg =3D=3D NULL) @@ -77,18 +138,19 @@ static struct msg_msg *alloc_msg(size_t len) return msg; =20 out_err: - free_msg(msg); + free_msg(msg, NULL); return NULL; } =20 -struct msg_msg *load_msg(const void __user *src, size_t len) +struct msg_msg *load_msg(const void __user *src, size_t len, + struct msg_cache *cache) { struct msg_msg *msg; struct msg_msgseg *seg; int err =3D -EFAULT; size_t alen; =20 - msg =3D alloc_msg(len); + msg =3D alloc_msg(len, cache); if (msg =3D=3D NULL) return ERR_PTR(-ENOMEM); =20 @@ -104,14 +166,16 @@ struct msg_msg *load_msg(const void __user *src, size= _t len) goto out_err; } =20 - err =3D security_msg_msg_alloc(msg); - if (err) - goto out_err; + if (!msg->security) { + err =3D security_msg_msg_alloc(msg); + if (err) + goto out_err; + } =20 return msg; =20 out_err: - free_msg(msg); + free_msg(msg, NULL); return ERR_PTR(err); } #ifdef CONFIG_CHECKPOINT_RESTORE @@ -166,10 +230,29 @@ int store_msg(void __user *dest, struct msg_msg *msg,= size_t len) return 0; } =20 -void free_msg(struct msg_msg *msg) +void free_msg(struct msg_msg *msg, struct msg_cache *cache) { struct msg_msgseg *seg; =20 + if (cache) { + struct pcpu_msg_cache *pc; + bool cached =3D false; + + pc =3D get_cpu_ptr(cache->pcpu_cache); + if (!pc->msg) { + pc->objcg =3D get_obj_cgroup_from_slab_obj(msg); + pc->len =3D msg->m_ts; + pc->msg =3D msg; + + if (pc->objcg) + cached =3D true; + } + put_cpu_ptr(cache->pcpu_cache); + + if (cached) + return; + } + security_msg_msg_free(msg); =20 seg =3D msg->next; diff --git a/ipc/util.h 
b/ipc/util.h index b2906e366539..a2da266386aa 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -196,8 +196,12 @@ static inline void ipc_update_pid(struct pid **pos, st= ruct pid *pid) int ipc_parse_version(int *cmd); #endif =20 -extern void free_msg(struct msg_msg *msg); -extern struct msg_msg *load_msg(const void __user *src, size_t len); +struct msg_cache; +extern int init_msg_cache(struct msg_cache *cache); +extern void free_msg_cache(struct msg_cache *cache); +extern void free_msg(struct msg_msg *msg, struct msg_cache *cache); +extern struct msg_msg *load_msg(const void __user *src, size_t len, + struct msg_cache *cache); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); =20 diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1a35c12635e..28528b4da0fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3004,6 +3004,28 @@ static struct obj_cgroup *__get_obj_cgroup_from_memc= g(struct mem_cgroup *memcg) return objcg; } =20 +__always_inline struct obj_cgroup *obj_cgroup_from_current(void) +{ + struct obj_cgroup *objcg =3D NULL; + struct mem_cgroup *memcg; + + if (memcg_kmem_bypass()) + return NULL; + + if (unlikely(active_memcg())) + memcg =3D active_memcg(); + else + memcg =3D mem_cgroup_from_task(current); + + for (; memcg !=3D root_mem_cgroup; memcg =3D parent_mem_cgroup(memcg)) { + objcg =3D rcu_dereference(memcg->objcg); + if (likely(objcg)) + return objcg; + } + + return NULL; +} + __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) { struct obj_cgroup *objcg =3D NULL; @@ -3046,6 +3068,46 @@ struct obj_cgroup *get_obj_cgroup_from_page(struct p= age *page) return objcg; } =20 +/* + * A passed kernel object must be a slab object or a generic kernel page. + * Not suitable for objects, allocated using vmalloc(). 
+ */ +struct obj_cgroup *get_obj_cgroup_from_slab_obj(void *p) +{ + struct obj_cgroup *objcg =3D NULL; + struct folio *folio; + + if (mem_cgroup_disabled()) + return NULL; + + folio =3D virt_to_folio(p); + /* + * Slab object can be either a true slab object, which are accounted + * individually with objcg pointers stored in a separate objcg array, + * or it can be a generic folio with MEMCG_DATA_KMEM flag set. + */ + if (folio_test_slab(folio)) { + struct obj_cgroup **objcgs; + struct slab *slab; + unsigned int off; + + slab =3D folio_slab(folio); + objcgs =3D slab_objcgs(slab); + if (!objcgs) + return NULL; + + off =3D obj_to_index(slab->slab_cache, slab, p); + objcg =3D objcgs[off]; + } else { + objcg =3D __folio_objcg(folio); + } + + if (objcg) + obj_cgroup_get(objcg); + + return objcg; +} + static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages) { mod_memcg_state(memcg, MEMCG_KMEM, nr_pages); --=20 2.39.0