From: Peng Zhang
Subject: [RFC PATCH v2 1/2] percpu_counter: introduce atomic mode for percpu_counter
Date: Thu, 18 Apr 2024 22:20:07 +0800
Message-ID: <20240418142008.2775308-2-zhangpeng362@huawei.com>
In-Reply-To: <20240418142008.2775308-1-zhangpeng362@huawei.com>
References: <20240418142008.2775308-1-zhangpeng362@huawei.com>

From: ZhangPeng

Depending on whether counters is NULL, we can support two modes: atomic
mode and percpu mode. We implement both modes by grouping the s64 count
and the atomic64_t count_atomic in a union. We also add interfaces for
adding to and reading a counter in atomic mode, and for switching a
counter from atomic mode to percpu mode.
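For illustration, a minimal usage sketch of the new interfaces (a
hypothetical caller, not part of this patch; assumes CONFIG_SMP and a
zero-initialized counter, so counters == NULL and the counter starts in
atomic mode):

	/* Zeroed at allocation: counters == NULL, i.e. atomic mode. */
	static struct percpu_counter events;

	void record_event(void)
	{
		if (percpu_counter_initialized(&events))
			percpu_counter_add(&events, 1);		/* percpu fast path */
		else
			percpu_counter_atomic_add(&events, 1);	/* one atomic64_add() */
	}

	/* One-shot promotion once concurrency makes batching worthwhile. */
	int go_concurrent(void)
	{
		return percpu_counter_switch_to_pcpu_many(&events, 1);
	}

	s64 read_events(void)
	{
		if (percpu_counter_initialized(&events))
			return percpu_counter_sum(&events);
		return percpu_counter_atomic_read(&events);
	}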
Suggested-by: Jan Kara
Signed-off-by: ZhangPeng
Signed-off-by: Kefeng Wang
---
 include/linux/percpu_counter.h | 43 +++++++++++++++++++++++++++++++---
 lib/percpu_counter.c           | 31 ++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 3a44dd1e33d2..160f9734c0bb 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -21,7 +21,13 @@

 struct percpu_counter {
 	raw_spinlock_t lock;
-	s64 count;
+	/* Depending on whether counters is NULL, we can support two modes,
+	 * atomic mode using count_atomic and percpu mode using count.
+	 */
+	union {
+		s64 count;
+		atomic64_t count_atomic;
+	};
 #ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
 #endif
@@ -32,14 +38,14 @@ extern int percpu_counter_batch;

 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 			       gfp_t gfp, u32 nr_counters,
-			       struct lock_class_key *key);
+			       struct lock_class_key *key, bool switch_mode);

 #define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
 		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
-					   &__key);			\
+					   &__key, false);		\
 	})

@@ -130,6 +136,20 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 	return (fbc->counters != NULL);
 }

+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+	return atomic64_read(&fbc->count_atomic);
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+					     s64 amount)
+{
+	atomic64_add(amount, &fbc->count_atomic);
+}
+
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+				       u32 nr_counters);
+
 #else /* !CONFIG_SMP */

 struct percpu_counter {
@@ -260,6 +280,23 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 static inline void percpu_counter_sync(struct percpu_counter *fbc)
 {
 }
+
+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+	return fbc->count;
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+					     s64 amount)
+{
+	percpu_counter_add(fbc, amount);
+}
+
+static inline int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+						     u32 nr_counters)
+{
+	return 0;
+}
 #endif	/* CONFIG_SMP */

 static inline void percpu_counter_inc(struct percpu_counter *fbc)
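The union is what makes the mode switch below lossless: count and
count_atomic alias the same eight bytes, and (as the lib/percpu_counter.c
hunks that follow show) __percpu_counter_init_many() with switch_mode set
deliberately skips the "fbc[i].count = amount" store, so the value
accumulated in atomic mode is preserved as the percpu-mode base count. A
simplified layout sketch (editorial illustration only, fields elided;
assumes atomic64_t wraps a plain 64-bit value, as in the kernel's
atomic64 implementations):

	struct percpu_counter {
		raw_spinlock_t lock;
		union {				/* same storage, two views */
			s64 count;		/* percpu mode: base value */
			atomic64_t count_atomic;/* atomic mode */
		};
		s32 __percpu *counters;		/* NULL while in atomic mode */
	};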
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 44dd133594d4..95c4e038051a 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 			       gfp_t gfp, u32 nr_counters,
-			       struct lock_class_key *key)
+			       struct lock_class_key *key, bool switch_mode)
 {
 	unsigned long flags __maybe_unused;
 	size_t counter_size;
@@ -174,7 +174,8 @@ int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 #ifdef CONFIG_HOTPLUG_CPU
 		INIT_LIST_HEAD(&fbc[i].list);
 #endif
-		fbc[i].count = amount;
+		if (likely(!switch_mode))
+			fbc[i].count = amount;
 		fbc[i].counters = (void *)counters + (i * counter_size);

 		debug_percpu_counter_activate(&fbc[i]);
@@ -357,6 +358,32 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 	return good;
 }

+/*
+ * percpu_counter_switch_to_pcpu_many: switch percpu_counters from atomic
+ * mode to percpu mode; the mode is re-checked with interrupts disabled.
+ */
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+				       u32 nr_counters)
+{
+	static struct lock_class_key __key;
+	unsigned long flags;
+	int ret = 0;
+
+	if (percpu_counter_initialized(fbc))
+		return 0;
+
+	preempt_disable();
+	local_irq_save(flags);
+	if (likely(!percpu_counter_initialized(fbc)))
+		ret = __percpu_counter_init_many(fbc, 0,
+				GFP_ATOMIC|__GFP_NOWARN|__GFP_ZERO,
+				nr_counters, &__key, true);
+	local_irq_restore(flags);
+	preempt_enable();
+
+	return ret;
+}
+
 static int __init percpu_counter_startup(void)
 {
 	int ret;
-- 
2.25.1

From: Peng Zhang
Subject: [RFC PATCH v2 2/2] mm: convert mm's rss stats to use atomic mode
Date: Thu, 18 Apr 2024 22:20:08 +0800
Message-ID: <20240418142008.2775308-3-zhangpeng362@huawei.com>
In-Reply-To: <20240418142008.2775308-1-zhangpeng362@huawei.com>
References: <20240418142008.2775308-1-zhangpeng362@huawei.com>

From: ZhangPeng

Since commit f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), the rss stats have been converted to percpu_counter,
which changes the error margin from (nr_threads * 64) to approximately
(nr_cpus ^ 2). However, the new percpu allocation in mm_init() causes a
performance regression on fork/exec/shell. Even after commit
14ef95be6f55 ("kernel/fork: group allocation/free of per-cpu counters
for mm struct"), the performance of fork/exec/shell is still poor
compared to previous kernel versions.
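To put those error margins in perspective (an editorial back-of-the-envelope
calculation, not part of the original posting): before the conversion, each
thread could batch up to 64 rss events, bounding the drift by
nr_threads * 64. With a shared percpu_counter, each CPU can hold up to
percpu_counter_batch unflushed pages, and lib/percpu_counter.c scales that
batch with the CPU count, roughly max(32, nr_cpus * 2), giving

	worst-case drift ~= nr_cpus * percpu_counter_batch
	                 ~= nr_cpus * (2 * nr_cpus)
	                 ~= 2 * nr_cpus^2

e.g. on a 64-CPU machine, 64 * 128 = 8192 pages of possible drift per counter.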
To mitigate the regression, we delay the allocation of percpu memory for
rss_stats and convert mm's rss stats to use percpu_counter's atomic mode.
For single-thread processes, rss_stat stays in atomic mode, which avoids
the memory consumption and the performance regression caused by the percpu
allocation. For multi-thread processes, rss_stat is switched to percpu
mode to reduce the error margin. The switch from atomic mode to percpu
mode happens only when the second thread is created (see the lifecycle
sketch below).

In lmbench tests, we see a 2% ~ 4% performance improvement for
fork_proc/exec_proc/shell_proc and a 6.7% improvement for page_fault
(measured before the batch-mode series [1]). The test results are as
follows:

              base        base+revert       base+this patch

   fork_proc  416.3ms     400.0ms  (3.9%)   398.6ms  (4.2%)
   exec_proc  2095.9ms    2061.1ms (1.7%)   2047.7ms (2.3%)
   shell_proc 3028.2ms    2954.7ms (2.4%)   2961.2ms (2.2%)
   page_fault 0.3603ms    0.3358ms (6.8%)   0.3361ms (6.7%)

[1] https://lore.kernel.org/all/20240412064751.119015-1-wangkefeng.wang@huawei.com/
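To visualize the lifecycle (an editorial summary of the fork paths touched
below, expressed as a comment sketch rather than new code):

	/*
	 * fork():
	 *	mm_init() no longer calls percpu_counter_init_many(), so a
	 *	new, single-threaded mm starts with every rss_stat counter
	 *	in atomic mode (counters == NULL).
	 *
	 * clone(CLONE_THREAD | CLONE_VM | ...):
	 *	when the second thread is created, copy_mm() calls
	 *	mm_counter_switch_to_pcpu_many(), promoting all
	 *	NR_MM_COUNTERS counters at once, or failing the clone with
	 *	-ENOMEM if the percpu allocation fails.
	 *
	 * afterwards:
	 *	add_mm_counter() and friends see percpu_counter_initialized()
	 *	return true and take the percpu fast path.
	 */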
Suggested-by: Jan Kara
Signed-off-by: ZhangPeng
Signed-off-by: Kefeng Wang
---
 include/linux/mm.h          | 50 +++++++++++++++++++++++++++++++------
 include/trace/events/kmem.h |  4 +--
 kernel/fork.c               | 18 +++++++------
 3 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d261e45bb29b..8f1bfbd54697 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2631,30 +2631,66 @@ static inline bool get_user_page_fast_only(unsigned long addr,
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-	return percpu_counter_read_positive(&mm->rss_stat[member]);
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_read_positive(fbc);
+
+	return percpu_counter_atomic_read(fbc);
 }

 void mm_trace_rss_stat(struct mm_struct *mm, int member);

 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-	percpu_counter_add(&mm->rss_stat[member], value);
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		percpu_counter_add(fbc, value);
+	else
+		percpu_counter_atomic_add(fbc, value);

 	mm_trace_rss_stat(mm, member);
 }

 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_inc(&mm->rss_stat[member]);
-
-	mm_trace_rss_stat(mm, member);
+	add_mm_counter(mm, member, 1);
 }

 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_dec(&mm->rss_stat[member]);
+	add_mm_counter(mm, member, -1);
+}

-	mm_trace_rss_stat(mm, member);
+static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
+{
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_sum(fbc);
+
+	return percpu_counter_atomic_read(fbc);
+}
+
+static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
+{
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_sum_positive(fbc);
+
+	return percpu_counter_atomic_read(fbc);
+}
+
+static inline int mm_counter_switch_to_pcpu_many(struct mm_struct *mm)
+{
+	return percpu_counter_switch_to_pcpu_many(mm->rss_stat, NR_MM_COUNTERS);
+}
+
+static inline void mm_counter_destroy_many(struct mm_struct *mm)
+{
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 }

 /* Optimized variant when folio is already known not to be anon */

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 6e62cc64cd92..a4e40ae6a8c8 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -399,8 +399,8 @@ TRACE_EVENT(rss_stat,
 		__entry->mm_id = mm_ptr_to_hash(mm);
 		__entry->curr = !!(current->mm == mm);
 		__entry->member = member;
-		__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
-				<< PAGE_SHIFT);
+		__entry->size = (mm_counter_sum_positive(mm, member)
+				<< PAGE_SHIFT);
 	),

 	TP_printk("mm_id=%u curr=%d type=%s size=%ldB",

diff --git a/kernel/fork.c b/kernel/fork.c
index 99076dbe27d8..0214273798c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,7 +823,7 @@ static void check_mm(struct mm_struct *mm)
 			 "Please make sure 'struct resident_page_types[]' is updated as well");

 	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = percpu_counter_sum(&mm->rss_stat[i]);
+		long x = mm_counter_sum(mm, i);

 		if (unlikely(x))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -1301,16 +1301,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;

-	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
-				     NR_MM_COUNTERS))
-		goto fail_pcpu;
-
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;

-fail_pcpu:
-	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
 fail_nocontext:
@@ -1730,6 +1724,16 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldmm)
 		return 0;

+	/*
+	 * For single-thread processes, rss_stat is in atomic mode, which
+	 * reduces the memory consumption and performance regression caused by
+	 * using percpu. For multiple-thread processes, rss_stat is switched to
+	 * the percpu mode to reduce the error margin.
+	 */
+	if (clone_flags & CLONE_THREAD)
+		if (mm_counter_switch_to_pcpu_many(oldmm))
+			return -ENOMEM;
+
 	if (clone_flags & CLONE_VM) {
 		mmget(oldmm);
 		mm = oldmm;
-- 
2.25.1