From: Peng Zhang
Subject: [RFC PATCH v2 1/2] percpu_counter: introduce atomic mode for percpu_counter
Date: Thu, 18 Apr 2024 22:20:07 +0800
Message-ID: <20240418142008.2775308-2-zhangpeng362@huawei.com>
In-Reply-To: <20240418142008.2775308-1-zhangpeng362@huawei.com>
References: <20240418142008.2775308-1-zhangpeng362@huawei.com>

From: ZhangPeng

Depending on whether counters is NULL, we can support two modes: atomic
mode and percpu mode. We implement both modes by grouping the s64 count
and the atomic64_t count_atomic in a union. We also add interfaces for
adding to and reading a counter in atomic mode, and for switching a
counter from atomic mode to percpu mode.
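For illustration, a minimal usage sketch of the new interfaces (a
hypothetical caller, not part of this patch; assumes CONFIG_SMP and a
zero-initialized counter, so counters == NULL and the counter starts in
atomic mode):

	/* Zeroed at allocation: counters == NULL, i.e. atomic mode. */
	static struct percpu_counter events;

	void record_event(void)
	{
		if (percpu_counter_initialized(&events))
			percpu_counter_add(&events, 1);		/* percpu fast path */
		else
			percpu_counter_atomic_add(&events, 1);	/* one atomic64_add() */
	}

	/* One-shot promotion once concurrency makes batching worthwhile. */
	int go_concurrent(void)
	{
		return percpu_counter_switch_to_pcpu_many(&events, 1);
	}

	s64 read_events(void)
	{
		if (percpu_counter_initialized(&events))
			return percpu_counter_sum(&events);
		return percpu_counter_atomic_read(&events);
	}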
Suggested-by: Jan Kara
Signed-off-by: ZhangPeng
Signed-off-by: Kefeng Wang
---
 include/linux/percpu_counter.h | 43 +++++++++++++++++++++++++++++++---
 lib/percpu_counter.c           | 31 ++++++++++++++++++++++--
 2 files changed, 69 insertions(+), 5 deletions(-)

diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 3a44dd1e33d2..160f9734c0bb 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -21,7 +21,13 @@

 struct percpu_counter {
 	raw_spinlock_t lock;
-	s64 count;
+	/* Depending on whether counters is NULL, we can support two modes,
+	 * atomic mode using count_atomic and percpu mode using count.
+	 */
+	union {
+		s64 count;
+		atomic64_t count_atomic;
+	};
 #ifdef CONFIG_HOTPLUG_CPU
 	struct list_head list;	/* All percpu_counters are on a list */
 #endif
@@ -32,14 +38,14 @@ extern int percpu_counter_batch;

 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 			       gfp_t gfp, u32 nr_counters,
-			       struct lock_class_key *key);
+			       struct lock_class_key *key, bool switch_mode);

 #define percpu_counter_init_many(fbc, value, gfp, nr_counters)		\
 	({								\
 		static struct lock_class_key __key;			\
 									\
 		__percpu_counter_init_many(fbc, value, gfp, nr_counters,\
-					   &__key);			\
+					   &__key, false);		\
 	})

@@ -130,6 +136,20 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 	return (fbc->counters != NULL);
 }

+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+	return atomic64_read(&fbc->count_atomic);
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+					     s64 amount)
+{
+	atomic64_add(amount, &fbc->count_atomic);
+}
+
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+				       u32 nr_counters);
+
 #else /* !CONFIG_SMP */

 struct percpu_counter {
@@ -260,6 +280,23 @@ static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
 static inline void percpu_counter_sync(struct percpu_counter *fbc)
 {
 }
+
+static inline s64 percpu_counter_atomic_read(struct percpu_counter *fbc)
+{
+	return fbc->count;
+}
+
+static inline void percpu_counter_atomic_add(struct percpu_counter *fbc,
+					     s64 amount)
+{
+	percpu_counter_add(fbc, amount);
+}
+
+static inline int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+						     u32 nr_counters)
+{
+	return 0;
+}
 #endif	/* CONFIG_SMP */

 static inline void percpu_counter_inc(struct percpu_counter *fbc)
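The union is what makes the mode switch below lossless: count and
count_atomic alias the same eight bytes, and (as the lib/percpu_counter.c
hunks that follow show) __percpu_counter_init_many() with switch_mode set
deliberately skips the "fbc[i].count = amount" store, so the value
accumulated in atomic mode is preserved as the percpu-mode base count. A
simplified layout sketch (editorial illustration only, fields elided;
assumes atomic64_t wraps a plain 64-bit value, as in the kernel's
atomic64 implementations):

	struct percpu_counter {
		raw_spinlock_t lock;
		union {				/* same storage, two views */
			s64 count;		/* percpu mode: base value */
			atomic64_t count_atomic;/* atomic mode */
		};
		s32 __percpu *counters;		/* NULL while in atomic mode */
	};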
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 44dd133594d4..95c4e038051a 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -153,7 +153,7 @@ EXPORT_SYMBOL(__percpu_counter_sum);

 int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 			       gfp_t gfp, u32 nr_counters,
-			       struct lock_class_key *key)
+			       struct lock_class_key *key, bool switch_mode)
 {
 	unsigned long flags __maybe_unused;
 	size_t counter_size;
@@ -174,7 +174,8 @@ int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount,
 #ifdef CONFIG_HOTPLUG_CPU
 		INIT_LIST_HEAD(&fbc[i].list);
 #endif
-		fbc[i].count = amount;
+		if (likely(!switch_mode))
+			fbc[i].count = amount;
 		fbc[i].counters = (void *)counters + (i * counter_size);

 		debug_percpu_counter_activate(&fbc[i]);
@@ -357,6 +358,32 @@ bool __percpu_counter_limited_add(struct percpu_counter *fbc,
 	return good;
 }

+/*
+ * percpu_counter_switch_to_pcpu_many: switch percpu_counters from atomic
+ * mode to percpu mode; the mode is re-checked with interrupts disabled.
+ */
+int percpu_counter_switch_to_pcpu_many(struct percpu_counter *fbc,
+				       u32 nr_counters)
+{
+	static struct lock_class_key __key;
+	unsigned long flags;
+	int ret = 0;
+
+	if (percpu_counter_initialized(fbc))
+		return 0;
+
+	preempt_disable();
+	local_irq_save(flags);
+	if (likely(!percpu_counter_initialized(fbc)))
+		ret = __percpu_counter_init_many(fbc, 0,
+				GFP_ATOMIC|__GFP_NOWARN|__GFP_ZERO,
+				nr_counters, &__key, true);
+	local_irq_restore(flags);
+	preempt_enable();
+
+	return ret;
+}
+
 static int __init percpu_counter_startup(void)
 {
 	int ret;
-- 
2.25.1

From: Peng Zhang
Subject: [RFC PATCH v2 2/2] mm: convert mm's rss stats to use atomic mode
Date: Thu, 18 Apr 2024 22:20:08 +0800
Message-ID: <20240418142008.2775308-3-zhangpeng362@huawei.com>
In-Reply-To: <20240418142008.2775308-1-zhangpeng362@huawei.com>
References: <20240418142008.2775308-1-zhangpeng362@huawei.com>

From: ZhangPeng

Since commit f1a7941243c1 ("mm: convert mm's rss stats into
percpu_counter"), the rss stats have been converted to percpu_counter,
which changes the error margin from (nr_threads * 64) to approximately
(nr_cpus ^ 2). However, the new percpu allocation in mm_init() causes a
performance regression on fork/exec/shell. Even after commit
14ef95be6f55 ("kernel/fork: group allocation/free of per-cpu counters
for mm struct"), the performance of fork/exec/shell is still poor
compared to previous kernel versions.
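To put those error margins in perspective (an editorial back-of-the-envelope
calculation, not part of the original posting): before the conversion, each
thread could batch up to 64 rss events, bounding the drift by
nr_threads * 64. With a shared percpu_counter, each CPU can hold up to
percpu_counter_batch unflushed pages, and lib/percpu_counter.c scales that
batch with the CPU count, roughly max(32, nr_cpus * 2), giving

	worst-case drift ~= nr_cpus * percpu_counter_batch
	                 ~= nr_cpus * (2 * nr_cpus)
	                 ~= 2 * nr_cpus^2

e.g. on a 64-CPU machine, 64 * 128 = 8192 pages of possible drift per counter.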
To mitigate the regression, we delay the allocation of percpu memory for
rss_stats and convert mm's rss stats to use percpu_counter's atomic mode.
For single-thread processes, rss_stat stays in atomic mode, which avoids
the memory consumption and the performance regression caused by the percpu
allocation. For multi-thread processes, rss_stat is switched to percpu
mode to reduce the error margin. The switch from atomic mode to percpu
mode happens only when the second thread is created (see the lifecycle
sketch below).

In lmbench tests, we see a 2% ~ 4% performance improvement for
fork_proc/exec_proc/shell_proc and a 6.7% improvement for page_fault
(measured before the batch-mode series [1]). The test results are as
follows:

              base        base+revert       base+this patch

   fork_proc  416.3ms     400.0ms  (3.9%)   398.6ms  (4.2%)
   exec_proc  2095.9ms    2061.1ms (1.7%)   2047.7ms (2.3%)
   shell_proc 3028.2ms    2954.7ms (2.4%)   2961.2ms (2.2%)
   page_fault 0.3603ms    0.3358ms (6.8%)   0.3361ms (6.7%)

[1] https://lore.kernel.org/all/20240412064751.119015-1-wangkefeng.wang@huawei.com/
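To visualize the lifecycle (an editorial summary of the fork paths touched
below, expressed as a comment sketch rather than new code):

	/*
	 * fork():
	 *	mm_init() no longer calls percpu_counter_init_many(), so a
	 *	new, single-threaded mm starts with every rss_stat counter
	 *	in atomic mode (counters == NULL).
	 *
	 * clone(CLONE_THREAD | CLONE_VM | ...):
	 *	when the second thread is created, copy_mm() calls
	 *	mm_counter_switch_to_pcpu_many(), promoting all
	 *	NR_MM_COUNTERS counters at once, or failing the clone with
	 *	-ENOMEM if the percpu allocation fails.
	 *
	 * afterwards:
	 *	add_mm_counter() and friends see percpu_counter_initialized()
	 *	return true and take the percpu fast path.
	 */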
Suggested-by: Jan Kara
Signed-off-by: ZhangPeng
Signed-off-by: Kefeng Wang
---
 include/linux/mm.h          | 50 +++++++++++++++++++++++++++++++------
 include/trace/events/kmem.h |  4 +--
 kernel/fork.c               | 18 +++++++------
 3 files changed, 56 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d261e45bb29b..8f1bfbd54697 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2631,30 +2631,66 @@ static inline bool get_user_page_fast_only(unsigned long addr,
  */
 static inline unsigned long get_mm_counter(struct mm_struct *mm, int member)
 {
-	return percpu_counter_read_positive(&mm->rss_stat[member]);
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_read_positive(fbc);
+
+	return percpu_counter_atomic_read(fbc);
 }

 void mm_trace_rss_stat(struct mm_struct *mm, int member);

 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-	percpu_counter_add(&mm->rss_stat[member], value);
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		percpu_counter_add(fbc, value);
+	else
+		percpu_counter_atomic_add(fbc, value);

 	mm_trace_rss_stat(mm, member);
 }

 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_inc(&mm->rss_stat[member]);
-
-	mm_trace_rss_stat(mm, member);
+	add_mm_counter(mm, member, 1);
 }

 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-	percpu_counter_dec(&mm->rss_stat[member]);
+	add_mm_counter(mm, member, -1);
+}

-	mm_trace_rss_stat(mm, member);
+static inline s64 mm_counter_sum(struct mm_struct *mm, int member)
+{
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_sum(fbc);
+
+	return percpu_counter_atomic_read(fbc);
+}
+
+static inline s64 mm_counter_sum_positive(struct mm_struct *mm, int member)
+{
+	struct percpu_counter *fbc = &mm->rss_stat[member];
+
+	if (percpu_counter_initialized(fbc))
+		return percpu_counter_sum_positive(fbc);
+
+	return percpu_counter_atomic_read(fbc);
+}
+
+static inline int mm_counter_switch_to_pcpu_many(struct mm_struct *mm)
+{
+	return percpu_counter_switch_to_pcpu_many(mm->rss_stat, NR_MM_COUNTERS);
+}
+
+static inline void mm_counter_destroy_many(struct mm_struct *mm)
+{
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 }

 /* Optimized variant when folio is already known not to be anon */

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index 6e62cc64cd92..a4e40ae6a8c8 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -399,8 +399,8 @@ TRACE_EVENT(rss_stat,
 		__entry->mm_id = mm_ptr_to_hash(mm);
 		__entry->curr = !!(current->mm == mm);
 		__entry->member = member;
-		__entry->size = (percpu_counter_sum_positive(&mm->rss_stat[member])
-				<< PAGE_SHIFT);
+		__entry->size = (mm_counter_sum_positive(mm, member)
+				<< PAGE_SHIFT);
 	),

 	TP_printk("mm_id=%u curr=%d type=%s size=%ldB",

diff --git a/kernel/fork.c b/kernel/fork.c
index 99076dbe27d8..0214273798c5 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -823,7 +823,7 @@ static void check_mm(struct mm_struct *mm)
 			 "Please make sure 'struct resident_page_types[]' is updated as well");

 	for (i = 0; i < NR_MM_COUNTERS; i++) {
-		long x = percpu_counter_sum(&mm->rss_stat[i]);
+		long x = mm_counter_sum(mm, i);

 		if (unlikely(x))
 			pr_alert("BUG: Bad rss-counter state mm:%p type:%s val:%ld\n",
@@ -1301,16 +1301,10 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;

-	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT,
-				     NR_MM_COUNTERS))
-		goto fail_pcpu;
-
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;

-fail_pcpu:
-	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
 fail_nocontext:
@@ -1730,6 +1724,16 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
 	if (!oldmm)
 		return 0;

+	/*
+	 * For single-thread processes, rss_stat is in atomic mode, which
+	 * reduces the memory consumption and performance regression caused by
+	 * using percpu. For multiple-thread processes, rss_stat is switched to
+	 * the percpu mode to reduce the error margin.
+	 */
+	if (clone_flags & CLONE_THREAD)
+		if (mm_counter_switch_to_pcpu_many(oldmm))
+			return -ENOMEM;
+
 	if (clone_flags & CLONE_VM) {
 		mmget(oldmm);
 		mm = oldmm;
-- 
2.25.1