[PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct

Mateusz Guzik posted 2 patches 2 years, 3 months ago
[PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct
Posted by Mateusz Guzik 2 years, 3 months ago
A trivial execve scalability test which tries to be very friendly
(statically linked binaries, all separate) is predominantly bottlenecked
by back-to-back per-cpu counter allocations which serialize on global
locks.

Ease the pain by allocating and freeing them in one go.

Bench can be found here:
http://apollo.backplane.com/DFlyMisc/doexec.c

$ cc -static -O2 -o static-doexec doexec.c
$ ./static-doexec $(nproc)

Even at a very modest scale of 26 cores (ops/s):
before:	133543.63
after:	186061.81 (+39%)

While these allocations remain a significant problem even with the patch
applied, the primary bottleneck shifts to page release handling.

Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>
---
 kernel/fork.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index d2e12b6d2b18..4f0ada33457e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
  */
 void __mmdrop(struct mm_struct *mm)
 {
-	int i;
-
 	BUG_ON(mm == &init_mm);
 	WARN_ON_ONCE(mm == current->mm);
 
@@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
 	put_user_ns(mm->user_ns);
 	mm_pasid_drop(mm);
 	mm_destroy_cid(mm);
+	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		percpu_counter_destroy(&mm->rss_stat[i]);
 	free_mm(mm);
 }
 EXPORT_SYMBOL_GPL(__mmdrop);
@@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
 static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	struct user_namespace *user_ns)
 {
-	int i;
-
 	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
 	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
 	atomic_set(&mm->mm_users, 1);
@@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
 	if (mm_alloc_cid(mm))
 		goto fail_cid;
 
-	for (i = 0; i < NR_MM_COUNTERS; i++)
-		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
-			goto fail_pcpu;
+	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
+		goto fail_pcpu;
 
 	mm->user_ns = get_user_ns(user_ns);
 	lru_gen_init_mm(mm);
 	return mm;
 
 fail_pcpu:
-	while (i > 0)
-		percpu_counter_destroy(&mm->rss_stat[--i]);
 	mm_destroy_cid(mm);
 fail_cid:
 	destroy_context(mm);
-- 
2.41.0
Re: [PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct
Posted by kernel test robot 2 years, 3 months ago

Hello,

kernel test robot noticed a -8.2% improvement of phoronix-test-suite.osbench.LaunchPrograms.us_per_event on:


commit: 9d32938c115580bfff128a926d704199d2f33ba3 ("[PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct")
url: https://github.com/intel-lab-lkp/linux/commits/Mateusz-Guzik/pcpcntr-add-group-allocation-free/20230823-130803
base: https://git.kernel.org/cgit/linux/kernel/git/dennis/percpu.git for-next
patch link: https://lore.kernel.org/all/20230823050609.2228718-3-mjguzik@gmail.com/
patch subject: [PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct

testcase: phoronix-test-suite
test machine: 96 threads 2 sockets Intel(R) Xeon(R) Gold 6252 CPU @ 2.10GHz (Cascade Lake) with 512G memory
parameters:

	test: osbench-1.0.2
	option_a: Launch Programs
	cpufreq_governor: performance






Details are as below:
-------------------------------------------------------------------------------------------------->


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20230906/202309061504.7e645826-oliver.sang@intel.com

=========================================================================================
compiler/cpufreq_governor/kconfig/option_a/rootfs/tbox_group/test/testcase:
  gcc-12/performance/x86_64-rhel-8.3/Launch Programs/debian-x86_64-phoronix/lkp-csl-2sp7/osbench-1.0.2/phoronix-test-suite

commit: 
  1db50472c8 ("pcpcntr: add group allocation/free")
  9d32938c11 ("kernel/fork: group allocation/free of per-cpu counters for mm struct")

1db50472c8bc1d34 9d32938c115580bfff128a926d7 
---------------- --------------------------- 
         %stddev     %change         %stddev
             \          |                \  
      3.00           +33.3%       4.00        vmstat.procs.r
     14111            +5.7%      14918        vmstat.system.cs
      2114            +1.1%       2136        turbostat.Bzy_MHz
      1.67            +0.2        1.83        turbostat.C1E%
    121.98            +5.1%     128.24        turbostat.PkgWatt
     98.05            -8.2%      90.02        phoronix-test-suite.osbench.LaunchPrograms.us_per_event
     16246 ±  4%      +6.1%      17243        phoronix-test-suite.time.involuntary_context_switches
   9791476            +9.2%   10689455        phoronix-test-suite.time.minor_page_faults
    311.33            +9.3%     340.33        phoronix-test-suite.time.percent_of_cpu_this_job_got
     83.40 ±  2%      +9.2%      91.07 ±  2%  phoronix-test-suite.time.system_time
    151333            +8.6%     164355        phoronix-test-suite.time.voluntary_context_switches
      3225            -5.5%       3046 ±  5%  proc-vmstat.nr_page_table_pages
   9150454            +8.0%    9884178        proc-vmstat.numa_hit
   9088660            +8.7%    9882518        proc-vmstat.numa_local
   9971116            +8.3%   10802925        proc-vmstat.pgalloc_normal
  10202032            +8.8%   11099649        proc-vmstat.pgfault
   9845338            +8.4%   10676360        proc-vmstat.pgfree
    207049           +10.3%     228380 ±  8%  proc-vmstat.pgreuse
 1.947e+09            +5.0%  2.045e+09        perf-stat.i.branch-instructions
  52304206            +4.4%   54610501        perf-stat.i.branch-misses
      9.06 ±  2%      +0.5        9.52        perf-stat.i.cache-miss-rate%
  19663522 ±  3%     +10.0%   21634645        perf-stat.i.cache-misses
 1.658e+08            +3.6%  1.717e+08        perf-stat.i.cache-references
     14769            +6.2%      15691        perf-stat.i.context-switches
 1.338e+10            +6.2%   1.42e+10        perf-stat.i.cpu-cycles
   3112873 ±  3%     -12.5%    2724690 ±  3%  perf-stat.i.dTLB-load-misses
 2.396e+09            +5.5%  2.528e+09        perf-stat.i.dTLB-loads
      0.11 ±  4%      -0.0        0.10 ±  2%  perf-stat.i.dTLB-store-miss-rate%
   1003394 ±  6%     -14.0%     862768 ±  5%  perf-stat.i.dTLB-store-misses
  1.25e+09            +6.0%  1.325e+09        perf-stat.i.dTLB-stores
     71.16            -1.3       69.88        perf-stat.i.iTLB-load-miss-rate%
   1872082            +8.2%    2025999        perf-stat.i.iTLB-loads
 9.606e+09            +5.4%  1.012e+10        perf-stat.i.instructions
     23.37 ±  5%     +30.6%      30.53 ±  4%  perf-stat.i.major-faults
      0.14            +6.2%       0.15        perf-stat.i.metric.GHz
     59.39            +5.4%      62.61        perf-stat.i.metric.M/sec
    249517           +10.0%     274572        perf-stat.i.minor-faults
   5081285            +6.0%    5385686 ±  4%  perf-stat.i.node-load-misses
    565117 ±  3%      +8.1%     610682 ±  3%  perf-stat.i.node-loads
    249541           +10.0%     274602        perf-stat.i.page-faults
     17.27            -1.7%      16.98        perf-stat.overall.MPKI
     11.85 ±  2%      +0.7       12.59        perf-stat.overall.cache-miss-rate%
      0.13 ±  2%      -0.0        0.11 ±  2%  perf-stat.overall.dTLB-load-miss-rate%
      0.08 ±  7%      -0.0        0.07 ±  4%  perf-stat.overall.dTLB-store-miss-rate%
     67.26            -1.1       66.12        perf-stat.overall.iTLB-load-miss-rate%
 1.895e+09            +5.0%   1.99e+09        perf-stat.ps.branch-instructions
  50921385            +4.4%   53146828        perf-stat.ps.branch-misses
  19140130 ±  3%     +10.0%   21047707        perf-stat.ps.cache-misses
 1.615e+08            +3.5%  1.672e+08        perf-stat.ps.cache-references
     14376            +6.2%      15266        perf-stat.ps.context-switches
 1.303e+10            +6.1%  1.383e+10        perf-stat.ps.cpu-cycles
   3033019 ±  3%     -12.5%    2654269 ±  3%  perf-stat.ps.dTLB-load-misses
 2.332e+09            +5.5%   2.46e+09        perf-stat.ps.dTLB-loads
    976773 ±  6%     -14.1%     839517 ±  5%  perf-stat.ps.dTLB-store-misses
 1.217e+09            +6.0%  1.289e+09        perf-stat.ps.dTLB-stores
   1822198            +8.2%    1971115        perf-stat.ps.iTLB-loads
 9.349e+09            +5.3%  9.846e+09        perf-stat.ps.instructions
     22.75 ±  5%     +30.5%      29.69 ±  4%  perf-stat.ps.major-faults
    242831           +10.0%     267074        perf-stat.ps.minor-faults
   4945101            +5.9%    5238638 ±  4%  perf-stat.ps.node-load-misses
    550029 ±  3%      +8.0%     594116 ±  3%  perf-stat.ps.node-loads
    242854           +10.0%     267104        perf-stat.ps.page-faults
 3.719e+11            +4.4%  3.883e+11        perf-stat.total.instructions




Disclaimer:
Results have been estimated based on internal Intel analysis and are provided
for informational purposes only. Any difference in system hardware or software
design or configuration may affect actual performance.


-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 2/2] kernel/fork: group allocation/free of per-cpu counters for mm struct
Posted by Dennis Zhou 2 years, 3 months ago
On Wed, Aug 23, 2023 at 07:06:09AM +0200, Mateusz Guzik wrote:
> A trivial execve scalability test which tries to be very friendly
> (statically linked binaries, all separate) is predominantly bottlenecked
> by back-to-back per-cpu counter allocations which serialize on global
> locks.
> 
> Ease the pain by allocating and freeing them in one go.
> 
> Bench can be found here:
> http://apollo.backplane.com/DFlyMisc/doexec.c
> 
> $ cc -static -O2 -o static-doexec doexec.c
> $ ./static-doexec $(nproc)
> 
> Even at a very modest scale of 26 cores (ops/s):
> before:	133543.63
> after:	186061.81 (+39%)
> 
> While these allocations remain a significant problem even with the patch
> applied, the primary bottleneck shifts to page release handling.
> 
> Signed-off-by: Mateusz Guzik <mjguzik@gmail.com>

Same message as for 1/2. I'm happy with this, just a minor reflow. I'll
take this for-6.6 unless there are other comments / objections to that.

I'll run a few tests myself too tomorrow just for validation.

Reviewed-by: Dennis Zhou <dennis@kernel.org>

Thanks,
Dennis

> ---
>  kernel/fork.c | 14 +++-----------
>  1 file changed, 3 insertions(+), 11 deletions(-)
> 
> diff --git a/kernel/fork.c b/kernel/fork.c
> index d2e12b6d2b18..4f0ada33457e 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm)
>   */
>  void __mmdrop(struct mm_struct *mm)
>  {
> -	int i;
> -
>  	BUG_ON(mm == &init_mm);
>  	WARN_ON_ONCE(mm == current->mm);
>  
> @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm)
>  	put_user_ns(mm->user_ns);
>  	mm_pasid_drop(mm);
>  	mm_destroy_cid(mm);
> +	percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS);
>  
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		percpu_counter_destroy(&mm->rss_stat[i]);
>  	free_mm(mm);
>  }
>  EXPORT_SYMBOL_GPL(__mmdrop);
> @@ -1252,8 +1249,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm)
>  static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	struct user_namespace *user_ns)
>  {
> -	int i;
> -
>  	mt_init_flags(&mm->mm_mt, MM_MT_FLAGS);
>  	mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock);
>  	atomic_set(&mm->mm_users, 1);
> @@ -1301,17 +1296,14 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
>  	if (mm_alloc_cid(mm))
>  		goto fail_cid;
>  
> -	for (i = 0; i < NR_MM_COUNTERS; i++)
> -		if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
> -			goto fail_pcpu;
> +	if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, NR_MM_COUNTERS))
> +		goto fail_pcpu;
>  
>  	mm->user_ns = get_user_ns(user_ns);
>  	lru_gen_init_mm(mm);
>  	return mm;
>  
>  fail_pcpu:
> -	while (i > 0)
> -		percpu_counter_destroy(&mm->rss_stat[--i]);
>  	mm_destroy_cid(mm);
>  fail_cid:
>  	destroy_context(mm);
> -- 
> 2.41.0
>