From nobody Tue Dec 16 11:46:11 2025
From: Yicong Yang
Subject: [PATCH v11 1/3] sched: Add cpus_share_resources API
Date: Thu, 19 Oct 2023 11:33:21 +0800
Message-ID: <20231019033323.54147-2-yangyicong@huawei.com>
In-Reply-To: <20231019033323.54147-1-yangyicong@huawei.com>
References: <20231019033323.54147-1-yangyicong@huawei.com>
X-Mailing-List: linux-kernel@vger.kernel.org

From: Barry Song

Add the cpus_share_resources() API. This is preparation for optimizing
select_idle_cpu() on platforms with a cluster scheduler level.

On a machine with clusters, cpus_share_resources() tests whether two
CPUs are within the same cluster. On a non-cluster machine it behaves
the same as cpus_share_cache(). "Resources" here therefore refers to
cache resources.
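As a rough illustration (an editor's sketch, not part of the diff below),
a wakeup-path caller could use the new API as a stricter affinity test
than cpus_share_cache(); pick_wakeup_cpu(), "prev" and "target" are
hypothetical names:

	/*
	 * Sketch only: prefer a candidate CPU that shares cluster-level
	 * resources (LLC tag or L2) with the target, then fall back to
	 * plain LLC sharing, then to the target itself.
	 */
	static int pick_wakeup_cpu(int prev, int target)
	{
		if (cpus_share_resources(prev, target))
			return prev;	/* same cluster: cheapest wakeup */

		if (cpus_share_cache(prev, target))
			return prev;	/* same LLC: still a good choice */

		return target;		/* no sharing: use the target */
	}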
Tested-by: K Prateek Nayak
Signed-off-by: Barry Song
Signed-off-by: Yicong Yang
Reviewed-by: Gautham R. Shenoy
Reviewed-by: Tim Chen
Reviewed-by: Vincent Guittot
---
 include/linux/sched/sd_flags.h |  7 +++++++
 include/linux/sched/topology.h |  8 +++++++-
 kernel/sched/core.c            | 12 ++++++++++++
 kernel/sched/sched.h           |  1 +
 kernel/sched/topology.c        | 13 +++++++++++++
 5 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index fad77b5172e2..a8b28647aafc 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
  */
 SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS)
 
+/*
+ * Domain members share CPU cluster (LLC tags or L2 cache)
+ *
+ * NEEDS_GROUPS: Clusters are shared between groups.
+ */
+SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS)
+
 /*
  * Domain members share CPU package resources (i.e. caches)
  *
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index 67b573d5bf28..4c14fe127223 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void)
 #ifdef CONFIG_SCHED_CLUSTER
 static inline int cpu_cluster_flags(void)
 {
-	return SD_SHARE_PKG_RESOURCES;
+	return SD_CLUSTER | SD_SHARE_PKG_RESOURCES;
 }
 #endif
 
@@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
 
 bool cpus_share_cache(int this_cpu, int that_cpu);
+bool cpus_share_resources(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
@@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu)
 	return true;
 }
 
+static inline bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	return true;
+}
+
 #endif /* !CONFIG_SMP */
 
 #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 264c2eb380d7..562b27ced328 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
 	return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 }
 
+/*
+ * Whether CPUs share cache resources, which means the LLC on non-cluster
+ * machines and the LLC tag or L2 on machines with clusters.
+ */
+bool cpus_share_resources(int this_cpu, int that_cpu)
+{
+	if (this_cpu == that_cpu)
+		return true;
+
+	return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu);
+}
+
 static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
 {
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 65cad0e5729e..998f03d02de0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DECLARE_PER_CPU(int, sd_llc_size);
 DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(int, sd_share_id);
 DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index a63729f87c21..dbb8c328e8ad 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd)
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc);
 DEFINE_PER_CPU(int, sd_llc_size);
 DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(int, sd_share_id);
 DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
@@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu)
 	per_cpu(sd_llc_id, cpu) = id;
 	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
 
+	sd = lowest_flag_domain(cpu, SD_CLUSTER);
+	if (sd)
+		id = cpumask_first(sched_domain_span(sd));
+
+	/*
+	 * This assignment should be placed after sd_llc_id because we
+	 * want this id to equal the cluster id on cluster machines but
+	 * the LLC id on non-cluster machines.
+	 */
+	per_cpu(sd_share_id, cpu) = id;
+
 	sd = lowest_flag_domain(cpu, SD_NUMA);
 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
 
@@ -1550,6 +1562,7 @@ static struct cpumask ***sched_domains_numa_masks;
  */
 #define TOPOLOGY_SD_FLAGS		\
 	(SD_SHARE_CPUCAPACITY	|	\
+	 SD_CLUSTER		|	\
	 SD_SHARE_PKG_RESOURCES |	\
	 SD_NUMA		|	\
	 SD_ASYM_PACKING)
-- 
2.24.0

From nobody Tue Dec 16 11:46:11 2025
From: Yicong Yang
Subject: [PATCH v11 2/3] sched/fair: Scan cluster before scanning LLC in wake-up path
Date: Thu, 19 Oct 2023 11:33:22 +0800
Message-ID: <20231019033323.54147-3-yangyicong@huawei.com>
In-Reply-To: <20231019033323.54147-1-yangyicong@huawei.com>
References: <20231019033323.54147-1-yangyicong@huawei.com>
X-Mailing-List: linux-kernel@vger.kernel.org

From: Barry Song

For platforms with clusters, such as Kunpeng920, CPUs within the same
cluster have lower latency when synchronizing and accessing shared
resources like cache. This patch therefore tries to find an idle CPU
within the cluster of the target CPU before scanning the whole LLC, in
order to gain lower latency. This is implemented in two steps in
select_idle_sibling(); a condensed sketch follows the list:

1. When prev_cpu/recent_used_cpu are good wakeup candidates, use them
   if they share a cluster with the target CPU. Otherwise, try to scan
   for an idle CPU in the target's cluster.
2. Scan the cluster prior to the LLC of the target CPU for an idle CPU
   to wake up on.
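The following is a condensed sketch of the cluster-first pass (an
editor's reduction of the diff below, which also handles has_idle_core
and the scan-depth limit "nr"):

	/* sketch: scan the target's cluster before the LLC-wide pass */
	if (static_branch_unlikely(&sched_cluster_active)) {
		struct sched_group *sg = sd->groups;	/* first group: the cluster */

		if (sg->flags & SD_CLUSTER) {
			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
				idle_cpu = __select_idle_cpu(cpu, p);
				if ((unsigned int)idle_cpu < nr_cpumask_bits)
					return idle_cpu;
			}
			/* exclude cluster CPUs from the following LLC-wide pass */
			cpumask_andnot(cpus, cpus, sched_group_span(sg));
		}
	}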
Testing has been done on Kunpeng920 with tasks pinned to one NUMA node
and to two NUMA nodes. On Kunpeng920, each NUMA node has 8 clusters and
each cluster has 4 CPUs.

With this patch, we noticed an enhancement on tbench and netperf within
one NUMA node and across two NUMA nodes, on top of tip-sched-core commit
9b46f1abc6d4 ("sched/debug: Print 'tgid' in sched_show_task()").

tbench results (node 0):
            baseline        patched
  1:        327.2833       372.4623 ( 13.80%)
  4:       1320.5933      1479.8833 ( 12.06%)
  8:       2638.4867      2921.5267 ( 10.73%)
 16:       5282.7133      5891.5633 ( 11.53%)
 32:       9810.6733      9877.3400 (  0.68%)
 64:       7408.9367      7447.9900 (  0.53%)
128:       6203.2600      6191.6500 ( -0.19%)

tbench results (node 0-1):
            baseline        patched
  1:        332.0433       372.7223 ( 12.25%)
  4:       1325.4667      1477.6733 ( 11.48%)
  8:       2622.9433      2897.9967 ( 10.49%)
 16:       5218.6100      5878.2967 ( 12.64%)
 32:      10211.7000     11494.4000 ( 12.56%)
 64:      13313.7333     16740.0333 ( 25.74%)
128:      13959.1000     14533.9000 (  4.12%)

netperf results TCP_RR (node 0):
            baseline        patched
  1:      76546.5033     90649.9867 ( 18.42%)
  4:      77292.4450     90932.7175 ( 17.65%)
  8:      77367.7254     90882.3467 ( 17.47%)
 16:      78519.9048     90938.8344 ( 15.82%)
 32:      72169.5035     72851.6730 (  0.95%)
 64:      25911.2457     25882.2315 ( -0.11%)
128:      10752.6572     10768.6038 (  0.15%)

netperf results TCP_RR (node 0-1):
            baseline        patched
  1:      76857.6667     90892.2767 ( 18.26%)
  4:      78236.6475     90767.3017 ( 16.02%)
  8:      77929.6096     90684.1633 ( 16.37%)
 16:      77438.5873     90502.5787 ( 16.87%)
 32:      74205.6635     88301.5612 ( 19.00%)
 64:      69827.8535     71787.6706 (  2.81%)
128:      25281.4366     25771.3023 (  1.94%)

netperf results UDP_RR (node 0):
            baseline        patched
  1:      96869.8400    110800.8467 ( 14.38%)
  4:      97744.9750    109680.5425 ( 12.21%)
  8:      98783.9863    110409.9637 ( 11.77%)
 16:      99575.0235    110636.2435 ( 11.11%)
 32:      95044.7250     97622.8887 (  2.71%)
 64:      32925.2146     32644.4991 ( -0.85%)
128:      12859.2343     12824.0051 ( -0.27%)

netperf results UDP_RR (node 0-1):
            baseline        patched
  1:      97202.4733    110190.1200 ( 13.36%)
  4:      95954.0558    106245.7258 ( 10.73%)
  8:      96277.1958    105206.5304 (  9.27%)
 16:      97692.7810    107927.2125 ( 10.48%)
 32:      79999.6702    103550.2999 ( 29.44%)
 64:      80592.7413     87284.0856 (  8.30%)
128:      27701.5770     29914.5820 (  7.99%)

Note that neither Kunpeng920 nor x86 Jacobsville supports SMT, so the
SMT branch in the code has not been tested, but it is supposed to work.
Chen Yu also observed that this improves tbench and netperf performance
on a 24-CPU Jacobsville machine, where each cluster of 4 CPUs shares an
L2 cache.

Suggested-by: Peter Zijlstra [https://lore.kernel.org/lkml/Ytfjs+m1kUs0ScSn@worktop.programming.kicks-ass.net]
Tested-by: Yicong Yang
Signed-off-by: Barry Song
Signed-off-by: Yicong Yang
Reviewed-by: Tim Chen
Reviewed-by: Chen Yu
Reviewed-by: Gautham R. Shenoy
Reviewed-by: Vincent Guittot
---
 kernel/sched/fair.c     | 41 +++++++++++++++++++++++++++++++++++++----
 kernel/sched/sched.h    |  1 +
 kernel/sched/topology.c | 12 ++++++++++++
 3 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9ae2208089e4..02d842df5294 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7213,6 +7213,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 		}
 	}
 
+	if (static_branch_unlikely(&sched_cluster_active)) {
+		struct sched_group *sg = sd->groups;
+
+		if (sg->flags & SD_CLUSTER) {
+			for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) {
+				if (!cpumask_test_cpu(cpu, cpus))
+					continue;
+
+				if (has_idle_core) {
+					i = select_idle_core(p, cpu, cpus, &idle_cpu);
+					if ((unsigned int)i < nr_cpumask_bits)
+						return i;
+				} else {
+					if (--nr <= 0)
+						return -1;
+					idle_cpu = __select_idle_cpu(cpu, p);
+					if ((unsigned int)idle_cpu < nr_cpumask_bits)
+						return idle_cpu;
+				}
+			}
+			cpumask_andnot(cpus, cpus, sched_group_span(sg));
+		}
+	}
+
 	for_each_cpu_wrap(cpu, cpus, target + 1) {
 		if (has_idle_core) {
 			i = select_idle_core(p, cpu, cpus, &idle_cpu);
@@ -7220,7 +7244,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
 			return i;
 
 		} else {
-			if (!--nr)
+			if (--nr <= 0)
 				return -1;
 			idle_cpu = __select_idle_cpu(cpu, p);
 			if ((unsigned int)idle_cpu < nr_cpumask_bits)
@@ -7349,8 +7373,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
 	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
-	    asym_fits_cpu(task_util, util_min, util_max, prev))
-		return prev;
+	    asym_fits_cpu(task_util, util_min, util_max, prev)) {
+		if (!static_branch_unlikely(&sched_cluster_active))
+			return prev;
+
+		if (cpus_share_resources(prev, target))
+			return prev;
+	}
 
 	/*
 	 * Allow a per-cpu kthread to stack with the wakee if the
@@ -7377,7 +7406,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
 	    cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) &&
 	    asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) {
-		return recent_used_cpu;
+		if (!static_branch_unlikely(&sched_cluster_active))
+			return recent_used_cpu;
+
+		if (cpus_share_resources(recent_used_cpu, target))
+			return recent_used_cpu;
 	}
 
 	/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 998f03d02de0..ef4fe7bcf740 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1859,6 +1859,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
 extern struct static_key_false sched_asym_cpucapacity;
+extern struct static_key_false sched_cluster_active;
 
 static __always_inline bool sched_asym_cpucap_active(void)
 {
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dbb8c328e8ad..10d1391e7416 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -673,7 +673,9 @@ DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing);
 DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity);
+
 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
+DEFINE_STATIC_KEY_FALSE(sched_cluster_active);
 
 static void update_top_cache_domain(int cpu)
 {
@@ -2386,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	bool has_asym = false;
+	bool has_cluster = false;
 
 	if (WARN_ON(cpumask_empty(cpu_map)))
 		goto error;
@@ -2514,12 +2517,18 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 		WRITE_ONCE(d.rd->max_cpu_capacity, capacity);
 
 		cpu_attach_domain(sd, d.rd, i);
+
+		if (lowest_flag_domain(i, SD_CLUSTER))
+			has_cluster = true;
 	}
 	rcu_read_unlock();
 
 	if (has_asym)
 		static_branch_inc_cpuslocked(&sched_asym_cpucapacity);
 
+	if (has_cluster)
+		static_branch_inc_cpuslocked(&sched_cluster_active);
+
 	if (rq && sched_debug_verbose) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
@@ -2619,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map)
 	if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu)))
 		static_branch_dec_cpuslocked(&sched_asym_cpucapacity);
 
+	if (static_branch_unlikely(&sched_cluster_active))
+		static_branch_dec_cpuslocked(&sched_cluster_active);
+
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map)
 		cpu_attach_domain(NULL, &def_root_domain, i);
-- 
2.24.0

From nobody Tue Dec 16 11:46:11 2025
From: Yicong Yang
Subject: [PATCH v11 3/3] sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup
Date: Thu, 19 Oct 2023 11:33:23 +0800
Message-ID: <20231019033323.54147-4-yangyicong@huawei.com>
In-Reply-To: <20231019033323.54147-1-yangyicong@huawei.com>
References: <20231019033323.54147-1-yangyicong@huawei.com>
X-Mailing-List: linux-kernel@vger.kernel.org

From: Yicong Yang

Chen Yu reports a hackbench regression with cluster wakeup when the
number of hackbench threads equals the CPU count [1]. Analysis shows
this is because we wake up more tasks on the target CPU even when the
prev_cpu is a good wakeup candidate, which decreases CPU utilization.

Generally, if the task's prev_cpu is idle we wake the task up on it
without scanning. On cluster machines we try to wake the task up in the
same cluster as the target for better cache affinity, so if the
prev_cpu is idle but does not share a cluster with the target, we still
try to find an idle CPU within the cluster. This improves performance
at low loads on cluster machines. But in the issue above, when the
prev_cpu is idle but not in the target's cluster, we try to scan for an
idle CPU in the cluster; since the system is busy, the scan is likely
to fail and we fall back to the target instead, even though the
prev_cpu is idle. This leads to the regression.

This patch solves the problem in two steps (a sketch of the pattern
follows the list):
o Record the prev_cpu/recent_used_cpu if they're good wakeup candidates
  but don't share a cluster with the target.
o On scanning failure, use the prev_cpu/recent_used_cpu if they were
  recorded as idle.
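Condensed sketch of the record-and-fallback pattern (an editor's
reduction of the diff below; "prev_is_idle_candidate" stands in for the
full idleness/affinity checks in select_idle_sibling()):

	int prev_aff = -1;	/* idle prev_cpu outside the target's cluster */

	if (prev_is_idle_candidate) {
		if (cpus_share_resources(prev, target))
			return prev;	/* same cluster: use it right away */
		prev_aff = prev;	/* remember it in case the scan fails */
	}

	i = select_idle_cpu(p, sd, has_idle_core, target);
	if ((unsigned int)i < nr_cpumask_bits)
		return i;		/* scan found an idle CPU */

	if ((unsigned int)prev_aff < nr_cpumask_bits)
		return prev_aff;	/* scan failed: idle prev_cpu beats target */

	return target;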
[1] https://lore.kernel.org/all/ZGzDLuVaHR1PAYDt@chenyu5-mobl1/

Reported-by: Chen Yu
Closes: https://lore.kernel.org/all/ZGsLy83wPIpamy6x@chenyu5-mobl1/
Signed-off-by: Yicong Yang
Reviewed-by: Vincent Guittot
---
 kernel/sched/fair.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 02d842df5294..d508d1999ecc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7346,7 +7346,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	bool has_idle_core = false;
 	struct sched_domain *sd;
 	unsigned long task_util, util_min, util_max;
-	int i, recent_used_cpu;
+	int i, recent_used_cpu, prev_aff = -1;
 
 	/*
 	 * On asymmetric system, update task utilization because we will check
@@ -7379,6 +7379,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 
 		if (cpus_share_resources(prev, target))
 			return prev;
+
+		prev_aff = prev;
 	}
 
 	/*
@@ -7411,6 +7413,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 
 		if (cpus_share_resources(recent_used_cpu, target))
 			return recent_used_cpu;
+	} else {
+		recent_used_cpu = -1;
 	}
 
 	/*
@@ -7451,6 +7455,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	if ((unsigned)i < nr_cpumask_bits)
 		return i;
 
+	/*
+	 * For cluster machines that have a lower-level sharing cache, such
+	 * as L2 or the LLC tag, we tend to find an idle CPU in the target's
+	 * cluster first. But prev_cpu or recent_used_cpu may also be good
+	 * candidates; use them if possible when no idle CPU is found in
+	 * select_idle_cpu().
+	 */
+	if ((unsigned int)prev_aff < nr_cpumask_bits)
+		return prev_aff;
+	if ((unsigned int)recent_used_cpu < nr_cpumask_bits)
+		return recent_used_cpu;
+
 	return target;
 }
 
-- 
2.24.0

From nobody Tue Dec 16 11:46:11 2025
From: Peter Zijlstra
To: Yicong Yang
Subject: [PATCH] sched/fair: Remove SIS_PROP
Date: Fri, 20 Oct 2023 15:43:37 +0200
Message-ID: <20231020134337.GD33965@noisy.programming.kicks-ass.net>
In-Reply-To: <20231019033323.54147-1-yangyicong@huawei.com>
References: <20231019033323.54147-1-yangyicong@huawei.com>
X-Mailing-List: linux-kernel@vger.kernel.org

Since this had me looking at all that code, I did the below. Holler if
there are objections etc..

Acked-by: Mel Gorman
Acked-by: Vincent Guittot
Reviewed-by: Yicong Yang
---
Subject: sched/fair: Remove SIS_PROP
From: Peter Zijlstra
Date: Fri Oct 20 12:35:33 CEST 2023

SIS_UTIL seems to work well, let's remove the old thing.
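For context (editor's note, not part of the original mail): the
SIS_PROP heuristic being deleted limited how many CPUs
select_idle_cpu() would scan by dividing the predicted idle time across
the LLC by the average cost of scanning one CPU, with a floor of 4.
Its core arithmetic, taken from the removed hunk below, with made-up
example numbers in the comments:

	u64 avg_idle = this_rq->wake_avg_idle;		/* predicted idle time */
	u64 avg_cost = this_sd->avg_scan_cost + 1;	/* avg cost to scan one CPU */
	u64 span_avg = sd->span_weight * avg_idle;	/* idle budget over the LLC */

	if (span_avg > 4*avg_cost)
		nr = div_u64(span_avg, avg_cost);	/* e.g. 64 * 2000ns / 500ns = 256 */
	else
		nr = 4;					/* nearly busy: scan at most 4 CPUs */

SIS_UTIL, which remains, derives the scan depth from the LLC's measured
utilization instead, so the per-wakeup idle-time bookkeeping removed
below is no longer needed.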
Signed-off-by: Peter Zijlstra (Intel)
---
 include/linux/sched/topology.h |  2 -
 kernel/sched/core.c            |  5 ----
 kernel/sched/fair.c            | 48 ----------------------------------------
 kernel/sched/features.h        |  1 
 kernel/sched/sched.h           |  3 --
 5 files changed, 59 deletions(-)

--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -109,8 +109,6 @@ struct sched_domain {
 	u64 max_newidle_lb_cost;
 	unsigned long last_decay_max_lb_cost;
 
-	u64 avg_scan_cost;		/* select_idle_sibling */
-
 #ifdef CONFIG_SCHEDSTATS
 	/* load_balance() stats */
 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct t
 	if (rq->avg_idle > max)
 		rq->avg_idle = max;
 
-	rq->wake_stamp = jiffies;
-	rq->wake_avg_idle = rq->avg_idle / 2;
-
 	rq->idle_stamp = 0;
 }
 #endif
@@ -9991,8 +9988,6 @@ void __init sched_init(void)
 		rq->online = 0;
 		rq->idle_stamp = 0;
 		rq->avg_idle = 2*sysctl_sched_migration_cost;
-		rq->wake_stamp = jiffies;
-		rq->wake_avg_idle = rq->avg_idle;
 		rq->max_idle_balance_cost = sysctl_sched_migration_cost;
 
 		INIT_LIST_HEAD(&rq->cfs_tasks);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_s
 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask);
 	int i, cpu, idle_cpu = -1, nr = INT_MAX;
 	struct sched_domain_shared *sd_share;
-	struct rq *this_rq = this_rq();
-	int this = smp_processor_id();
-	struct sched_domain *this_sd = NULL;
-	u64 time = 0;
 
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
-	if (sched_feat(SIS_PROP) && !has_idle_core) {
-		u64 avg_cost, avg_idle, span_avg;
-		unsigned long now = jiffies;
-
-		this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
-		if (!this_sd)
-			return -1;
-
-		/*
-		 * If we're busy, the assumption that the last idle period
-		 * predicts the future is flawed; age away the remaining
-		 * predicted idle time.
-		 */
-		if (unlikely(this_rq->wake_stamp < now)) {
-			while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) {
-				this_rq->wake_stamp++;
-				this_rq->wake_avg_idle >>= 1;
-			}
-		}
-
-		avg_idle = this_rq->wake_avg_idle;
-		avg_cost = this_sd->avg_scan_cost + 1;
-
-		span_avg = sd->span_weight * avg_idle;
-		if (span_avg > 4*avg_cost)
-			nr = div_u64(span_avg, avg_cost);
-		else
-			nr = 4;
-
-		time = cpu_clock(this);
-	}
-
 	if (sched_feat(SIS_UTIL)) {
 		sd_share = rcu_dereference(per_cpu(sd_llc_shared, target));
 		if (sd_share) {
@@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_s
 	if (has_idle_core)
 		set_idle_cores(target, false);
 
-	if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) {
-		time = cpu_clock(this) - time;
-
-		/*
-		 * Account for the scan cost of wakeups against the average
-		 * idle time.
-		 */
-		this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time);
-
-		update_avg(&this_sd->avg_scan_cost, time);
-	}
-
 	return idle_cpu;
 }
 
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true)
 /*
  * When doing wakeups, attempt to limit superfluous scans of the LLC domain.
  */
-SCHED_FEAT(SIS_PROP, false)
 SCHED_FEAT(SIS_UTIL, true)
 
 /*
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1059,9 +1059,6 @@ struct rq {
 	u64			idle_stamp;
 	u64			avg_idle;
 
-	unsigned long		wake_stamp;
-	u64			wake_avg_idle;
-
 	/* This is used to determine avg_idle's max value */
 	u64			max_idle_balance_cost;