From nobody Sun Sep 14 22:30:59 2025 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id BD0B0C3DA78 for ; Tue, 17 Jan 2023 18:19:30 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231217AbjAQSSU (ORCPT ); Tue, 17 Jan 2023 13:18:20 -0500 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:47252 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S233208AbjAQSOE (ORCPT ); Tue, 17 Jan 2023 13:14:04 -0500 Received: from galois.linutronix.de (Galois.linutronix.de [IPv6:2a0a:51c0:0:12e:550::1]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 313CA59E45; Tue, 17 Jan 2023 09:55:00 -0800 (PST) Date: Tue, 17 Jan 2023 17:54:57 -0000 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020; t=1673978097; h=from:from:sender:sender:reply-to:reply-to:subject:subject:date:date: message-id:message-id:to:to:cc:cc:mime-version:mime-version: content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=xJ91IUMshMkq/FQxBTEgRO84zIo9Sze3/R99k5JCRwA=; b=mvoQ9w46QBY6O4TslPU0jG21wWsLvDiztZ+2GSQK5ezeU8P5GJKBMN3xA8jESe2v+tIoCd jwfU61rrDEywhnB/U/qFi5Kb1uKwyKOWdh3DMzn/qHOVDPbFl1MyjsfpA5StJikePueEmk vPtI0lem74/zOQz6V2FrQT4R9fAhpCsV8pELaNgWgJu3LqYfXhdjSlEsBOZ1WdZIwJLk/s 9Zmyv8pOgV+LUP+QRvv207EFq5ls5JrGsjeUViuJTYazHGnLjeVUGILhkpskedRkBilIAV WN5yvScgdB7IfPLBx6z6xCiKNSSt3qhnq47ZFAzsq6A8OEMB9F6q3cR7GuqKVg== DKIM-Signature: v=1; a=ed25519-sha256; c=relaxed/relaxed; d=linutronix.de; s=2020e; t=1673978097; h=from:from:sender:sender:reply-to:reply-to:subject:subject:date:date: message-id:message-id:to:to:cc:cc:mime-version:mime-version: content-type:content-type: content-transfer-encoding:content-transfer-encoding: in-reply-to:in-reply-to:references:references; bh=xJ91IUMshMkq/FQxBTEgRO84zIo9Sze3/R99k5JCRwA=; b=+vPKxkgsApvKuMneKxanjtRHiXxnppPuOSHWoYNKnDkWgV9VCMBQB8tOYFeYfP0mwmOY0l A38IPi3rc1iTSXBg== From: "tip-bot2 for Ming Lei" Sender: tip-bot2@linutronix.de Reply-to: linux-kernel@vger.kernel.org To: linux-tip-commits@vger.kernel.org Subject: [tip: irq/core] genirq/affinity: Move group_cpus_evenly() into lib/ Cc: Ming Lei , Thomas Gleixner , Christoph Hellwig , Jens Axboe , x86@kernel.org, linux-kernel@vger.kernel.org, maz@kernel.org In-Reply-To: <20221227022905.352674-6-ming.lei@redhat.com> References: <20221227022905.352674-6-ming.lei@redhat.com> MIME-Version: 1.0 Message-ID: <167397809730.4906.9269442173087463441.tip-bot2@tip-bot2> Robot-ID: Robot-Unsubscribe: Contact to get blacklisted from these emails Content-Type: text/plain; charset="utf-8" Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org The following commit has been merged into the irq/core branch of tip: Commit-ID: f7b3ea8cf72f3d6060fe08e461805181e7450a13 Gitweb: https://git.kernel.org/tip/f7b3ea8cf72f3d6060fe08e461805181e= 7450a13 Author: Ming Lei AuthorDate: Tue, 27 Dec 2022 10:29:04 +08:00 Committer: Thomas Gleixner CommitterDate: Tue, 17 Jan 2023 18:50:06 +01:00 genirq/affinity: Move group_cpus_evenly() into lib/ group_cpus_evenly() has become a generic function which can be used for other subsystems than the interrupt subsystem, so move it into lib/. Signed-off-by: Ming Lei Signed-off-by: Thomas Gleixner Reviewed-by: Christoph Hellwig Reviewed-by: Jens Axboe = = = =20 Link: https://lore.kernel.org/r/20221227022905.352674-6-ming.lei@redhat.com --- MAINTAINERS | 2 +- include/linux/group_cpus.h | 14 +- kernel/irq/affinity.c | 398 +---------------------------------- lib/Makefile | 2 +- lib/group_cpus.c | 427 ++++++++++++++++++++++++++++++++++++- 5 files changed, 446 insertions(+), 397 deletions(-) create mode 100644 include/linux/group_cpus.h create mode 100644 lib/group_cpus.c diff --git a/MAINTAINERS b/MAINTAINERS index a36df9e..9a07bd4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10935,6 +10935,8 @@ L: linux-kernel@vger.kernel.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git irq/core F: kernel/irq/ +F: include/linux/group_cpus.h +F: lib/group_cpus.c =20 IRQCHIP DRIVERS M: Thomas Gleixner diff --git a/include/linux/group_cpus.h b/include/linux/group_cpus.h new file mode 100644 index 0000000..e42807e --- /dev/null +++ b/include/linux/group_cpus.h @@ -0,0 +1,14 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ + +#ifndef __LINUX_GROUP_CPUS_H +#define __LINUX_GROUP_CPUS_H +#include +#include + +struct cpumask *group_cpus_evenly(unsigned int numgrps); + +#endif diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index 5408333..44a4eba 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -7,403 +7,7 @@ #include #include #include -#include - -static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nm= sk, - unsigned int cpus_per_grp) -{ - const struct cpumask *siblmsk; - int cpu, sibl; - - for ( ; cpus_per_grp > 0; ) { - cpu =3D cpumask_first(nmsk); - - /* Should not happen, but I'm too lazy to think about it */ - if (cpu >=3D nr_cpu_ids) - return; - - cpumask_clear_cpu(cpu, nmsk); - cpumask_set_cpu(cpu, irqmsk); - cpus_per_grp--; - - /* If the cpu has siblings, use them first */ - siblmsk =3D topology_sibling_cpumask(cpu); - for (sibl =3D -1; cpus_per_grp > 0; ) { - sibl =3D cpumask_next(sibl, siblmsk); - if (sibl >=3D nr_cpu_ids) - break; - if (!cpumask_test_and_clear_cpu(sibl, nmsk)) - continue; - cpumask_set_cpu(sibl, irqmsk); - cpus_per_grp--; - } - } -} - -static cpumask_var_t *alloc_node_to_cpumask(void) -{ - cpumask_var_t *masks; - int node; - - masks =3D kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); - if (!masks) - return NULL; - - for (node =3D 0; node < nr_node_ids; node++) { - if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) - goto out_unwind; - } - - return masks; - -out_unwind: - while (--node >=3D 0) - free_cpumask_var(masks[node]); - kfree(masks); - return NULL; -} - -static void free_node_to_cpumask(cpumask_var_t *masks) -{ - int node; - - for (node =3D 0; node < nr_node_ids; node++) - free_cpumask_var(masks[node]); - kfree(masks); -} - -static void build_node_to_cpumask(cpumask_var_t *masks) -{ - int cpu; - - for_each_possible_cpu(cpu) - cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); -} - -static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, - const struct cpumask *mask, nodemask_t *nodemsk) -{ - int n, nodes =3D 0; - - /* Calculate the number of nodes in the supplied affinity mask */ - for_each_node(n) { - if (cpumask_intersects(mask, node_to_cpumask[n])) { - node_set(n, *nodemsk); - nodes++; - } - } - return nodes; -} - -struct node_groups { - unsigned id; - - union { - unsigned ngroups; - unsigned ncpus; - }; -}; - -static int ncpus_cmp_func(const void *l, const void *r) -{ - const struct node_groups *ln =3D l; - const struct node_groups *rn =3D r; - - return ln->ncpus - rn->ncpus; -} - -/* - * Allocate group number for each node, so that for each node: - * - * 1) the allocated number is >=3D 1 - * - * 2) the allocated number is <=3D active CPU number of this node - * - * The actual allocated total groups may be less than @numgrps when - * active total CPU number is less than @numgrps. - * - * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' - * for each node. - */ -static void alloc_nodes_groups(unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - const nodemask_t nodemsk, - struct cpumask *nmsk, - struct node_groups *node_groups) -{ - unsigned n, remaining_ncpus =3D 0; - - for (n =3D 0; n < nr_node_ids; n++) { - node_groups[n].id =3D n; - node_groups[n].ncpus =3D UINT_MAX; - } - - for_each_node_mask(n, nodemsk) { - unsigned ncpus; - - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - ncpus =3D cpumask_weight(nmsk); - - if (!ncpus) - continue; - remaining_ncpus +=3D ncpus; - node_groups[n].ncpus =3D ncpus; - } - - numgrps =3D min_t(unsigned, remaining_ncpus, numgrps); - - sort(node_groups, nr_node_ids, sizeof(node_groups[0]), - ncpus_cmp_func, NULL); - - /* - * Allocate groups for each node according to the ratio of this - * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is - * bigger than number of active numa nodes. Always start the - * allocation from the node with minimized nr_cpus. - * - * This way guarantees that each active node gets allocated at - * least one group, and the theory is simple: over-allocation - * is only done when this node is assigned by one group, so - * other nodes will be allocated >=3D 1 groups, since 'numgrps' is - * bigger than number of numa nodes. - * - * One perfect invariant is that number of allocated groups for - * each node is <=3D CPU count of this node: - * - * 1) suppose there are two nodes: A and B - * ncpu(X) is CPU count of node X - * grps(X) is the group count allocated to node X via this - * algorithm - * - * ncpu(A) <=3D ncpu(B) - * ncpu(A) + ncpu(B) =3D N - * grps(A) + grps(B) =3D G - * - * grps(A) =3D max(1, round_down(G * ncpu(A) / N)) - * grps(B) =3D G - grps(A) - * - * both N and G are integer, and 2 <=3D G <=3D N, suppose - * G =3D N - delta, and 0 <=3D delta <=3D N - 2 - * - * 2) obviously grps(A) <=3D ncpu(A) because: - * - * if grps(A) is 1, then grps(A) <=3D ncpu(A) given - * ncpu(A) >=3D 1 - * - * otherwise, - * grps(A) <=3D G * ncpu(A) / N <=3D ncpu(A), given G <=3D N - * - * 3) prove how grps(B) <=3D ncpu(B): - * - * if round_down(G * ncpu(A) / N) =3D=3D 0, vecs(B) won't be - * over-allocated, so grps(B) <=3D ncpu(B), - * - * otherwise: - * - * grps(A) =3D - * round_down(G * ncpu(A) / N) =3D - * round_down((N - delta) * ncpu(A) / N) =3D - * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=3D - * round_down((N * ncpu(A) - delta * N) / N) =3D - * cpu(A) - delta - * - * then: - * - * grps(A) - G >=3D ncpu(A) - delta - G - * =3D> - * G - grps(A) <=3D G + delta - ncpu(A) - * =3D> - * grps(B) <=3D N - ncpu(A) - * =3D> - * grps(B) <=3D cpu(B) - * - * For nodes >=3D 3, it can be thought as one node and another big - * node given that is exactly what this algorithm is implemented, - * and we always re-calculate 'remaining_ncpus' & 'numgrps', and - * finally for each node X: grps(X) <=3D ncpu(X). - * - */ - for (n =3D 0; n < nr_node_ids; n++) { - unsigned ngroups, ncpus; - - if (node_groups[n].ncpus =3D=3D UINT_MAX) - continue; - - WARN_ON_ONCE(numgrps =3D=3D 0); - - ncpus =3D node_groups[n].ncpus; - ngroups =3D max_t(unsigned, 1, - numgrps * ncpus / remaining_ncpus); - WARN_ON_ONCE(ngroups > ncpus); - - node_groups[n].ngroups =3D ngroups; - - remaining_ncpus -=3D ncpus; - numgrps -=3D ngroups; - } -} - -static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, - cpumask_var_t *node_to_cpumask, - const struct cpumask *cpu_mask, - struct cpumask *nmsk, struct cpumask *masks) -{ - unsigned int i, n, nodes, cpus_per_grp, extra_grps, done =3D 0; - unsigned int last_grp =3D numgrps; - unsigned int curgrp =3D startgrp; - nodemask_t nodemsk =3D NODE_MASK_NONE; - struct node_groups *node_groups; - - if (cpumask_empty(cpu_mask)) - return 0; - - nodes =3D get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); - - /* - * If the number of nodes in the mask is greater than or equal the - * number of groups we just spread the groups across the nodes. - */ - if (numgrps <=3D nodes) { - for_each_node_mask(n, nodemsk) { - /* Ensure that only CPUs which are in both masks are set */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); - cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); - if (++curgrp =3D=3D last_grp) - curgrp =3D 0; - } - return numgrps; - } - - node_groups =3D kcalloc(nr_node_ids, - sizeof(struct node_groups), - GFP_KERNEL); - if (!node_groups) - return -ENOMEM; - - /* allocate group number for each node */ - alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, - nodemsk, nmsk, node_groups); - for (i =3D 0; i < nr_node_ids; i++) { - unsigned int ncpus, v; - struct node_groups *nv =3D &node_groups[i]; - - if (nv->ngroups =3D=3D UINT_MAX) - continue; - - /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); - ncpus =3D cpumask_weight(nmsk); - if (!ncpus) - continue; - - WARN_ON_ONCE(nv->ngroups > ncpus); - - /* Account for rounding errors */ - extra_grps =3D ncpus - nv->ngroups * (ncpus / nv->ngroups); - - /* Spread allocated groups on CPUs of the current node */ - for (v =3D 0; v < nv->ngroups; v++, curgrp++) { - cpus_per_grp =3D ncpus / nv->ngroups; - - /* Account for extra groups to compensate rounding errors */ - if (extra_grps) { - cpus_per_grp++; - --extra_grps; - } - - /* - * wrapping has to be considered given 'startgrp' - * may start anywhere - */ - if (curgrp >=3D last_grp) - curgrp =3D 0; - grp_spread_init_one(&masks[curgrp], nmsk, - cpus_per_grp); - } - done +=3D nv->ngroups; - } - kfree(node_groups); - return done; -} - -/* - * build affinity in two stages for each group, and try to put close CPUs - * in viewpoint of CPU and NUMA locality into same group, and we run - * two-stage grouping: - * - * 1) allocate present CPUs on these groups evenly first - * 2) allocate other possible CPUs on these groups evenly - */ -static struct cpumask *group_cpus_evenly(unsigned int numgrps) -{ - unsigned int curgrp =3D 0, nr_present =3D 0, nr_others =3D 0; - cpumask_var_t *node_to_cpumask; - cpumask_var_t nmsk, npresmsk; - int ret =3D -ENOMEM; - struct cpumask *masks =3D NULL; - - if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) - return NULL; - - if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) - goto fail_nmsk; - - node_to_cpumask =3D alloc_node_to_cpumask(); - if (!node_to_cpumask) - goto fail_npresmsk; - - masks =3D kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); - if (!masks) - goto fail_node_to_cpumask; - - /* Stabilize the cpumasks */ - cpus_read_lock(); - build_node_to_cpumask(node_to_cpumask); - - /* grouping present CPUs first */ - ret =3D __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - cpu_present_mask, nmsk, masks); - if (ret < 0) - goto fail_build_affinity; - nr_present =3D ret; - - /* - * Allocate non present CPUs starting from the next group to be - * handled. If the grouping of present CPUs already exhausted the - * group space, assign the non present CPUs to the already - * allocated out groups. - */ - if (nr_present >=3D numgrps) - curgrp =3D 0; - else - curgrp =3D nr_present; - cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); - ret =3D __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, - npresmsk, nmsk, masks); - if (ret >=3D 0) - nr_others =3D ret; - - fail_build_affinity: - cpus_read_unlock(); - - if (ret >=3D 0) - WARN_ON(nr_present + nr_others < numgrps); - - fail_node_to_cpumask: - free_node_to_cpumask(node_to_cpumask); - - fail_npresmsk: - free_cpumask_var(npresmsk); - - fail_nmsk: - free_cpumask_var(nmsk); - if (ret < 0) { - kfree(masks); - return NULL; - } - return masks; -} +#include =20 static void default_calc_sets(struct irq_affinity *affd, unsigned int affv= ecs) { diff --git a/lib/Makefile b/lib/Makefile index 4d9461b..a4665a8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -353,6 +353,8 @@ obj-$(CONFIG_SBITMAP) +=3D sbitmap.o =20 obj-$(CONFIG_PARMAN) +=3D parman.o =20 +obj-y +=3D group_cpus.o + # GCC library routines obj-$(CONFIG_GENERIC_LIB_ASHLDI3) +=3D ashldi3.o obj-$(CONFIG_GENERIC_LIB_ASHRDI3) +=3D ashrdi3.o diff --git a/lib/group_cpus.c b/lib/group_cpus.c new file mode 100644 index 0000000..99f08c6 --- /dev/null +++ b/lib/group_cpus.c @@ -0,0 +1,427 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2016 Thomas Gleixner. + * Copyright (C) 2016-2017 Christoph Hellwig. + */ +#include +#include +#include +#include +#include + +static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nm= sk, + unsigned int cpus_per_grp) +{ + const struct cpumask *siblmsk; + int cpu, sibl; + + for ( ; cpus_per_grp > 0; ) { + cpu =3D cpumask_first(nmsk); + + /* Should not happen, but I'm too lazy to think about it */ + if (cpu >=3D nr_cpu_ids) + return; + + cpumask_clear_cpu(cpu, nmsk); + cpumask_set_cpu(cpu, irqmsk); + cpus_per_grp--; + + /* If the cpu has siblings, use them first */ + siblmsk =3D topology_sibling_cpumask(cpu); + for (sibl =3D -1; cpus_per_grp > 0; ) { + sibl =3D cpumask_next(sibl, siblmsk); + if (sibl >=3D nr_cpu_ids) + break; + if (!cpumask_test_and_clear_cpu(sibl, nmsk)) + continue; + cpumask_set_cpu(sibl, irqmsk); + cpus_per_grp--; + } + } +} + +static cpumask_var_t *alloc_node_to_cpumask(void) +{ + cpumask_var_t *masks; + int node; + + masks =3D kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL); + if (!masks) + return NULL; + + for (node =3D 0; node < nr_node_ids; node++) { + if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL)) + goto out_unwind; + } + + return masks; + +out_unwind: + while (--node >=3D 0) + free_cpumask_var(masks[node]); + kfree(masks); + return NULL; +} + +static void free_node_to_cpumask(cpumask_var_t *masks) +{ + int node; + + for (node =3D 0; node < nr_node_ids; node++) + free_cpumask_var(masks[node]); + kfree(masks); +} + +static void build_node_to_cpumask(cpumask_var_t *masks) +{ + int cpu; + + for_each_possible_cpu(cpu) + cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); +} + +static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask, + const struct cpumask *mask, nodemask_t *nodemsk) +{ + int n, nodes =3D 0; + + /* Calculate the number of nodes in the supplied affinity mask */ + for_each_node(n) { + if (cpumask_intersects(mask, node_to_cpumask[n])) { + node_set(n, *nodemsk); + nodes++; + } + } + return nodes; +} + +struct node_groups { + unsigned id; + + union { + unsigned ngroups; + unsigned ncpus; + }; +}; + +static int ncpus_cmp_func(const void *l, const void *r) +{ + const struct node_groups *ln =3D l; + const struct node_groups *rn =3D r; + + return ln->ncpus - rn->ncpus; +} + +/* + * Allocate group number for each node, so that for each node: + * + * 1) the allocated number is >=3D 1 + * + * 2) the allocated number is <=3D active CPU number of this node + * + * The actual allocated total groups may be less than @numgrps when + * active total CPU number is less than @numgrps. + * + * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]' + * for each node. + */ +static void alloc_nodes_groups(unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + const nodemask_t nodemsk, + struct cpumask *nmsk, + struct node_groups *node_groups) +{ + unsigned n, remaining_ncpus =3D 0; + + for (n =3D 0; n < nr_node_ids; n++) { + node_groups[n].id =3D n; + node_groups[n].ncpus =3D UINT_MAX; + } + + for_each_node_mask(n, nodemsk) { + unsigned ncpus; + + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + ncpus =3D cpumask_weight(nmsk); + + if (!ncpus) + continue; + remaining_ncpus +=3D ncpus; + node_groups[n].ncpus =3D ncpus; + } + + numgrps =3D min_t(unsigned, remaining_ncpus, numgrps); + + sort(node_groups, nr_node_ids, sizeof(node_groups[0]), + ncpus_cmp_func, NULL); + + /* + * Allocate groups for each node according to the ratio of this + * node's nr_cpus to remaining un-assigned ncpus. 'numgrps' is + * bigger than number of active numa nodes. Always start the + * allocation from the node with minimized nr_cpus. + * + * This way guarantees that each active node gets allocated at + * least one group, and the theory is simple: over-allocation + * is only done when this node is assigned by one group, so + * other nodes will be allocated >=3D 1 groups, since 'numgrps' is + * bigger than number of numa nodes. + * + * One perfect invariant is that number of allocated groups for + * each node is <=3D CPU count of this node: + * + * 1) suppose there are two nodes: A and B + * ncpu(X) is CPU count of node X + * grps(X) is the group count allocated to node X via this + * algorithm + * + * ncpu(A) <=3D ncpu(B) + * ncpu(A) + ncpu(B) =3D N + * grps(A) + grps(B) =3D G + * + * grps(A) =3D max(1, round_down(G * ncpu(A) / N)) + * grps(B) =3D G - grps(A) + * + * both N and G are integer, and 2 <=3D G <=3D N, suppose + * G =3D N - delta, and 0 <=3D delta <=3D N - 2 + * + * 2) obviously grps(A) <=3D ncpu(A) because: + * + * if grps(A) is 1, then grps(A) <=3D ncpu(A) given + * ncpu(A) >=3D 1 + * + * otherwise, + * grps(A) <=3D G * ncpu(A) / N <=3D ncpu(A), given G <=3D N + * + * 3) prove how grps(B) <=3D ncpu(B): + * + * if round_down(G * ncpu(A) / N) =3D=3D 0, vecs(B) won't be + * over-allocated, so grps(B) <=3D ncpu(B), + * + * otherwise: + * + * grps(A) =3D + * round_down(G * ncpu(A) / N) =3D + * round_down((N - delta) * ncpu(A) / N) =3D + * round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=3D + * round_down((N * ncpu(A) - delta * N) / N) =3D + * cpu(A) - delta + * + * then: + * + * grps(A) - G >=3D ncpu(A) - delta - G + * =3D> + * G - grps(A) <=3D G + delta - ncpu(A) + * =3D> + * grps(B) <=3D N - ncpu(A) + * =3D> + * grps(B) <=3D cpu(B) + * + * For nodes >=3D 3, it can be thought as one node and another big + * node given that is exactly what this algorithm is implemented, + * and we always re-calculate 'remaining_ncpus' & 'numgrps', and + * finally for each node X: grps(X) <=3D ncpu(X). + * + */ + for (n =3D 0; n < nr_node_ids; n++) { + unsigned ngroups, ncpus; + + if (node_groups[n].ncpus =3D=3D UINT_MAX) + continue; + + WARN_ON_ONCE(numgrps =3D=3D 0); + + ncpus =3D node_groups[n].ncpus; + ngroups =3D max_t(unsigned, 1, + numgrps * ncpus / remaining_ncpus); + WARN_ON_ONCE(ngroups > ncpus); + + node_groups[n].ngroups =3D ngroups; + + remaining_ncpus -=3D ncpus; + numgrps -=3D ngroups; + } +} + +static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps, + cpumask_var_t *node_to_cpumask, + const struct cpumask *cpu_mask, + struct cpumask *nmsk, struct cpumask *masks) +{ + unsigned int i, n, nodes, cpus_per_grp, extra_grps, done =3D 0; + unsigned int last_grp =3D numgrps; + unsigned int curgrp =3D startgrp; + nodemask_t nodemsk =3D NODE_MASK_NONE; + struct node_groups *node_groups; + + if (cpumask_empty(cpu_mask)) + return 0; + + nodes =3D get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk); + + /* + * If the number of nodes in the mask is greater than or equal the + * number of groups we just spread the groups across the nodes. + */ + if (numgrps <=3D nodes) { + for_each_node_mask(n, nodemsk) { + /* Ensure that only CPUs which are in both masks are set */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]); + cpumask_or(&masks[curgrp], &masks[curgrp], nmsk); + if (++curgrp =3D=3D last_grp) + curgrp =3D 0; + } + return numgrps; + } + + node_groups =3D kcalloc(nr_node_ids, + sizeof(struct node_groups), + GFP_KERNEL); + if (!node_groups) + return -ENOMEM; + + /* allocate group number for each node */ + alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask, + nodemsk, nmsk, node_groups); + for (i =3D 0; i < nr_node_ids; i++) { + unsigned int ncpus, v; + struct node_groups *nv =3D &node_groups[i]; + + if (nv->ngroups =3D=3D UINT_MAX) + continue; + + /* Get the cpus on this node which are in the mask */ + cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]); + ncpus =3D cpumask_weight(nmsk); + if (!ncpus) + continue; + + WARN_ON_ONCE(nv->ngroups > ncpus); + + /* Account for rounding errors */ + extra_grps =3D ncpus - nv->ngroups * (ncpus / nv->ngroups); + + /* Spread allocated groups on CPUs of the current node */ + for (v =3D 0; v < nv->ngroups; v++, curgrp++) { + cpus_per_grp =3D ncpus / nv->ngroups; + + /* Account for extra groups to compensate rounding errors */ + if (extra_grps) { + cpus_per_grp++; + --extra_grps; + } + + /* + * wrapping has to be considered given 'startgrp' + * may start anywhere + */ + if (curgrp >=3D last_grp) + curgrp =3D 0; + grp_spread_init_one(&masks[curgrp], nmsk, + cpus_per_grp); + } + done +=3D nv->ngroups; + } + kfree(node_groups); + return done; +} + +#ifdef CONFIG_SMP +/** + * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality + * @numgrps: number of groups + * + * Return: cpumask array if successful, NULL otherwise. And each element + * includes CPUs assigned to this group + * + * Try to put close CPUs from viewpoint of CPU and NUMA locality into + * same group, and run two-stage grouping: + * 1) allocate present CPUs on these groups evenly first + * 2) allocate other possible CPUs on these groups evenly + * + * We guarantee in the resulted grouping that all CPUs are covered, and + * no same CPU is assigned to multiple groups + */ +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + unsigned int curgrp =3D 0, nr_present =3D 0, nr_others =3D 0; + cpumask_var_t *node_to_cpumask; + cpumask_var_t nmsk, npresmsk; + int ret =3D -ENOMEM; + struct cpumask *masks =3D NULL; + + if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL)) + return NULL; + + if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL)) + goto fail_nmsk; + + node_to_cpumask =3D alloc_node_to_cpumask(); + if (!node_to_cpumask) + goto fail_npresmsk; + + masks =3D kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + if (!masks) + goto fail_node_to_cpumask; + + /* Stabilize the cpumasks */ + cpus_read_lock(); + build_node_to_cpumask(node_to_cpumask); + + /* grouping present CPUs first */ + ret =3D __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + cpu_present_mask, nmsk, masks); + if (ret < 0) + goto fail_build_affinity; + nr_present =3D ret; + + /* + * Allocate non present CPUs starting from the next group to be + * handled. If the grouping of present CPUs already exhausted the + * group space, assign the non present CPUs to the already + * allocated out groups. + */ + if (nr_present >=3D numgrps) + curgrp =3D 0; + else + curgrp =3D nr_present; + cpumask_andnot(npresmsk, cpu_possible_mask, cpu_present_mask); + ret =3D __group_cpus_evenly(curgrp, numgrps, node_to_cpumask, + npresmsk, nmsk, masks); + if (ret >=3D 0) + nr_others =3D ret; + + fail_build_affinity: + cpus_read_unlock(); + + if (ret >=3D 0) + WARN_ON(nr_present + nr_others < numgrps); + + fail_node_to_cpumask: + free_node_to_cpumask(node_to_cpumask); + + fail_npresmsk: + free_cpumask_var(npresmsk); + + fail_nmsk: + free_cpumask_var(nmsk); + if (ret < 0) { + kfree(masks); + return NULL; + } + return masks; +} +#else +struct cpumask *group_cpus_evenly(unsigned int numgrps) +{ + struct cpumask *masks =3D kcalloc(numgrps, sizeof(*masks), GFP_KERNEL); + + if (!masks) + return NULL; + + /* assign all CPUs(cpu 0) to the 1st group only */ + cpumask_copy(&masks[0], cpu_possible_mask); + return masks; +} +#endif