[PATCH] lib/group_cpus: rotate extra groups to avoid IRQ stacking

Naman Jain posted 1 patch 1 week, 3 days ago
lib/group_cpus.c | 116 ++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 104 insertions(+), 12 deletions(-)
[PATCH] lib/group_cpus: rotate extra groups to avoid IRQ stacking
Posted by Naman Jain 1 week, 3 days ago
When multiple devices call group_cpus_evenly() with the same number of
groups, the cluster-aware path in __try_group_cluster_cpus() assigns
extra groups to the same set of clusters every time, producing identical
affinity masks for every caller. CPUs in clusters that receive two
groups (and thus get single-CPU dedicated masks) end up handling
interrupts from ALL devices, creating an IRQ imbalance.

For example, on a 96-CPU / 2-NUMA-node system with 48 clusters of
2 CPUs each (24 clusters per NUMA node) and 6 NVMe disks each
requesting 62 vectors: alloc_groups_to_nodes() distributes each
node's 31 groups across its 24 clusters, giving 7 clusters 2 groups
(single-CPU mask = dedicated) and 17 clusters 1 group (2-CPU mask =
shared). Because the assignment is deterministic, all 6 disks produce
the same mapping, and the same 14 CPUs per node each accumulate 6
dedicated IRQs -- roughly twice the interrupt load of other CPUs --
causing up to 11% per-disk throughput degradation on the IRQ-heavy
CPUs.

Fix this by introducing a per-caller rotation offset via a static
atomic counter. After alloc_groups_to_nodes() determines each
cluster's group count, collect the extras (groups above the per-cluster
minimum), then redistribute them starting from a rotated position with
a stride of ncluster/extras so that successive callers scatter their
extra groups across different clusters. A capacity check
(cpumask_weight_and) ensures no cluster is assigned more groups than it
has CPUs, with a fallback loop for any extras that could not be placed
in the strided pass.

For systems without cluster topology, the same rotation is applied in
assign_cpus_to_groups() at the per-group level: the modular test
(v + spread_offset) % nv->ngroups < extra_grps selects which groups
receive an extra CPU, replacing the previous sequential decrement.

Tested on a 96-vCPU Hyper-V VM (AMD EPYC 9V74, 48 clusters of 2, 2
NUMA nodes) with 6 NVMe data disks (63 MSI-X vectors each), running
4K random-read fio (psync, 16 jobs/disk, 120s, CPU-pinned):
- Worst-disk degradation vs average: 11% -> 5%
- Previously penalized disks: +12% IOPS, 10% lower latency

Fixes: 89802ca36c96 ("lib/group_cpus: make group CPU cluster aware")
Co-developed-by: Long Li <longli@microsoft.com>
Signed-off-by: Long Li <longli@microsoft.com>
Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
---
 lib/group_cpus.c | 116 ++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 104 insertions(+), 12 deletions(-)

diff --git a/lib/group_cpus.c b/lib/group_cpus.c
index e6e18d7a49bb..241c2c9b437d 100644
--- a/lib/group_cpus.c
+++ b/lib/group_cpus.c
@@ -7,6 +7,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/sort.h>
+#include <linux/atomic.h>
 #include <linux/group_cpus.h>
 
 #ifdef CONFIG_SMP
@@ -255,12 +256,20 @@ static void alloc_nodes_groups(unsigned int numgrps,
 	alloc_groups_to_nodes(numgrps, numcpus, node_groups, nr_node_ids);
 }
 
+/*
+ * Per-caller rotation counter for group_cpus_evenly().
+ * Wrapping is harmless: the offset is only used modulo small values
+ * (ncluster or nv->ngroups), so any unsigned value works.
+ */
+static atomic_t group_spread_cnt = ATOMIC_INIT(0);
+
 static void assign_cpus_to_groups(unsigned int ncpus,
 				  struct cpumask *nmsk,
 				  struct node_groups *nv,
 				  struct cpumask *masks,
 				  unsigned int *curgrp,
-				  unsigned int last_grp)
+				  unsigned int last_grp,
+				  unsigned int spread_offset)
 {
 	unsigned int v, cpus_per_grp, extra_grps;
 	/* Account for rounding errors */
@@ -270,11 +279,15 @@ static void assign_cpus_to_groups(unsigned int ncpus,
 	for (v = 0; v < nv->ngroups; v++, *curgrp += 1) {
 		cpus_per_grp = ncpus / nv->ngroups;
 
-		/* Account for extra groups to compensate rounding errors */
-		if (extra_grps) {
+		/*
+		 * Rotate which groups get the extra CPU so that
+		 * successive callers produce different mappings,
+		 * avoiding IRQ stacking when multiple devices
+		 * share the same CPU topology.
+		 */
+		if (extra_grps &&
+		    (v + spread_offset) % nv->ngroups < extra_grps)
 			cpus_per_grp++;
-			--extra_grps;
-		}
 
 		/*
 		 * wrapping has to be considered given 'startgrp'
@@ -361,7 +374,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
 				     struct cpumask *node_cpumask,
 				     struct cpumask *masks,
 				     unsigned int *curgrp,
-				     unsigned int last_grp)
+				     unsigned int last_grp,
+				     unsigned int spread_offset)
 {
 	struct node_groups *cluster_groups;
 	const struct cpumask **clusters;
@@ -379,6 +393,78 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
 	if (ncluster == 0)
 		goto fail_no_clusters;
 
+	/*
+	 * Rotate which clusters receive extra groups so that different
+	 * callers of group_cpus_evenly() produce different group-to-CPU
+	 * mappings. Without this, all devices get identical affinity
+	 * masks, causing IRQ stacking on CPUs assigned single-CPU groups.
+	 *
+	 * alloc_groups_to_nodes() distributes ngroups proportionally, but
+	 * rounding causes some clusters to get one more group than others.
+	 * The assignment is deterministic, so every device gets the same
+	 * mapping. Fix: collect the "extra" groups (above the per-cluster
+	 * minimum), then redistribute them starting from a rotated
+	 * position. The ncpus constraint (ngroups <= ncpus) is preserved
+	 * by checking each cluster's actual CPU count before adding.
+	 *
+	 * Note: after alloc_groups_to_nodes(), the union field holds
+	 * ngroups (not ncpus), so we recompute ncpus from cluster masks.
+	 */
+	if (ncluster > 1) {
+		unsigned int min_grps = UINT_MAX;
+		unsigned int total_extra = 0;
+		unsigned int start, stride;
+
+		for (i = 0; i < ncluster; i++) {
+			if (cluster_groups[i].ngroups < min_grps)
+				min_grps = cluster_groups[i].ngroups;
+		}
+
+		for (i = 0; i < ncluster; i++) {
+			if (cluster_groups[i].ngroups > min_grps) {
+				total_extra +=
+					cluster_groups[i].ngroups - min_grps;
+				cluster_groups[i].ngroups = min_grps;
+			}
+		}
+
+		/*
+		 * Redistribute extras using a stride to scatter them
+		 * across clusters.  With stride = ncluster / extras,
+		 * consecutive callers' extra sets overlap minimally
+		 * (e.g. max 2 overlap for 6 callers with 24 clusters
+		 * and 7 extras, vs 6 overlap with stride 1).
+		 */
+		start = spread_offset % ncluster;
+		stride = (total_extra > 0 && total_extra < ncluster) ?
+			 ncluster / total_extra : 1;
+
+		for (i = 0; i < ncluster && total_extra > 0; i++) {
+			unsigned int idx =
+				(start + i * stride) % ncluster;
+			unsigned int cap;
+
+			cap = cpumask_weight_and(clusters[cluster_groups[idx].id],
+						 node_cpumask);
+			if (cluster_groups[idx].ngroups < cap) {
+				cluster_groups[idx].ngroups++;
+				total_extra--;
+			}
+		}
+
+		/* Fallback: place remaining extras wherever they fit */
+		for (i = 0; i < ncluster && total_extra > 0; i++) {
+			unsigned int cap;
+
+			cap = cpumask_weight_and(clusters[cluster_groups[i].id],
+						 node_cpumask);
+			if (cluster_groups[i].ngroups < cap) {
+				cluster_groups[i].ngroups++;
+				total_extra--;
+			}
+		}
+	}
+
 	for (i = 0; i < ncluster; i++) {
 		struct node_groups *nv = &cluster_groups[i];
 
@@ -389,7 +475,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
 			continue;
 		WARN_ON_ONCE(nv->ngroups > nc);
 
-		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp);
+		assign_cpus_to_groups(nc, nmsk, nv, masks, curgrp, last_grp,
+				      spread_offset);
 	}
 
 	ret = true;
@@ -404,7 +491,8 @@ static bool __try_group_cluster_cpus(unsigned int ncpus,
 static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 			       cpumask_var_t *node_to_cpumask,
 			       const struct cpumask *cpu_mask,
-			       struct cpumask *nmsk, struct cpumask *masks)
+			       struct cpumask *nmsk, struct cpumask *masks,
+			       unsigned int spread_offset)
 {
 	unsigned int i, n, nodes, done = 0;
 	unsigned int last_grp = numgrps;
@@ -455,13 +543,14 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 		WARN_ON_ONCE(nv->ngroups > ncpus);
 
 		if (__try_group_cluster_cpus(ncpus, nv->ngroups, nmsk,
-					     masks, &curgrp, last_grp)) {
+					     masks, &curgrp, last_grp,
+					     spread_offset)) {
 			done += nv->ngroups;
 			continue;
 		}
 
 		assign_cpus_to_groups(ncpus, nmsk, nv, masks, &curgrp,
-				      last_grp);
+				      last_grp, spread_offset);
 		done += nv->ngroups;
 	}
 	kfree(node_groups);
@@ -488,6 +577,7 @@ static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
 struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 {
 	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
+	unsigned int spread_offset;
 	cpumask_var_t *node_to_cpumask;
 	cpumask_var_t nmsk, npresmsk;
 	int ret = -ENOMEM;
@@ -510,6 +600,8 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 	if (!masks)
 		goto fail_node_to_cpumask;
 
+	spread_offset = (unsigned int)atomic_fetch_inc(&group_spread_cnt);
+
 	build_node_to_cpumask(node_to_cpumask);
 
 	/*
@@ -528,7 +620,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 
 	/* grouping present CPUs first */
 	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
-				  npresmsk, nmsk, masks);
+				  npresmsk, nmsk, masks, spread_offset);
 	if (ret < 0)
 		goto fail_node_to_cpumask;
 	nr_present = ret;
@@ -545,7 +637,7 @@ struct cpumask *group_cpus_evenly(unsigned int numgrps, unsigned int *nummasks)
 		curgrp = nr_present;
 	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
 	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
-				  npresmsk, nmsk, masks);
+				  npresmsk, nmsk, masks, spread_offset);
 	if (ret >= 0)
 		nr_others = ret;
 

base-commit: c369299895a591d96745d6492d4888259b004a9e
-- 
2.43.0