From nobody Wed Dec 17 22:42:42 2025
From: Souradeep Chakrabarti
To: kys@microsoft.com, haiyangz@microsoft.com, wei.liu@kernel.org,
    decui@microsoft.com, davem@davemloft.net, edumazet@google.com,
    kuba@kernel.org, pabeni@redhat.com, longli@microsoft.com,
    sharmaajay@microsoft.com, leon@kernel.org, cai.huoqing@linux.dev,
    ssengar@linux.microsoft.com, vkuznets@redhat.com, tglx@linutronix.de,
    linux-hyperv@vger.kernel.org, netdev@vger.kernel.org,
    linux-kernel@vger.kernel.org, linux-rdma@vger.kernel.org
Cc: schakrabarti@microsoft.com, paulros@microsoft.com,
    Souradeep Chakrabarti
Subject: [PATCH V2 net-next] net: mana: Assigning IRQ affinity on HT cores
Date: Tue, 21 Nov 2023 05:54:37 -0800
Message-Id: <1700574877-6037-1-git-send-email-schakrabarti@linux.microsoft.com>
X-Mailer: git-send-email 1.8.3.1

The existing MANA design assigns an IRQ to every CPU, including the sibling
hyper-threads of a core. This causes multiple IRQs to be serviced by sibling
threads of the same physical core and may reduce network performance with RSS.

Improve performance by adhering to the RSS configuration, assigning IRQs to
distinct HT cores.

Signed-off-by: Souradeep Chakrabarti
---
V1 -> V2:
* Simplified the code by removing filter_mask_list and using avail_cpus.
* Addressed an infinite-loop issue when there are NUMA nodes with no CPUs.
* Start from the local NUMA node instead of node 0.
* Removed the use of BUG_ON.
* Took cpus_read_lock in the parent function so num_online_cpus() cannot
  change before the function finishes the affinity assignment.
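
For illustration only, not part of the patch: a small user-space model of the
spread the new irq_setup() helper aims for. The topology below is made up
(8 CPUs where CPU i and CPU i+4 are HT siblings; node 0 owns cores {0,1},
node 1 owns cores {2,3}; nvec = 6). The intent is one IRQ per physical core
of the starting node first, then that node's sibling threads, and only then
the next node.

#include <stdio.h>

#define NCPUS 8

/* Hypothetical topology: CPU i and CPU i+4 are HT siblings of one core;
 * node 0 owns cores {0,1}, node 1 owns cores {2,3}.
 */
static const int core_of[NCPUS] = { 0, 1, 2, 3, 0, 1, 2, 3 };
static const int node_of[NCPUS] = { 0, 0, 1, 1, 0, 0, 1, 1 };

int main(void)
{
	int used[NCPUS] = { 0 };
	int nvec = 6, assigned = 0;

	/* Walk the nodes starting from the (assumed) local node 0.  Within a
	 * node, the first pass hands one IRQ to the primary thread of each
	 * core; the second pass falls back to the remaining sibling threads.
	 */
	for (int node = 0; node < 2 && assigned < nvec; node++) {
		for (int pass = 0; pass < 2 && assigned < nvec; pass++) {
			for (int cpu = 0; cpu < NCPUS && assigned < nvec; cpu++) {
				if (used[cpu] || node_of[cpu] != node)
					continue;
				/* In this made-up layout CPUs 4-7 are the
				 * secondary siblings; skip them on pass 0.
				 */
				if (pass == 0 && cpu >= NCPUS / 2)
					continue;
				printf("IRQ %d -> CPU %d (core %d, node %d)\n",
				       assigned, cpu, core_of[cpu], node_of[cpu]);
				used[cpu] = 1;
				assigned++;
			}
		}
	}
	return 0;
}

With nvec = 6 this prints IRQs 0-3 on node 0 (cores 0 and 1 first, then their
sibling threads) and IRQs 4-5 on node 1's cores.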
---
 .../net/ethernet/microsoft/mana/gdma_main.c  | 134 ++++++++++++++++--
 1 file changed, 123 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c
index 6367de0c2c2e..8177502ffbd9 100644
--- a/drivers/net/ethernet/microsoft/mana/gdma_main.c
+++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c
@@ -1243,15 +1243,120 @@ void mana_gd_free_res_map(struct gdma_resource *r)
 	r->size = 0;
 }
 
+static int irq_setup(int *irqs, int nvec, int start_numa_node)
+{
+	unsigned int *core_id_list;
+	cpumask_var_t filter_mask, avail_cpus;
+	int i, core_count = 0, cpu_count = 0, err = 0, node_count = 0;
+	unsigned int cpu_first, cpu, irq_start, cores = 0, numa_node = start_numa_node;
+
+	if (!alloc_cpumask_var(&filter_mask, GFP_KERNEL) ||
+	    !alloc_cpumask_var(&avail_cpus, GFP_KERNEL)) {
+		err = -ENOMEM;
+		goto free_irq;
+	}
+	cpumask_copy(filter_mask, cpu_online_mask);
+	cpumask_copy(avail_cpus, cpu_online_mask);
+	/* count the number of cores
+	 */
+	for_each_cpu(cpu, filter_mask) {
+		cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+		cores++;
+	}
+	core_id_list = kcalloc(cores, sizeof(unsigned int), GFP_KERNEL);
+	cpumask_copy(filter_mask, cpu_online_mask);
+	/* initialize core_id_list array */
+	for_each_cpu(cpu, filter_mask) {
+		core_id_list[core_count] = cpu;
+		cpumask_andnot(filter_mask, filter_mask, topology_sibling_cpumask(cpu));
+		core_count++;
+	}
+
+	/* if number of cpus are equal to max_queues per port, then
+	 * one extra interrupt for the hardware channel communication.
+	 */
+	if (nvec - 1 == num_online_cpus()) {
+		irq_start = 1;
+		cpu_first = cpumask_first(cpu_online_mask);
+		irq_set_affinity_and_hint(irqs[0], cpumask_of(cpu_first));
+	} else {
+		irq_start = 0;
+	}
+
+	/* reset the core_count and num_node to 0.
+	 */
+	core_count = 0;
+
+	/* for each interrupt find the cpu of a particular
+	 * sibling set and if it belongs to the specific numa
+	 * then assign irq to it and clear the cpu bit from
+	 * the corresponding avail_cpus.
+	 * Increase the cpu_count for that node.
+	 * Once all cpus for a numa node is assigned, then
+	 * move to different numa node and continue the same.
+	 */
+	for (i = irq_start; i < nvec; ) {
+
+		/* check if the numa node has cpu or not
+		 * to avoid infinite loop.
+		 */
+		if (cpumask_empty(cpumask_of_node(numa_node))) {
+			numa_node++;
+			if (++node_count == num_online_nodes()) {
+				err = -EAGAIN;
+				goto free_irq;
+			}
+		}
+		cpu_first = cpumask_first_and(avail_cpus,
+					      topology_sibling_cpumask(core_id_list[core_count]));
+		if (cpu_first < nr_cpu_ids && cpu_to_node(cpu_first) == numa_node) {
+			irq_set_affinity_and_hint(irqs[i], cpumask_of(cpu_first));
+			cpumask_clear_cpu(cpu_first, avail_cpus);
+			cpu_count = cpu_count + 1;
+			i = i + 1;
+
+			/* checking if all the cpus are used from the
+			 * particular node.
+			 */
+			if (cpu_count == nr_cpus_node(numa_node)) {
+				numa_node = numa_node + 1;
+				if (numa_node == num_online_nodes())
+					numa_node = 0;
+
+				/* wrap around once numa nodes
+				 * are traversed.
+				 */
+				if (numa_node == start_numa_node) {
+					node_count = 0;
+					cpumask_copy(avail_cpus, cpu_online_mask);
+				}
+				cpu_count = 0;
+				core_count = 0;
+				continue;
+			}
+		}
+		if (++core_count == cores)
+			core_count = 0;
+	}
+free_irq:
+	free_cpumask_var(filter_mask);
+	free_cpumask_var(avail_cpus);
+	if (core_id_list)
+		kfree(core_id_list);
+	return err;
+}
+
 static int mana_gd_setup_irqs(struct pci_dev *pdev)
 {
-	unsigned int max_queues_per_port = num_online_cpus();
+	unsigned int max_queues_per_port;
 	struct gdma_context *gc = pci_get_drvdata(pdev);
 	struct gdma_irq_context *gic;
-	unsigned int max_irqs, cpu;
-	int nvec, irq;
+	unsigned int max_irqs;
+	int nvec, *irqs, irq;
 	int err, i = 0, j;
 
+	cpus_read_lock();
+	max_queues_per_port = num_online_cpus();
 	if (max_queues_per_port > MANA_MAX_NUM_QUEUES)
 		max_queues_per_port = MANA_MAX_NUM_QUEUES;
 
@@ -1261,6 +1366,11 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
 	nvec = pci_alloc_irq_vectors(pdev, 2, max_irqs, PCI_IRQ_MSIX);
 	if (nvec < 0)
 		return nvec;
+	irqs = kmalloc_array(nvec, sizeof(int), GFP_KERNEL);
+	if (!irqs) {
+		err = -ENOMEM;
+		goto free_irq_vector;
+	}
 
 	gc->irq_contexts = kcalloc(nvec, sizeof(struct gdma_irq_context),
 				   GFP_KERNEL);
@@ -1281,27 +1391,27 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
 			snprintf(gic->name, MANA_IRQ_NAME_SZ, "mana_q%d@pci:%s",
 				 i - 1, pci_name(pdev));
 
-		irq = pci_irq_vector(pdev, i);
-		if (irq < 0) {
-			err = irq;
+		irqs[i] = pci_irq_vector(pdev, i);
+		if (irqs[i] < 0) {
+			err = irqs[i];
 			goto free_irq;
 		}
 
-		err = request_irq(irq, mana_gd_intr, 0, gic->name, gic);
+		err = request_irq(irqs[i], mana_gd_intr, 0, gic->name, gic);
 		if (err)
 			goto free_irq;
-
-		cpu = cpumask_local_spread(i, gc->numa_node);
-		irq_set_affinity_and_hint(irq, cpumask_of(cpu));
 	}
 
+	err = irq_setup(irqs, nvec, gc->numa_node);
+	if (err)
+		goto free_irq;
 	err = mana_gd_alloc_res_map(nvec, &gc->msix_resource);
 	if (err)
 		goto free_irq;
 
 	gc->max_num_msix = nvec;
 	gc->num_msix_usable = nvec;
-
+	cpus_read_unlock();
 	return 0;
 
 free_irq:
@@ -1314,8 +1424,10 @@ static int mana_gd_setup_irqs(struct pci_dev *pdev)
 	}
 
 	kfree(gc->irq_contexts);
+	kfree(irqs);
 	gc->irq_contexts = NULL;
 free_irq_vector:
+	cpus_read_unlock();
 	pci_free_irq_vectors(pdev);
 	return err;
 }
-- 
2.34.1
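
Also for illustration only, not part of the patch: one way to spot-check the
resulting affinity after the driver loads is to read
/proc/irq/<n>/smp_affinity_list for the vectors listed under the mana_q*
names in /proc/interrupts. The IRQ numbers passed on the command line below
are hypothetical.

#include <stdio.h>

int main(int argc, char **argv)
{
	char path[64], buf[256];

	if (argc < 2) {
		fprintf(stderr, "usage: %s <irq-number>...\n", argv[0]);
		return 1;
	}
	for (int i = 1; i < argc; i++) {
		FILE *f;

		/* The affinity mask programmed via irq_set_affinity_and_hint()
		 * can be read back through procfs.
		 */
		snprintf(path, sizeof(path), "/proc/irq/%s/smp_affinity_list",
			 argv[i]);
		f = fopen(path, "r");
		if (!f) {
			perror(path);
			continue;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("IRQ %s -> CPUs %s", argv[i], buf);
		fclose(f);
	}
	return 0;
}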