From nobody Sun Feb 8 11:10:52 2026 Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.11]) (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) (No client certificate requested) by smtp.subspace.kernel.org (Postfix) with ESMTPS id 490781DA2FE for ; Wed, 4 Sep 2024 15:05:15 +0000 (UTC) Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.11 ARC-Seal: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725462317; cv=none; b=JFjk38Usjb1DzlI3+nbZSH8/vmTciYq7BJKYwtseORWkUMvDpdifbXLA2UxVmyZMzKm5TmlWRhSUhlt8PGYtaj+fBkdI6dKgCfBVOq97PKjgkr++EFcOB/ulVCXy0cns1CPoA42bnENR3OjtkrcVGFDBSznfETD1Fb5gK0goITA= ARC-Message-Signature: i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; t=1725462317; c=relaxed/simple; bh=Qz82WMMHJhgtm208/hRpQDVXvQckCENoit6Mf3U+4Os=; h=From:To:Cc:Subject:Date:Message-ID:MIME-Version; b=Y0x312VBwmv9Hau7HxeUoCxSiJVACY7ouyfCbEQIOAWMLGSj28TGAZEuPLGlmD/EVRmtECiPWUgM8U1gAal8qAFWnm51b3fKnz4Ohnm8CyJ1YtqD2A2zODu1QhP4PhCMrb7xKUmiSLlLvZbX4o6QPxIjw3+ybqadwQFuylMRCao= ARC-Authentication-Results: i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=VlSbwY+b; arc=none smtp.client-ip=192.198.163.11 Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com Authentication-Results: smtp.subspace.kernel.org; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="VlSbwY+b" DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=intel.com; i=@intel.com; q=dns/txt; s=Intel; t=1725462315; x=1756998315; h=from:to:cc:subject:date:message-id:mime-version: content-transfer-encoding; bh=Qz82WMMHJhgtm208/hRpQDVXvQckCENoit6Mf3U+4Os=; b=VlSbwY+bZOPThBVOcGNv60diUSNxxIZY/yLDzXODthe67n9KEuWau3ok 
sMNcKTVombhLbiuVPUQPIX4Z+i122/SPJ6Qku4qck0S8o6+D0JTj1Qxjb E/G9SplOFeOGEOmlDoiKHY1pqmBJxnN3IjE6WGwGtYX2clEZujoHWstyE aIUvUOowYvJvJNExRgQs2oJv+NriIRmDVZ4px9Wtb7sU9bsHnVHLq+zGj Y2uKdWUjtqe/DYJ53Y520Y4WO3oC+Zpu+ER6Dor5nQZ3mvPNz7rarwGef Bux3gMYZjW3Aty4y5Ba8d7LKnZCPbmCoDRe3x6uyaST5FoBWN4PQz+pMv Q==; X-CSE-ConnectionGUID: azYHbbFyQju2XMqha7N+vA== X-CSE-MsgGUID: J2DLUgCoRbO3ULOno5upRg== X-IronPort-AV: E=McAfee;i="6700,10204,11185"; a="34731264" X-IronPort-AV: E=Sophos;i="6.10,202,1719903600"; d="scan'208";a="34731264" Received: from fmviesa006.fm.intel.com ([10.60.135.146]) by fmvoesa105.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 04 Sep 2024 08:05:14 -0700 X-CSE-ConnectionGUID: fhNpQt96RN2LV2zHokamow== X-CSE-MsgGUID: tdRsXmv7S7qyfLk3zY0moQ== X-ExtLoop1: 1 X-IronPort-AV: E=Sophos;i="6.10,202,1719903600"; d="scan'208";a="64963930" Received: from sschumil-mobl2.ger.corp.intel.com (HELO tkristo-desk.intel.com) ([10.245.246.254]) by fmviesa006-auth.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 04 Sep 2024 08:05:12 -0700 From: Tero Kristo To: Cc: kbusch@kernel.org, linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org, sagi@grimberg.me, hch@lst.de, axboe@kernel.dk Subject: [RFC PATCH] nvme-pci: Add CPU latency pm-qos handling Date: Wed, 4 Sep 2024 18:04:47 +0300 Message-ID: <20240904150509.1830278-1-tero.kristo@linux.intel.com> X-Mailer: git-send-email 2.43.0 Precedence: bulk X-Mailing-List: linux-kernel@vger.kernel.org List-Id: List-Subscribe: List-Unsubscribe: MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Content-Type: text/plain; charset="utf-8" Add support for limiting CPU latency while NVMe I/O is running. When an NVMe I/O is started, a user-configurable CPU latency limit (if one is set) is put in place. The limit is removed after 3 ms of inactivity. The CPU latency limit is configured via a sysfs attribute, cpu_latency_us, under the NVMe device; a negative value (the default, -1) disables the limit. 
Signed-off-by: Tero Kristo --- See [1] for the initial series posted to block mailing list as an RFC. It was proposed to move this to a lower layer, i.e. NVMe driver. [1] https://lore.kernel.org/all/20240829075423.1345042-1-tero.kristo@linux.= intel.com/ drivers/nvme/host/pci.c | 95 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 90 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 710043086dff..ab9db4b02735 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -112,6 +113,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, boo= l shutdown); static void nvme_delete_io_queues(struct nvme_dev *dev); static void nvme_update_attrs(struct nvme_dev *dev); =20 +#define NVME_CPU_LATENCY_TIMEOUT_MS 3 + +struct nvme_cpu_latency_qos { + struct dev_pm_qos_request req; + struct delayed_work work; + unsigned long active; +}; + /* * Represents an NVM Express device. Each nvme_dev is a PCI function. */ @@ -141,6 +150,8 @@ struct nvme_dev { struct nvme_ctrl ctrl; u32 last_ps; bool hmb; + int cpu_latency; + struct nvme_cpu_latency_qos __percpu *cpu_latency_qos; =20 mempool_t *iod_mempool; =20 @@ -213,6 +224,7 @@ struct nvme_queue { __le32 *dbbuf_cq_db; __le32 *dbbuf_sq_ei; __le32 *dbbuf_cq_ei; + const struct cpumask *irq_aff_mask; struct completion delete_done; }; =20 @@ -470,6 +482,9 @@ static void nvme_pci_map_queues(struct blk_mq_tag_set *= set) */ static inline void nvme_write_sq_db(struct nvme_queue *nvmeq, bool write_s= q) { + struct nvme_dev *dev; + int cpu; + if (!write_sq) { u16 next_tail =3D nvmeq->sq_tail + 1; =20 @@ -483,6 +498,27 @@ static inline void nvme_write_sq_db(struct nvme_queue = *nvmeq, bool write_sq) nvmeq->dbbuf_sq_db, nvmeq->dbbuf_sq_ei)) writel(nvmeq->sq_tail, nvmeq->q_db); nvmeq->last_sq_tail =3D nvmeq->sq_tail; + + /* Kick CPU latency while updating queue. 
*/ + dev =3D nvmeq->dev; + if (!dev || dev->cpu_latency < 0) + return; + + for_each_cpu(cpu, nvmeq->irq_aff_mask) { + struct nvme_cpu_latency_qos *qos; + + qos =3D per_cpu_ptr(dev->cpu_latency_qos, cpu); + + qos->active =3D jiffies + msecs_to_jiffies(NVME_CPU_LATENCY_TIMEOUT_MS); + + if (dev_pm_qos_request_active(&qos->req)) + continue; + + dev_pm_qos_add_request(get_cpu_device(cpu), &qos->req, + DEV_PM_QOS_RESUME_LATENCY, + dev->cpu_latency); + schedule_delayed_work(&qos->work, msecs_to_jiffies(NVME_CPU_LATENCY_TIME= OUT_MS)); + } } =20 static inline void nvme_sq_copy_cmd(struct nvme_queue *nvmeq, @@ -1563,14 +1599,19 @@ static int queue_request_irq(struct nvme_queue *nvm= eq) { struct pci_dev *pdev =3D to_pci_dev(nvmeq->dev->dev); int nr =3D nvmeq->dev->ctrl.instance; + int ret; =20 if (use_threaded_interrupts) { - return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check, - nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + ret =3D pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check, + nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid); } else { - return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, - NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); + ret =3D pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq, + NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid); } + + nvmeq->irq_aff_mask =3D pci_irq_get_affinity(pdev, nvmeq->cq_vector); + + return ret; } =20 static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) @@ -2134,6 +2175,26 @@ static ssize_t hmb_store(struct device *dev, struct = device_attribute *attr, } static DEVICE_ATTR_RW(hmb); =20 +static ssize_t cpu_latency_us_show(struct device *dev, struct device_attri= bute *attr, + char *buf) +{ + struct nvme_dev *ndev =3D to_nvme_dev(dev_get_drvdata(dev)); + + return sysfs_emit(buf, "%d\n", ndev->cpu_latency); +} + +static ssize_t cpu_latency_us_store(struct device *dev, struct device_attr= ibute *attr, + const char *buf, size_t count) +{ + struct nvme_dev *ndev =3D to_nvme_dev(dev_get_drvdata(dev)); + + if 
(kstrtoint(buf, 10, &ndev->cpu_latency) < 0) + return -EINVAL; + + return count; +} +static DEVICE_ATTR_RW(cpu_latency_us); + static umode_t nvme_pci_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { @@ -2158,6 +2219,7 @@ static struct attribute *nvme_pci_attrs[] =3D { &dev_attr_cmbloc.attr, &dev_attr_cmbsz.attr, &dev_attr_hmb.attr, + &dev_attr_cpu_latency_us.attr, NULL, }; =20 @@ -2695,6 +2757,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) nvme_free_tagset(dev); put_device(dev->dev); kfree(dev->queues); + free_percpu(dev->cpu_latency_qos); kfree(dev); } =20 @@ -2934,6 +2997,17 @@ static unsigned long check_vendor_combination_bug(st= ruct pci_dev *pdev) return 0; } =20 +static void nvme_cpu_latency_work(struct work_struct *work) +{ + struct nvme_cpu_latency_qos *qos =3D + container_of(work, struct nvme_cpu_latency_qos, work.work); + if (time_after(jiffies, qos->active)) { + dev_pm_qos_remove_request(&qos->req); + } else { + schedule_delayed_work(&qos->work, msecs_to_jiffies(NVME_CPU_LATENCY_TIME= OUT_MS)); + } +} + static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -2941,6 +3015,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci= _dev *pdev, int node =3D dev_to_node(&pdev->dev); struct nvme_dev *dev; int ret =3D -ENOMEM; + int cpu; =20 dev =3D kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) @@ -2948,13 +3023,21 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct p= ci_dev *pdev, INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work); mutex_init(&dev->shutdown_lock); =20 + dev->cpu_latency_qos =3D alloc_percpu(struct nvme_cpu_latency_qos); + if (!dev->cpu_latency_qos) + goto out_free_dev; + for_each_possible_cpu(cpu) + INIT_DELAYED_WORK(per_cpu_ptr(&dev->cpu_latency_qos->work, cpu), + nvme_cpu_latency_work); + dev->cpu_latency =3D -1; + dev->nr_write_queues =3D write_queues; dev->nr_poll_queues =3D poll_queues; dev->nr_allocated_queues =3D nvme_max_io_queues(dev) + 1; 
dev->queues =3D kcalloc_node(dev->nr_allocated_queues, sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) - goto out_free_dev; + goto out_free_pm_qos; =20 dev->dev =3D get_device(&pdev->dev); =20 @@ -3000,6 +3083,8 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci= _dev *pdev, out_put_device: put_device(dev->dev); kfree(dev->queues); +out_free_pm_qos: + free_percpu(dev->cpu_latency_qos); out_free_dev: kfree(dev); return ERR_PTR(ret); --=20 2.43.1