From nobody Mon Feb 9 05:55:08 2026
From: Ping Gan
To: hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org
Cc: ping.gan@dell.com, Ping Gan
Subject: [PATCH 1/2] nvmet-rdma: add polling cq task for nvmet-rdma
Date: Wed, 26 Jun 2024 16:28:22 +0800
Message-Id: <20240626082823.48326-2-jacky_gam_2001@163.com>
In-Reply-To: <20240626082823.48326-1-jacky_gam_2001@163.com>
References: <20240626082823.48326-1-jacky_gam_2001@163.com>

Add dedicated CQ polling tasks to nvmet-rdma as an alternative to the
kworker-based completion path. The feature is controlled by three module
parameters: pt_num sets the number of polling tasks, pt_affinity_core sets
the first CPU core the tasks are pinned to (-1 for any online CPU), and
idle_poll_period_usecs sets how long a task keeps polling after the last
completion before it goes idle.
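A minimal usage sketch (the parameter names come from the patch below; the
values and machine assumptions are only illustrative):

    modprobe nvmet-rdma pt_num=2 pt_affinity_core=8 idle_poll_period_usecs=1000

On a machine with enough online CPUs this creates two polling kthreads
pinned to cores 8 and 9; each keeps polling its assigned CQs for about
idle_poll_period_usecs (rounded to jiffies) after the last completion
before sleeping on its waitqueue. If none of the three parameters is set,
the driver keeps the existing IB_POLL_WORKQUEUE completion path.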
Signed-off-by: Ping Gan --- drivers/nvme/target/rdma.c | 331 ++++++++++++++++++++++++++++++++++++- 1 file changed, 326 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 1eff8ca6a5f1..83c03e088bf9 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -26,6 +26,28 @@ #include #include "nvmet.h" =20 +/* Define a time period (in usecs) that poll thread shall sample an activa= ted + * queue before determining it to be idle. + */ +static int idle_poll_period_usecs; +module_param(idle_poll_period_usecs, int, 0644); +MODULE_PARM_DESC(idle_poll_period_usecs, + "nvmet rdma cq thread poll till idle time period in usecs"); + +/* Define the target rdma cq polling thread's affinity cpu core. + */ +static int pt_affinity_core =3D -2; +module_param(pt_affinity_core, int, 0644); +MODULE_PARM_DESC(pt_affinity_core, + "target rdma cq polling thread's affinity core, -1 for all online cpu= s"); + +/* Define the polling thread number. + */ +static int pt_num; +module_param(pt_num, int, 0644); +MODULE_PARM_DESC(pt_num, "target rdma cq polling thread number"); +bool rdma_polling_cq_task; + /* * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data */ @@ -39,6 +61,23 @@ =20 #define NVMET_RDMA_BACKLOG 128 =20 +struct nvmet_rdma_pt_data { + struct wait_queue_head wait_head; + struct mutex queue_lock; + struct list_head pt_admin_queue_list; + struct list_head pt_io_queue_list; + u32 thread_idle; + int affinity_cpu; + pid_t task_pid; + pid_t task_tgid; + atomic64_t admin_queue_cnt; + atomic64_t io_queue_cnt; + struct task_struct *thread; + struct mutex thread_lock; +}; + +struct nvmet_rdma_pt_data **rdma_pt_data; + struct nvmet_rdma_srq; =20 struct nvmet_rdma_cmd { @@ -114,6 +153,10 @@ struct nvmet_rdma_queue { int send_queue_size; =20 struct list_head queue_list; + //for cq poll thread + struct nvmet_rdma_pt_data *pt_data; + struct list_head pt_list_entry; + atomic64_t req_cnt; }; =20 struct nvmet_rdma_port { @@ -176,6 +219,59 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_devi= ce *ndev, =20 static const struct nvmet_fabrics_ops nvmet_rdma_ops; =20 +static void nvmet_rdma_wakeup_poll_thread(struct nvmet_rdma_queue *queue) +{ + smp_mb(); + if (queue->pt_data && waitqueue_active(&queue->pt_data->wait_head)) + wake_up(&queue->pt_data->wait_head); +} + +static void nvmet_rdma_ib_cq_handler(struct ib_cq *cq, void *private) +{ + struct nvmet_rdma_queue *queue =3D (struct nvmet_rdma_queue *)cq->cq_cont= ext; + atomic64_set(&queue->req_cnt, 1); + nvmet_rdma_wakeup_poll_thread(queue); +} + +static int nvmet_rdma_get_pcq_task(bool io_queue) +{ + int i =3D 1, ret =3D 0; + s64 min, tmp; + struct nvmet_rdma_pt_data *tptd; + + tptd =3D rdma_pt_data[0]; + if (io_queue) + min =3D atomic64_read(&tptd->io_queue_cnt); + else + min =3D atomic64_read(&tptd->admin_queue_cnt); + while (i < pt_num) { + tptd =3D rdma_pt_data[i]; + if (io_queue) + tmp =3D atomic64_read(&tptd->io_queue_cnt); + else + tmp =3D atomic64_read(&tptd->admin_queue_cnt); + if (min > tmp) { + min =3D tmp; + ret =3D i; + } + i++; + } + tptd =3D rdma_pt_data[ret]; + if (io_queue) + atomic64_inc(&tptd->io_queue_cnt); + else + atomic64_inc(&tptd->admin_queue_cnt); + return ret; +} + +static inline void nvmet_rdma_pq_clear_req(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_pt_data *tptd =3D queue->pt_data; + mutex_lock(&tptd->queue_lock); + list_del(&queue->pt_list_entry); + mutex_unlock(&tptd->queue_lock); +} + static int srq_size_set(const char *val, const struct 
kernel_param *kp) { int n =3D 0, ret; @@ -507,6 +603,10 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_devi= ce *ndev, =20 if (unlikely(ret)) pr_err("post_recv cmd failed\n"); + else if (rdma_polling_cq_task) { + atomic64_set(&cmd->queue->req_cnt, 1); + nvmet_rdma_wakeup_poll_thread(cmd->queue); + } =20 return ret; } @@ -740,6 +840,9 @@ static void nvmet_rdma_queue_response(struct nvmet_req = *req) if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); + } else if (rdma_polling_cq_task) { + atomic64_set(&rsp->queue->req_cnt, 1); + nvmet_rdma_wakeup_poll_thread(rsp->queue); } } =20 @@ -816,6 +919,9 @@ static void nvmet_rdma_write_data_done(struct ib_cq *cq= , struct ib_wc *wc) if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) { pr_err("sending cmd response failed\n"); nvmet_rdma_release_rsp(rsp); + } else if (rdma_polling_cq_task) { + atomic64_set(&rsp->queue->req_cnt, 1); + nvmet_rdma_wakeup_poll_thread(rsp->queue); } } =20 @@ -957,6 +1063,10 @@ static bool nvmet_rdma_execute_command(struct nvmet_r= dma_rsp *rsp) if (rdma_rw_ctx_post(&rsp->rw, queue->qp, queue->cm_id->port_num, &rsp->read_cqe, NULL)) nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + if (rdma_polling_cq_task) { + atomic64_set(&queue->req_cnt, 1); + nvmet_rdma_wakeup_poll_thread(queue); + } } else { rsp->req.execute(&rsp->req); } @@ -1259,8 +1369,16 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_r= dma_queue *queue) */ nr_cqe =3D queue->recv_queue_size + 2 * queue->send_queue_size; =20 - queue->cq =3D ib_cq_pool_get(ndev->device, nr_cqe + 1, - queue->comp_vector, IB_POLL_WORKQUEUE); + if (rdma_polling_cq_task) { + queue->cq =3D ib_alloc_cq(ndev->device, queue, nr_cqe + 1, + queue->comp_vector, IB_POLL_DIRECT); + queue->cq->comp_handler =3D nvmet_rdma_ib_cq_handler; + ib_req_notify_cq(queue->cq, IB_CQ_NEXT_COMP); + } else { + queue->cq =3D ib_cq_pool_get(ndev->device, nr_cqe + 1, + queue->comp_vector, IB_POLL_WORKQUEUE); + } + if (IS_ERR(queue->cq)) { ret =3D PTR_ERR(queue->cq); pr_err("failed to create CQ cqe=3D %d ret=3D %d\n", @@ -1331,8 +1449,11 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet= _rdma_queue *queue) if (queue->cm_id) rdma_destroy_id(queue->cm_id); ib_destroy_qp(queue->qp); - ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * - queue->send_queue_size + 1); + if (rdma_polling_cq_task) + ib_free_cq(queue->cq); + else + ib_cq_pool_put(queue->cq, queue->recv_queue_size + 2 * + queue->send_queue_size + 1); } =20 static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) @@ -1340,6 +1461,13 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_= queue *queue) pr_debug("freeing queue %d\n", queue->idx); =20 nvmet_sq_destroy(&queue->nvme_sq); + if (rdma_polling_cq_task) { + nvmet_rdma_pq_clear_req(queue); + if (queue->host_qid > 0) + atomic64_dec(&queue->pt_data->io_queue_cnt); + else + atomic64_dec(&queue->pt_data->admin_queue_cnt); + } =20 nvmet_rdma_destroy_queue_ib(queue); if (!queue->nsrq) { @@ -1600,6 +1728,19 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_i= d *cm_id, goto free_queue; } =20 + if (rdma_polling_cq_task) { + bool io_queue =3D queue->host_qid > 0?1:0; + ret =3D nvmet_rdma_get_pcq_task(io_queue); + queue->pt_data =3D rdma_pt_data[ret]; + mutex_lock(&queue->pt_data->queue_lock); + if (io_queue) + list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_io_queue_list); + else + list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_admin_queue_li= st); + 
mutex_unlock(&queue->pt_data->queue_lock); + nvmet_rdma_wakeup_poll_thread(queue); + } + mutex_lock(&nvmet_rdma_queue_mutex); list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); mutex_unlock(&nvmet_rdma_queue_mutex); @@ -2082,9 +2223,156 @@ static struct ib_client nvmet_rdma_ib_client =3D { .remove =3D nvmet_rdma_remove_one }; =20 +#define RDMA_POLL_BUDGET 8 +static int __nvmet_rdma_poll_thread(struct nvmet_rdma_pt_data *rptd) +{ + int rcv_ret =3D 0; + bool need_repoll =3D false; + struct nvmet_rdma_queue *qreq, *tmp; + + mutex_lock(&rptd->queue_lock); + if (!list_empty(&rptd->pt_admin_queue_list)) { + list_for_each_entry_safe(qreq, tmp, &rptd->pt_admin_queue_list, pt_list_= entry) { + if (atomic64_read(&qreq->req_cnt) > 0) { + rcv_ret =3D ib_process_cq_direct(qreq->cq, RDMA_POLL_BUDGET); + if (rcv_ret > 0) + need_repoll =3D true; + else { + atomic64_set(&qreq->req_cnt, 0); + ib_req_notify_cq(qreq->cq, IB_CQ_NEXT_COMP); + } + } + } + } + if (!list_empty(&rptd->pt_io_queue_list)) { + list_for_each_entry_safe(qreq, tmp, &rptd->pt_io_queue_list, pt_list_ent= ry) { + if (atomic64_read(&qreq->req_cnt) > 0) { + rcv_ret =3D ib_process_cq_direct(qreq->cq, RDMA_POLL_BUDGET); + if (rcv_ret > 0) + need_repoll =3D true; + else { + atomic64_set(&qreq->req_cnt, 0); + ib_req_notify_cq(qreq->cq, IB_CQ_NEXT_COMP); + } + } + } + } + mutex_unlock(&rptd->queue_lock); + if (need_repoll) + return 1; + else + return 0; +} + +static int nvmet_rdma_poll_thread(void *data) +{ + struct nvmet_rdma_pt_data *rptd =3D data; + unsigned long timeout =3D 0; + DEFINE_WAIT(wait); + + if (rptd->affinity_cpu !=3D -1) + set_cpus_allowed_ptr(current, cpumask_of(rptd->affinity_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |=3D PF_NO_SETAFFINITY; + mutex_lock(&rptd->thread_lock); + rptd->task_pid =3D current->pid; + rptd->task_tgid =3D current->tgid; + + while (!kthread_should_stop()) { + int ret =3D __nvmet_rdma_poll_thread(rptd); + if (ret > 0 || !time_after(jiffies, timeout)) { + cond_resched(); + if (ret > 0) + timeout =3D jiffies + rptd->thread_idle; + continue; + } + prepare_to_wait(&rptd->wait_head, &wait, TASK_INTERRUPTIBLE); + mutex_unlock(&rptd->thread_lock); + schedule(); + mutex_lock(&rptd->thread_lock); + finish_wait(&rptd->wait_head, &wait); + timeout =3D jiffies + rptd->thread_idle; + } + rptd->thread =3D NULL; + rptd->task_pid =3D -1; + rptd->task_tgid =3D -1; + mutex_unlock(&rptd->thread_lock); + kthread_complete_and_exit(NULL, 0); + //do_exit(0); +} + static int __init nvmet_rdma_init(void) { - int ret; + int ret, i; + char task_name[TASK_COMM_LEN]; + struct task_struct *task; + + rdma_polling_cq_task =3D false; + if ((pt_affinity_core >=3D -1 && pt_affinity_core < (int)nr_cpu_ids) + || pt_num > 0 || idle_poll_period_usecs > 0) { + if (pt_num =3D=3D 0) + pt_num =3D 1; + else if (pt_num < 0) { + printk(KERN_ERR "bad parameter for task num\n"); + ret =3D -EINVAL; + return ret; + } + if (pt_affinity_core =3D=3D -2) + pt_affinity_core =3D -1; + if (pt_affinity_core < -1 || + pt_affinity_core >=3D (int)nr_cpu_ids) { + printk(KERN_ERR "bad parameter for affinity core \n"); + ret =3D -EINVAL; + return ret; + } + if (idle_poll_period_usecs =3D=3D 0) + idle_poll_period_usecs =3D 1000; // default 1ms + else if (idle_poll_period_usecs < 0) { + printk(KERN_ERR "bad parameter for idle poll period\n"); + ret =3D -EINVAL; + return ret; + } + rdma_pt_data =3D kmalloc(pt_num * sizeof(void *), GFP_KERNEL); + if (!rdma_pt_data) + return -ENOMEM; + + for (i =3D 0; i < pt_num; i++) { + 
+			rdma_pt_data[i] = kmalloc(sizeof(struct nvmet_rdma_pt_data), GFP_KERNEL);
+			if (!rdma_pt_data[i]) {
+				ret = -ENOMEM;
+				goto err_free_pqtd;
+			}
+		}
+		for (i = 0; i < pt_num; i++) {
+			mutex_init(&rdma_pt_data[i]->thread_lock);
+			rdma_pt_data[i]->thread_idle = usecs_to_jiffies(idle_poll_period_usecs);
+			mutex_init(&rdma_pt_data[i]->queue_lock);
+			INIT_LIST_HEAD(&rdma_pt_data[i]->pt_admin_queue_list);
+			INIT_LIST_HEAD(&rdma_pt_data[i]->pt_io_queue_list);
+			init_waitqueue_head(&rdma_pt_data[i]->wait_head);
+			atomic64_set(&rdma_pt_data[i]->admin_queue_cnt, 0);
+			atomic64_set(&rdma_pt_data[i]->io_queue_cnt, 0);
+			if (pt_affinity_core != -1)
+				rdma_pt_data[i]->affinity_cpu = (pt_affinity_core + (int)i) %
+								((int)nr_cpu_ids);
+			else
+				rdma_pt_data[i]->affinity_cpu = -1;
+			snprintf(task_name, TASK_COMM_LEN, "nvmet-rdma-pt%u", i);
+			task = kthread_create(nvmet_rdma_poll_thread, (void *)rdma_pt_data[i], task_name);
+			if (IS_ERR(task)) {
+				ret = PTR_ERR(task);
+				goto err_free_pt_data;
+			}
+			set_user_nice(task, -20);
+			mutex_lock(&rdma_pt_data[i]->thread_lock);
+			rdma_pt_data[i]->thread = task;
+			mutex_unlock(&rdma_pt_data[i]->thread_lock);
+		}
+		rdma_polling_cq_task = true;
+		for (i = 0; i < pt_num; i++)
+			wake_up_process(rdma_pt_data[i]->thread);
+	}

 	ret = ib_register_client(&nvmet_rdma_ib_client);
 	if (ret)
@@ -2098,15 +2386,48 @@ static int __init nvmet_rdma_init(void)

 err_ib_client:
 	ib_unregister_client(&nvmet_rdma_ib_client);
+err_free_pt_data:
+	if ((pt_affinity_core >= -1 && pt_affinity_core < (int)nr_cpu_ids)
+		|| pt_num > 0 || idle_poll_period_usecs > 0) {
+		while (i > 0) {
+			kthread_stop(rdma_pt_data[i-1]->thread);
+			i--;
+		}
+		i = pt_num;
+err_free_pqtd:
+		while (i > 0) {
+			kfree(rdma_pt_data[i-1]);
+			i--;
+		}
+		kfree(rdma_pt_data);
+	}
 	return ret;
 }

 static void __exit nvmet_rdma_exit(void)
 {
+	int i = 0;
+
+	if (rdma_polling_cq_task) {
+		for (i = 0; i < pt_num; i++) {
+			mutex_lock(&rdma_pt_data[i]->thread_lock);
+			if (rdma_pt_data[i]->thread) {
+				mutex_unlock(&rdma_pt_data[i]->thread_lock);
+				kthread_stop(rdma_pt_data[i]->thread);
+			} else {
+				mutex_unlock(&rdma_pt_data[i]->thread_lock);
+			}
+		}
+	}
 	nvmet_unregister_transport(&nvmet_rdma_ops);
 	ib_unregister_client(&nvmet_rdma_ib_client);
 	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
 	ida_destroy(&nvmet_rdma_queue_ida);
+	if (rdma_polling_cq_task) {
+		for (i = 0; i < pt_num; i++)
+			kfree(rdma_pt_data[i]);
+		kfree(rdma_pt_data);
+	}
 }

 module_init(nvmet_rdma_init);
-- 
2.26.2

From nobody Mon Feb 9 05:55:08 2026
From: Ping Gan
To: hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org
Cc: ping.gan@dell.com, Ping Gan
Subject: [PATCH 2/2] nvmet-tcp: add polling task for nvmet-tcp
Date: Wed, 26 Jun 2024 16:28:23 +0800
Message-Id: <20240626082823.48326-3-jacky_gam_2001@163.com>
In-Reply-To: <20240626082823.48326-1-jacky_gam_2001@163.com>
References: <20240626082823.48326-1-jacky_gam_2001@163.com>

Add dedicated polling tasks to nvmet-tcp to handle the TCP I/O that is
otherwise processed by the kworker-based io_work path. The feature is
controlled by three module parameters: pt_num sets the number of polling
tasks, pt_affinity_core sets the first CPU core the tasks are pinned to
(-1 for any online CPU), and the existing idle_poll_period_usecs now also
sets how long a polling task keeps running after its last activity before
it goes idle. An example invocation is given after the patch.

Signed-off-by: Ping Gan
---
 drivers/nvme/target/tcp.c | 356 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 339 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 5bff0d5464d1..aa6d90f8d11c 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -73,6 +73,20 @@ device_param_cb(idle_poll_period_usecs, &set_param_ops,
 MODULE_PARM_DESC(idle_poll_period_usecs,
 	"nvmet tcp io_work poll till idle time period in usecs: Default 0");

+/* Define the target tcp polling thread's affinity cpu core.
+ */
+static int pt_affinity_core = -2;
+module_param(pt_affinity_core, int, 0644);
+MODULE_PARM_DESC(pt_affinity_core,
+	"target tcp polling thread's affinity core, -1 for all online cpus");
+
+/* Define the polling thread number.
+ */ +static int pt_num; +module_param(pt_num, int, 0644); +MODULE_PARM_DESC(pt_num, "target tcp polling thread number"); +static bool tcp_polling_task; + #ifdef CONFIG_NVME_TARGET_TCP_TLS /* * TLS handshake timeout @@ -106,6 +120,25 @@ enum { NVMET_TCP_F_INIT_FAILED =3D (1 << 0), }; =20 +struct nvmet_tcp_pt_data { + struct wait_queue_head wait_head; + struct mutex queue_lock; + struct list_head pt_queue_list; + struct list_head pt_io_queue_list; + struct list_head addon_queue_list; + struct mutex addon_queue_lock; + u32 thread_idle; + int affinity_cpu; + pid_t task_pid; + pid_t task_tgid; + atomic64_t queue_cnt; + atomic64_t io_queue_cnt; + struct task_struct *thread; + struct mutex thread_lock; +}; + +struct nvmet_tcp_pt_data **tcp_pt_data; + struct nvmet_tcp_cmd { struct nvmet_tcp_queue *queue; struct nvmet_req req; @@ -150,6 +183,9 @@ struct nvmet_tcp_queue { struct socket *sock; struct nvmet_tcp_port *port; struct work_struct io_work; + struct nvmet_tcp_pt_data *pt_data; + struct list_head pt_list_entry; + atomic64_t req_cnt; struct nvmet_cq nvme_cq; struct nvmet_sq nvme_sq; struct kref kref; @@ -218,6 +254,46 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops; static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); =20 +static void nvmet_tcp_wakeup_poll_thread(struct nvmet_tcp_queue *queue) +{ + smp_mb(); + if (queue->pt_data && waitqueue_active(&queue->pt_data->wait_head)) + wake_up(&queue->pt_data->wait_head); +} + +static int nvmet_tcp_get_polling_task(void) +{ + int i =3D 1, ret =3D 0; + s64 min, tmp, totalq_min, totalq_tmp; + struct nvmet_tcp_pt_data *tptd; + + tptd =3D tcp_pt_data[0]; + min =3D atomic64_read(&tptd->io_queue_cnt); + totalq_min =3D atomic64_read(&tptd->queue_cnt); + while (i < pt_num) { + tptd =3D tcp_pt_data[i]; + tmp =3D atomic64_read(&tptd->io_queue_cnt); + totalq_tmp =3D atomic64_read(&tptd->queue_cnt); + if (min > tmp || (min =3D=3D tmp && totalq_min > totalq_tmp)) { + min =3D tmp; + totalq_min =3D totalq_tmp; + ret =3D i; + } + i++; + } + tptd =3D tcp_pt_data[ret]; + atomic64_inc(&tptd->queue_cnt); + return ret; +} + +static inline void nvmet_tcp_pq_clear_req(struct nvmet_tcp_queue *queue) +{ + struct nvmet_tcp_pt_data *tptd =3D queue->pt_data; + mutex_lock(&tptd->queue_lock); + list_del(&queue->pt_list_entry); + mutex_unlock(&tptd->queue_lock); +} + static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) { @@ -590,7 +666,12 @@ static void nvmet_tcp_queue_response(struct nvmet_req = *req) } =20 llist_add(&cmd->lentry, &queue->resp_list); - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); + if (tcp_polling_task) { + atomic64_set(&queue->req_cnt, 1); + nvmet_tcp_wakeup_poll_thread(queue); + } else { + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work); + } } =20 static void nvmet_tcp_execute_request(struct nvmet_tcp_cmd *cmd) @@ -1598,13 +1679,21 @@ static void nvmet_tcp_release_queue_work(struct wor= k_struct *w) =20 nvmet_tcp_restore_socket_callbacks(queue); cancel_delayed_work_sync(&queue->tls_handshake_tmo_work); - cancel_work_sync(&queue->io_work); + if (!tcp_polling_task) + cancel_work_sync(&queue->io_work); + else { + nvmet_tcp_pq_clear_req(queue); + if (queue->nvme_sq.qid !=3D 0) + atomic64_dec(&queue->pt_data->io_queue_cnt); + atomic64_dec(&queue->pt_data->queue_cnt); + } /* stop accepting incoming data */ queue->rcv_state =3D NVMET_TCP_RECV_ERR; =20 nvmet_tcp_uninit_data_in_cmds(queue); 
nvmet_sq_destroy(&queue->nvme_sq); - cancel_work_sync(&queue->io_work); + if (!tcp_polling_task) + cancel_work_sync(&queue->io_work); nvmet_tcp_free_cmd_data_in_buffers(queue); /* ->sock will be released by fput() */ fput(queue->sock->file); @@ -1627,9 +1716,15 @@ static void nvmet_tcp_data_ready(struct sock *sk) if (likely(queue)) { if (queue->data_ready) queue->data_ready(sk); - if (queue->state !=3D NVMET_TCP_Q_TLS_HANDSHAKE) - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, - &queue->io_work); + if (queue->state !=3D NVMET_TCP_Q_TLS_HANDSHAKE) { + if (tcp_polling_task) { + atomic64_set(&queue->req_cnt, 1); + nvmet_tcp_wakeup_poll_thread(queue); + } else { + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, + &queue->io_work); + } + } } read_unlock_bh(&sk->sk_callback_lock); } @@ -1650,7 +1745,12 @@ static void nvmet_tcp_write_space(struct sock *sk) =20 if (sk_stream_is_writeable(sk)) { clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags); - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + if (tcp_polling_task) { + atomic64_set(&queue->req_cnt, 1); + nvmet_tcp_wakeup_poll_thread(queue); + } else { + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + } } out: read_unlock_bh(&sk->sk_callback_lock); @@ -1731,7 +1831,19 @@ static int nvmet_tcp_set_queue_sock(struct nvmet_tcp= _queue *queue) sock->sk->sk_write_space =3D nvmet_tcp_write_space; if (idle_poll_period_usecs) nvmet_tcp_arm_queue_deadline(queue); - queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + if (tcp_polling_task) { + int task_index =3D nvmet_tcp_get_polling_task(); + queue->pt_data =3D tcp_pt_data[task_index]; + write_unlock_bh(&sock->sk->sk_callback_lock); + mutex_lock(&queue->pt_data->addon_queue_lock); + list_add_tail(&queue->pt_list_entry, &queue->pt_data->addon_queue_list); + mutex_unlock(&queue->pt_data->addon_queue_lock); + write_lock_bh(&sock->sk->sk_callback_lock); + atomic64_set(&queue->req_cnt, 1); + nvmet_tcp_wakeup_poll_thread(queue); + } else { + queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &queue->io_work); + } } write_unlock_bh(&sock->sk->sk_callback_lock); =20 @@ -1883,7 +1995,8 @@ static void nvmet_tcp_alloc_queue(struct nvmet_tcp_po= rt *port, } =20 INIT_WORK(&queue->release_work, nvmet_tcp_release_queue_work); - INIT_WORK(&queue->io_work, nvmet_tcp_io_work); + if (!tcp_polling_task) + INIT_WORK(&queue->io_work, nvmet_tcp_io_work); kref_init(&queue->kref); queue->sock =3D newsock; queue->port =3D port; @@ -2146,6 +2259,15 @@ static u16 nvmet_tcp_install_queue(struct nvmet_sq *= sq) } =20 queue->nr_cmds =3D sq->size * 2; + if (tcp_polling_task) { + if (queue->state =3D=3D NVMET_TCP_Q_DISCONNECTING) + return 0; // if release worker schedule, directly return + if (sq->qid !=3D 0) { + atomic64_inc(&queue->pt_data->io_queue_cnt); + list_del(&queue->pt_list_entry); + list_add_tail(&queue->pt_list_entry, &queue->pt_data->pt_io_queue_list); + } + } if (nvmet_tcp_alloc_cmds(queue)) return NVME_SC_INTERNAL; return 0; @@ -2193,14 +2315,181 @@ static const struct nvmet_fabrics_ops nvmet_tcp_op= s =3D { .host_traddr =3D nvmet_tcp_host_port_addr, }; =20 -static int __init nvmet_tcp_init(void) +static int __nvmet_tcp_poll_thread(struct nvmet_tcp_pt_data *tptd) +{ + int rcv_ret =3D 0, snd_ret =3D 0, ops =3D 0; + bool need_repoll =3D false; + struct nvmet_tcp_queue *qreq, *tmp; + + mutex_lock(&tptd->addon_queue_lock); + mutex_lock(&tptd->queue_lock); + list_splice_tail_init(&tptd->addon_queue_list, &tptd->pt_queue_list); + mutex_unlock(&tptd->queue_lock); + 
mutex_unlock(&tptd->addon_queue_lock); + + mutex_lock(&tptd->queue_lock); + if (!list_empty(&tptd->pt_queue_list)) { + list_for_each_entry_safe(qreq, tmp, &tptd->pt_queue_list, pt_list_entry)= { + if (atomic64_read(&qreq->req_cnt) > 0) { + rcv_ret =3D nvmet_tcp_try_recv(qreq, NVMET_TCP_RECV_BUDGET, &ops); + if (rcv_ret < 0) { + atomic64_set(&qreq->req_cnt, 0); + continue; + } + if (rcv_ret > 0) + need_repoll =3D true; + snd_ret =3D nvmet_tcp_try_send(qreq, NVMET_TCP_SEND_BUDGET, &ops); + if (snd_ret < 0) { + atomic64_set(&qreq->req_cnt, 0); + continue; + } + if (snd_ret > 0) + need_repoll =3D true; + else if (rcv_ret =3D=3D 0) + atomic64_set(&qreq->req_cnt, 0); + } + } + } + if (!list_empty(&tptd->pt_io_queue_list)) { + list_for_each_entry_safe(qreq, tmp, &tptd->pt_io_queue_list, pt_list_ent= ry) { + if (atomic64_read(&qreq->req_cnt) > 0) { + rcv_ret =3D nvmet_tcp_try_recv(qreq, NVMET_TCP_RECV_BUDGET, &ops); + if (rcv_ret < 0) { + atomic64_set(&qreq->req_cnt, 0); + continue; + } + if (rcv_ret > 0) + need_repoll =3D true; + snd_ret =3D nvmet_tcp_try_send(qreq, NVMET_TCP_SEND_BUDGET, &ops); + if (snd_ret < 0) { + atomic64_set(&qreq->req_cnt, 0); + continue; + } + if (snd_ret > 0) + need_repoll =3D true; + else if (rcv_ret =3D=3D 0) + atomic64_set(&qreq->req_cnt, 0); + } + } + } + mutex_unlock(&tptd->queue_lock); + if (need_repoll) + return 1; + else + return 0; +} + +static int nvmet_tcp_poll_thread(void *data) { - int ret; + struct nvmet_tcp_pt_data *tptd =3D data; + unsigned long timeout =3D 0; + DEFINE_WAIT(wait); =20 - nvmet_tcp_wq =3D alloc_workqueue("nvmet_tcp_wq", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); - if (!nvmet_tcp_wq) - return -ENOMEM; + if (tptd->affinity_cpu !=3D -1) + set_cpus_allowed_ptr(current, cpumask_of(tptd->affinity_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |=3D PF_NO_SETAFFINITY; + mutex_lock(&tptd->thread_lock); + tptd->task_pid =3D current->pid; + tptd->task_tgid =3D current->tgid; + + while (!kthread_should_stop()) { + int ret =3D __nvmet_tcp_poll_thread(tptd); + if (ret > 0 || !time_after(jiffies, timeout)) { + cond_resched(); + if (ret > 0) + timeout =3D jiffies + tptd->thread_idle; + continue; + } + prepare_to_wait(&tptd->wait_head, &wait, TASK_INTERRUPTIBLE); + mutex_unlock(&tptd->thread_lock); + schedule(); + mutex_lock(&tptd->thread_lock); + finish_wait(&tptd->wait_head, &wait); + timeout =3D jiffies + tptd->thread_idle; + } + tptd->thread =3D NULL; + tptd->task_pid =3D -1; + tptd->task_tgid =3D -1; + mutex_unlock(&tptd->thread_lock); + kthread_complete_and_exit(NULL, 0); + //do_exit(0); +} + +static int __init nvmet_tcp_init(void) +{ + int ret, i =3D 0; + char task_name[TASK_COMM_LEN]; + struct task_struct *task; + + tcp_polling_task =3D false; + if ((pt_affinity_core >=3D -1 && + pt_affinity_core < (int)nr_cpu_ids) || pt_num > 0) { + if (pt_num =3D=3D 0) + pt_num =3D 1; + else if (pt_num < 0) { + printk(KERN_ERR "bad parameter for task num\n"); + ret =3D -EINVAL; + return ret; + } + if (pt_affinity_core =3D=3D -2) + pt_affinity_core =3D -1; + if (pt_affinity_core < -1 || + pt_affinity_core >=3D (int)nr_cpu_ids) { + printk(KERN_ERR "bad parameter for affinity core \n"); + ret =3D -EINVAL; + return ret; + } + if (!(idle_poll_period_usecs > 0)) + idle_poll_period_usecs =3D 1000; // default 1ms + tcp_pt_data =3D kmalloc(pt_num * sizeof(void *), GFP_KERNEL); + if (!tcp_pt_data) + return -ENOMEM; + + for (i =3D 0; i < pt_num; i++) { + tcp_pt_data[i] =3D kmalloc(sizeof(struct nvmet_tcp_pt_data), GFP_KERNEL= ); + if 
(!tcp_pt_data[i]) { + ret =3D -ENOMEM; + goto err_free_pqtd; + } + } + for (i =3D 0; i < pt_num; i++) { + mutex_init(&tcp_pt_data[i]->thread_lock); + tcp_pt_data[i]->thread_idle =3D usecs_to_jiffies(idle_poll_period_usecs= ); + mutex_init(&tcp_pt_data[i]->queue_lock); + mutex_init(&tcp_pt_data[i]->addon_queue_lock); + INIT_LIST_HEAD(&tcp_pt_data[i]->pt_queue_list); + INIT_LIST_HEAD(&tcp_pt_data[i]->pt_io_queue_list); + INIT_LIST_HEAD(&tcp_pt_data[i]->addon_queue_list); + init_waitqueue_head(&tcp_pt_data[i]->wait_head); + atomic64_set(&tcp_pt_data[i]->queue_cnt, 0); + atomic64_set(&tcp_pt_data[i]->io_queue_cnt, 0); + if (pt_affinity_core !=3D -1) + tcp_pt_data[i]->affinity_cpu =3D (pt_affinity_core + (int)i) % + ((int) nr_cpu_ids); + else + tcp_pt_data[i]->affinity_cpu =3D -1; + snprintf(task_name, TASK_COMM_LEN, "nvmet-tcp-pt%u", i); + task =3D kthread_create(nvmet_tcp_poll_thread, (void *)tcp_pt_data[i], = task_name); + if (IS_ERR(task)) { + ret =3D PTR_ERR(task); + goto err; + } + set_user_nice(task, -20); + mutex_lock(&tcp_pt_data[i]->thread_lock); + tcp_pt_data[i]->thread =3D task; + mutex_unlock(&tcp_pt_data[i]->thread_lock); + } + tcp_polling_task =3D true; + for (i =3D 0; i < pt_num; i++) + wake_up_process(tcp_pt_data[i]->thread); + } else { + nvmet_tcp_wq =3D alloc_workqueue("nvmet_tcp_wq", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!nvmet_tcp_wq) + return -ENOMEM; + } =20 ret =3D nvmet_register_transport(&nvmet_tcp_ops); if (ret) @@ -2208,15 +2497,42 @@ static int __init nvmet_tcp_init(void) =20 return 0; err: - destroy_workqueue(nvmet_tcp_wq); + if ((pt_affinity_core >=3D -1 && + pt_affinity_core < (int)nr_cpu_ids) || pt_num > 0) { + while (i > 0) { + kthread_stop(tcp_pt_data[i-1]->thread); + i--; + } + i =3D pt_num; +err_free_pqtd: + while (i > 0) { + kfree(tcp_pt_data[i-1]); + i--; + } + kfree(tcp_pt_data); + } else { + destroy_workqueue(nvmet_tcp_wq); + } return ret; } =20 static void __exit nvmet_tcp_exit(void) { struct nvmet_tcp_queue *queue; + int i =3D 0; =20 nvmet_unregister_transport(&nvmet_tcp_ops); + if (tcp_polling_task) { + for (i =3D 0; i < pt_num; i++) { + mutex_lock(&tcp_pt_data[i]->thread_lock); + if (tcp_pt_data[i]->thread) { + mutex_unlock(&tcp_pt_data[i]->thread_lock); + kthread_stop(tcp_pt_data[i]->thread); + } else { + mutex_unlock(&tcp_pt_data[i]->thread_lock); + } + } + } =20 flush_workqueue(nvmet_wq); mutex_lock(&nvmet_tcp_queue_mutex); @@ -2225,7 +2541,13 @@ static void __exit nvmet_tcp_exit(void) mutex_unlock(&nvmet_tcp_queue_mutex); flush_workqueue(nvmet_wq); =20 - destroy_workqueue(nvmet_tcp_wq); + if (tcp_polling_task) { + for (i =3D 0; i < pt_num; i++) + kfree(tcp_pt_data[i]); + kfree(tcp_pt_data); + } else { + destroy_workqueue(nvmet_tcp_wq); + } ida_destroy(&nvmet_tcp_queue_ida); } =20 --=20 2.26.2
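For the nvmet-tcp patch above, the polling mode referenced in its patch
description would be enabled the same way (again only an illustrative
sketch; the parameter names are taken from the patch):

    modprobe nvmet-tcp pt_num=2 pt_affinity_core=8 idle_poll_period_usecs=1000

If neither pt_num nor pt_affinity_core is set, nvmet_tcp_wq is allocated
as before and the existing io_work path is used unchanged.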