From: Ping Gan
To: kbusch@kernel.org, axboe@kernel.dk, hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Cc: ping_gan@dell.com, jacky_gam_2001@163.com
Subject: [PATCH 1/4] nvmet: Add nvme target polling queue task parameters
Date: Wed, 13 Sep 2023 16:36:41 +0800
Message-Id: <9c3ebdd7a0411bd45512e0bc8eb60700f5bb8a04.1694592708.git.jacky_gam_2001@163.com>

Define the polling task's running parameters that are used when the
nvme target submits a bio to an nvme polling queue.

Signed-off-by: Ping Gan
---
 drivers/nvme/target/core.c | 55 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 53 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 3935165048e7..6f49965d5d17 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -17,6 +17,29 @@
 
 #include "nvmet.h"
 
+/* Define the polling queue thread's affinity cpu core. */
+static int pqt_affinity_core = -1;
+module_param(pqt_affinity_core, int, 0644);
+MODULE_PARM_DESC(pqt_affinity_core,
+	"nvme polling queue thread's affinity core, -1 for all online cpus");
+
+/* Define a time (in usecs) that the polling queue thread shall sample the
+ * io request ring before determining it to be idle.
+ */
+static int pqt_idle_usecs;
+module_param(pqt_idle_usecs, int, 0644);
+MODULE_PARM_DESC(pqt_idle_usecs,
+	"polling queue task will poll io request till idle time in usecs");
+
+/* Define the polling queue thread ring's size.
+ * The ring will be consumed by the polling queue thread.
+ */
+static int pqt_ring_size;
+module_param(pqt_ring_size, int, 0644);
+MODULE_PARM_DESC(pqt_ring_size,
+	"nvme target polling queue thread ring size");
+
 struct kmem_cache *nvmet_bvec_cache;
 struct workqueue_struct *buffered_io_wq;
 struct workqueue_struct *zbd_wq;
@@ -1648,13 +1671,34 @@ static int __init nvmet_init(void)
 {
 	int error = -ENOMEM;
 
+	if ((pqt_affinity_core >= -1 &&
+	     pqt_affinity_core < nr_cpu_ids) ||
+	    pqt_idle_usecs > 0 || pqt_ring_size > 0) {
+		if (pqt_idle_usecs == 0)
+			pqt_idle_usecs = 1000; //default 1ms
+		if (pqt_affinity_core < -1 ||
+		    pqt_affinity_core >= nr_cpu_ids) {
+			printk(KERN_ERR "bad parameter for affinity core\n");
+			error = -EINVAL;
+			return error;
+		}
+		if (pqt_ring_size == 0)
+			pqt_ring_size = 4096; //default 4k
+		error = nvmet_init_pq_thread(pqt_idle_usecs,
+				pqt_affinity_core, pqt_ring_size);
+		if (error)
+			return error;
+	}
+
 	nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
 
 	nvmet_bvec_cache = kmem_cache_create("nvmet-bvec",
 			NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
 			0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!nvmet_bvec_cache)
-		return -ENOMEM;
+	if (!nvmet_bvec_cache) {
+		error = -ENOMEM;
+		goto out_free_pqt;
+	}
 
 	zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
 	if (!zbd_wq)
@@ -1688,6 +1732,8 @@ static int __init nvmet_init(void)
 	destroy_workqueue(zbd_wq);
 out_destroy_bvec_cache:
 	kmem_cache_destroy(nvmet_bvec_cache);
+out_free_pqt:
+	nvmet_exit_pq_thread();
 	return error;
 }
 
@@ -1701,6 +1747,11 @@ static void __exit nvmet_exit(void)
 	destroy_workqueue(zbd_wq);
 	kmem_cache_destroy(nvmet_bvec_cache);
 
+	if ((pqt_affinity_core >= -1 &&
+	     pqt_affinity_core < nr_cpu_ids) ||
+	    pqt_idle_usecs > 0 || pqt_ring_size > 0)
+		nvmet_exit_pq_thread();
+
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
 	BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
 }
-- 
2.26.2
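
A quick note on the defaults added above: pqt_idle_usecs and pqt_ring_size
fall back to 1000 usecs and 4096 entries when left at zero, and a
non-power-of-two pqt_ring_size is rounded up by the ring allocator added in
patch 2/4. Below is a minimal userspace sketch of that behaviour; it is
illustrative only and not part of the patch (align_pow2() mirrors the
patch's nvmet_pq_alignpow2() helper).

/* build with: cc -o pqt-params pqt-params.c */
#include <stdio.h>
#include <stdint.h>

/* round up to the next power of two, as nvmet_pq_alignpow2() does */
static uint32_t align_pow2(uint32_t x)
{
	x--;
	x |= x >> 1;
	x |= x >> 2;
	x |= x >> 4;
	x |= x >> 8;
	x |= x >> 16;
	return x + 1;
}

int main(void)
{
	int idle_usecs = 0;		/* parameter left unset */
	uint32_t ring_size = 3000;	/* example non-power-of-two request */

	if (idle_usecs == 0)
		idle_usecs = 1000;	/* default 1 ms, as in nvmet_init() */
	if (ring_size == 0)
		ring_size = 4096;	/* default 4k entries */

	printf("idle time: %d usecs\n", idle_usecs);
	printf("ring size %u -> %u slots after rounding\n",
	       ring_size, align_pow2(ring_size + 1));
	return 0;
}
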
From: Ping Gan
To: kbusch@kernel.org, axboe@kernel.dk, hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Cc: ping_gan@dell.com, jacky_gam_2001@163.com
Subject: [PATCH 2/4] nvmet: Add polling queue task for nvme target
Date: Wed, 13 Sep 2023 16:36:42 +0800
Message-Id: <84d7f188e892b5b0ba251a4601455d7a137075f3.1694592708.git.jacky_gam_2001@163.com>

The polling queue task of the nvme target fetches bio requests from the
lossless ring, which is filled by io-cmd-bdev's rw path, then submits the
requests to nvme's polling queue, and finally polls for the completion
status of the requests and completes them.

Signed-off-by: Ping Gan
---
 drivers/nvme/target/Makefile               |   2 +-
 drivers/nvme/target/nvmet.h                |  13 +
 drivers/nvme/target/polling-queue-thread.c | 594 +++++++++++++++++++++
 3 files changed, 608 insertions(+), 1 deletion(-)
 create mode 100644 drivers/nvme/target/polling-queue-thread.c

diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile
index c66820102493..99272881b63e 100644
--- a/drivers/nvme/target/Makefile
+++ b/drivers/nvme/target/Makefile
@@ -10,7 +10,7 @@ obj-$(CONFIG_NVME_TARGET_FCLOOP)	+= nvme-fcloop.o
 obj-$(CONFIG_NVME_TARGET_TCP)		+= nvmet-tcp.o
 
 nvmet-y		+= core.o configfs.o admin-cmd.o fabrics-cmd.o \
-			discovery.o io-cmd-file.o io-cmd-bdev.o
+			discovery.o io-cmd-file.o io-cmd-bdev.o polling-queue-thread.o
 nvmet-$(CONFIG_NVME_TARGET_PASSTHRU)	+= passthru.o
 nvmet-$(CONFIG_BLK_DEV_ZONED)		+= zns.o
 nvmet-$(CONFIG_NVME_TARGET_AUTH)	+= fabrics-cmd-auth.o auth.o
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 8cfd60f3b564..b29a45bbdf99 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -395,6 +395,12 @@ struct nvmet_req {
 	u64			error_slba;
 };
 
+struct nvmet_pqt_bio_req {
+	struct nvmet_req	*req;
+	struct bio_list		blist;
+	unsigned short		io_completed;
+};
+
 #define NVMET_MAX_MPOOL_BVEC	16
 extern struct kmem_cache *nvmet_bvec_cache;
 extern struct workqueue_struct *buffered_io_wq;
@@ -455,6 +461,13 @@ u16 nvmet_parse_discovery_cmd(struct nvmet_req *req);
 u16 nvmet_parse_fabrics_admin_cmd(struct nvmet_req *req);
 u16 nvmet_parse_fabrics_io_cmd(struct nvmet_req *req);
 
+/* below is for enabling the nvmet polling queue task */
+int nvmet_init_pq_thread(u32 thread_idle, int affinity_cpu, u32 ring_size);
+void nvmet_exit_pq_thread(void);
+bool nvmet_pqt_enabled(void);
+int nvmet_pqt_ring_enqueue(void *pt);
+void nvmet_wakeup_pq_thread(void);
+
 bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
 	struct nvmet_sq *sq, const struct nvmet_fabrics_ops *ops);
 void nvmet_req_uninit(struct nvmet_req *req);
diff --git a/drivers/nvme/target/polling-queue-thread.c b/drivers/nvme/target/polling-queue-thread.c
new file mode 100644
index 000000000000..2eb107393df9
--- /dev/null
+++ b/drivers/nvme/target/polling-queue-thread.c
@@ -0,0 +1,594 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NVMe over Fabrics target POLLING queue thread implementation.
+ * Copyright (c) 2023 Ping Gan.
+ */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "nvmet.h" + + +struct nvmet_pqt_sqe { + struct bio *bio; + struct nvmet_pqt_bio_req *pqt_req; + struct list_head list; +}; + +enum { + NVMET_PQ_THREAD_SHOULD_STOP =3D 0, + NVMET_PQ_THREAD_ENABLE =3D 1, +}; + +struct nvmet_pq_ring_headtail { + u32 head ____cacheline_aligned_in_smp; + u32 tail ____cacheline_aligned_in_smp; +}; + +struct nvmet_pq_ring { + struct nvmet_pq_ring_headtail prod, cons; + u32 size; + u32 mask; + u32 capacity; + struct bio_list *qe_arry[] ____cacheline_aligned_in_smp; +}; + +struct nvmet_pq_thread_data { + struct wait_queue_head wait_head; + struct nvmet_pq_ring *ring; + u32 ring_mem_size; + struct list_head submit_list; + u32 thread_idle; + int affinity_cpu; + unsigned long state; + pid_t task_pid; + pid_t task_tgid; + struct task_struct *thread; + struct mutex lock; + struct delayed_work compl_bio; +}; + +struct nvmet_pq_thread_data nvmet_pqt_data; + +static inline int +nvmet_pq_powerof2_enabled(u32 n) +{ + return n && !(n & (n - 1)); +} + +static inline u32 nvmet_pq_alignpow2(u32 x) +{ + x--; + x |=3D x >> 1; + x |=3D x >> 2; + x |=3D x >> 4; + x |=3D x >> 8; + x |=3D x >> 16; + return x + 1; +} + +static void nvmet_pq_mem_free(void *ptr, size_t size) +{ + struct page *page; + + if (!ptr) + return; + + page =3D virt_to_page(ptr); + __free_pages(page, get_order(size)); +} + +static void *nvmet_pq_mem_alloc(size_t size) +{ + gfp_t gfp_flags =3D GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN; + if (get_order(size) > MAX_ORDER) + return NULL; + return (void *) __get_free_pages(gfp_flags, get_order(size)); +} + +static struct nvmet_pq_ring *nvmet_create_pq_ring(u32 cnt) +{ + struct nvmet_pq_ring *pq_ring; + u32 ring_size =3D 0, qe_size =3D cnt; + + if (!nvmet_pq_powerof2_enabled(cnt)) + qe_size =3D nvmet_pq_alignpow2(cnt + 1); + ring_size +=3D sizeof(struct nvmet_pq_ring); + ring_size +=3D qe_size * sizeof(void **); + pq_ring =3D nvmet_pq_mem_alloc(ring_size); + if (likely(pq_ring)) { + pq_ring->cons.head =3D 0; + pq_ring->cons.tail =3D 0; + pq_ring->prod.head =3D 0; + pq_ring->prod.tail =3D 0; + pq_ring->size =3D qe_size; + pq_ring->mask =3D qe_size - 1; + pq_ring->capacity =3D nvmet_pq_powerof2_enabled(cnt)?cnt:(qe_size - 1); + } + return pq_ring; +} + +//below is derived from FreeBSD's bufring.h +/* the actual enqueue of pointers on the ring. 
+ * Placed here since identical code needed in both + * single and multi producer enqueue functions */ +#define ENQUEUE_PTRS(r, ring_start, prod_head, obj_table, n, obj_type) do = { \ + unsigned int i; \ + const u32 size =3D (r)->size; \ + u32 idx =3D prod_head & (r)->mask; \ + obj_type *ring =3D (obj_type *)ring_start; \ + if (likely(idx + n < size)) { \ + for (i =3D 0; i < (n & ((~(unsigned)0x3))); i +=3D 4, idx +=3D 4) { \ + ring[idx] =3D obj_table[i]; \ + ring[idx+1] =3D obj_table[i+1]; \ + ring[idx+2] =3D obj_table[i+2]; \ + ring[idx+3] =3D obj_table[i+3]; \ + } \ + switch (n & 0x3) { \ + case 3: \ + ring[idx++] =3D obj_table[i++]; /* fallthrough */ \ + case 2: \ + ring[idx++] =3D obj_table[i++]; /* fallthrough */ \ + case 1: \ + ring[idx++] =3D obj_table[i++]; \ + } \ + } else { \ + for (i =3D 0; idx < size; i++, idx++)\ + ring[idx] =3D obj_table[i]; \ + for (idx =3D 0; i < n; i++, idx++) \ + ring[idx] =3D obj_table[i]; \ + } \ +} while (0) + +/* the actual copy of pointers on the ring to obj_table. + * Placed here since identical code needed in both + * single and multi consumer dequeue functions */ +#define DEQUEUE_PTRS(r, ring_start, cons_head, obj_table, n, obj_type) do = { \ + unsigned int i; \ + u32 idx =3D cons_head & (r)->mask; \ + const u32 size =3D (r)->size; \ + obj_type *ring =3D (obj_type *)ring_start; \ + if (likely(idx + n < size)) { \ + for (i =3D 0; i < (n & (~(unsigned)0x3)); i +=3D 4, idx +=3D 4) {\ + obj_table[i] =3D ring[idx]; \ + obj_table[i+1] =3D ring[idx+1]; \ + obj_table[i+2] =3D ring[idx+2]; \ + obj_table[i+3] =3D ring[idx+3]; \ + } \ + switch (n & 0x3) { \ + case 3: \ + obj_table[i++] =3D ring[idx++]; /* fallthrough */ \ + case 2: \ + obj_table[i++] =3D ring[idx++]; /* fallthrough */ \ + case 1: \ + obj_table[i++] =3D ring[idx++]; \ + } \ + } else { \ + for (i =3D 0; idx < size; i++, idx++) \ + obj_table[i] =3D ring[idx]; \ + for (idx =3D 0; i < n; i++, idx++) \ + obj_table[i] =3D ring[idx]; \ + } \ +} while (0) + +static inline u32 +__nvmet_pq_ring_move_prod_head(struct nvmet_pq_ring *r, u32 n, + u32 *old_head, u32 *new_head, u32 *free_entries) +{ + const u32 capacity =3D smp_load_acquire(&r->capacity); + u32 ret, success; + + do { + *old_head =3D smp_load_acquire(&r->prod.head); + + /* add rmb barrier to avoid load/load reorder in weak + * memory model. + */ + smp_rmb(); + + *free_entries =3D (capacity + smp_load_acquire(&r->cons.tail) - *old_hea= d); + + /* check that we have enough room in ring */ + if (unlikely(n > *free_entries)) + return 0; + + *new_head =3D *old_head + n; + ret =3D cmpxchg(&r->prod.head, *old_head, *new_head); + success =3D (ret =3D=3D *old_head) ? 1 : 0; + } while (unlikely(success =3D=3D 0)); + return n; +} + +static inline u32 +__nvmet_pq_ring_move_cons_head(struct nvmet_pq_ring *r, u32 n, + u32 *old_head, u32 *new_head, u32 *entries) +{ + unsigned int ret, success; + + /* move cons.head atomically */ + do { + *old_head =3D smp_load_acquire(&r->cons.head); + + /* add rmb barrier to avoid load/load reorder in weak + * memory model. + */ + smp_rmb(); + + *entries =3D (smp_load_acquire(&r->prod.tail) - *old_head); + + /* check if we have enough entry to dequeue */ + if (n > *entries) + return 0; + + *new_head =3D *old_head + n; + ret =3D cmpxchg(&r->cons.head, *old_head, *new_head); + success =3D (ret =3D=3D *old_head) ? 
1 : 0; + } while (unlikely(success =3D=3D 0)); + return n; +} + +static inline void +__nvmet_pq_ring_update_tail(struct nvmet_pq_ring_headtail *ht, + u32 old_val, u32 new_val, u32 enqueue) +{ + if (enqueue) + smp_wmb(); + else + smp_rmb(); + + while (unlikely(smp_load_acquire(&ht->tail) !=3D old_val)) + ; + + smp_store_release(&ht->tail, new_val); +} + +static inline u32 +__nvmet_pq_ring_do_enqueue(struct nvmet_pq_ring *r, + void **obj_table, u32 n, u32 *free_space) +{ + uint32_t prod_head, prod_next; + uint32_t free_entries; + + n =3D __nvmet_pq_ring_move_prod_head(r, n, &prod_head, + &prod_next, &free_entries); + if (n =3D=3D 0) + goto end; + + ENQUEUE_PTRS(r, &r[1], prod_head, obj_table, n, void *); + + __nvmet_pq_ring_update_tail(&r->prod, prod_head, prod_next, 1); +end: + if (free_space !=3D NULL) + *free_space =3D free_entries - n; + return n; +} + +static inline u32 +__nvmet_pq_ring_do_dequeue(struct nvmet_pq_ring *r, + void **obj_table, u32 n, u32 *available) +{ + uint32_t cons_head, cons_next; + uint32_t entries; + + n =3D __nvmet_pq_ring_move_cons_head(r, n, &cons_head, &cons_next, &entri= es); + if (n =3D=3D 0) + goto end; + + DEQUEUE_PTRS(r, &r[1], cons_head, obj_table, n, void *); + + __nvmet_pq_ring_update_tail(&r->cons, cons_head, cons_next, 0); + +end: + if (available !=3D NULL) + *available =3D entries - n; + return n; +} + +static inline u32 +nvmet_pq_ring_enqueue_bulk(struct nvmet_pq_ring *r, + void **obj_table, u32 n, u32 *free_space) +{ + return __nvmet_pq_ring_do_enqueue(r, obj_table, n, free_space); +} + +static inline int +nvmet_pq_ring_enqueue(struct nvmet_pq_ring *r, void **obj) +{ + return nvmet_pq_ring_enqueue_bulk(r, obj, 1, NULL) ? 0 : -ENOBUFS; +} + +static inline u32 +nvmet_pq_ring_dequeue_bulk(struct nvmet_pq_ring *r, + void **obj_table, u32 n, u32 *available) +{ + return __nvmet_pq_ring_do_dequeue(r, obj_table, n, available); +} + +static inline int +nvmet_pq_ring_dequeue(struct nvmet_pq_ring *r, void **obj_p) +{ + return nvmet_pq_ring_dequeue_bulk(r, obj_p, 1, NULL) ? 0 : -ENOENT; +} + +static inline u32 +__nvmet_pq_ring_count(const struct nvmet_pq_ring *r) +{ + u32 prod_tail =3D smp_load_acquire(&r->prod.tail); + u32 cons_tail =3D smp_load_acquire(&r->cons.tail); + u32 count =3D (prod_tail - cons_tail) & r->mask; + u32 capacity =3D smp_load_acquire(&r->capacity); + return (count > capacity) ? capacity : count; +} + +/** + * Return the number of free entries in a ring. + */ +static inline u32 +__nvmet_pq_ring_free_count(const struct nvmet_pq_ring *r) +{ + return smp_load_acquire(&r->capacity) - __nvmet_pq_ring_count(r); +} + +/** + * Test if a ring is full. + */ +static inline int +__nvmet_pq_ring_full(const struct nvmet_pq_ring *r) +{ + return __nvmet_pq_ring_free_count(r) =3D=3D 0; +} + +/** + * Test if a ring is empty. + */ +static inline int +__nvmet_pq_ring_empty(const struct nvmet_pq_ring *r) +{ + return __nvmet_pq_ring_count(r) =3D=3D 0; +} + +/** + * Return the size of the ring. + */ +static inline u32 +__nvmet_pq_ring_get_size(const struct nvmet_pq_ring *r) +{ + return smp_load_acquire(&r->size); +} + +/** + * Return the number of elements which can be stored in the ring. 
+ */ +static inline u32 +__nvmet_pq_ring_get_capacity(const struct nvmet_pq_ring *r) +{ + return smp_load_acquire(&r->capacity); +} + +#define NVMET_PQT_IO_BUDGET 64 + +static inline void +__nvmet_pqt_submit_bio(struct nvmet_pq_thread_data *pqtd, + struct nvmet_pqt_bio_req *pqt_req, bool cancel_thread) +{ + struct bio *bio; + struct nvmet_pqt_sqe *sqe; + + while ((bio =3D bio_list_pop(&pqt_req->blist))) { + if (cancel_thread) { + nvmet_req_bio_put(pqt_req->req, bio); + if (!bio_flagged(bio, BIO_CHAIN)) { + kfree(pqt_req); + break; + } + } else { + if (bio_flagged(bio, BIO_CHAIN)) + submit_bio(bio); + else { + sqe =3D kmalloc(sizeof(struct nvmet_pqt_sqe), GFP_KERNEL); + if (!sqe) { + bio_io_error(bio); + kfree(pqt_req); + } else { + sqe->bio =3D bio; + sqe->pqt_req =3D pqt_req; + submit_bio(bio); + list_add(&sqe->list, &pqtd->submit_list); + } + } + } + } +} + +static int __nvmet_pq_thread(struct nvmet_pq_thread_data *pqtd, + bool cancel_thread) +{ + int ret =3D 0, poll_cnt =3D 0; + struct nvmet_pqt_bio_req *req_done; + struct nvmet_pqt_sqe *sqe, *tmp; + unsigned int poll_flags =3D BLK_POLL_ONESHOT; + DEFINE_IO_COMP_BATCH(iob); + + while (1) { + ret =3D nvmet_pq_ring_dequeue(pqtd->ring, (void **)&req_done); + if (ret) + break; + __nvmet_pqt_submit_bio(pqtd, req_done, cancel_thread); + poll_cnt++; + if (poll_cnt =3D=3D NVMET_PQT_IO_BUDGET && !cancel_thread) + break; + } + if (!list_empty(&pqtd->submit_list)) { +cancel_thread_poll_again: + list_for_each_entry_safe(sqe, tmp, &pqtd->submit_list, list) { + if (sqe->pqt_req->io_completed =3D=3D 1) { + list_del(&sqe->list); + kfree(sqe->pqt_req); + kfree(sqe); + continue; + } + ret =3D bio_poll(sqe->bio, &iob, poll_flags); + if (ret < 0) { + if (!cancel_thread) { + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + return 1; + } + } + if (ret > 0 && sqe->pqt_req->io_completed =3D=3D 1) { + list_del(&sqe->list); + kfree(sqe->pqt_req); + kfree(sqe); + } + } + } + if (cancel_thread) { + if (!list_empty(&pqtd->submit_list)) + goto cancel_thread_poll_again; + nvmet_pq_mem_free(pqtd->ring, pqtd->ring_mem_size); + } + if (!rq_list_empty(iob.req_list)) + iob.complete(&iob); + return 0; +} + +void nvmet_wakeup_pq_thread(void) +{ + smp_mb(); + if (waitqueue_active(&nvmet_pqt_data.wait_head)) + wake_up(&nvmet_pqt_data.wait_head); +} + +int nvmet_pqt_ring_enqueue(void *pt) +{ + struct nvmet_pqt_bio_req *pqt_req =3D pt; + return nvmet_pq_ring_enqueue(nvmet_pqt_data.ring, (void **)&pqt_req); +} + +static int nvmet_pq_thread(void *data) +{ + struct nvmet_pq_thread_data *pqtd =3D data; + unsigned long timeout =3D 0; + DEFINE_WAIT(wait); + + if (pqtd->affinity_cpu !=3D -1) + set_cpus_allowed_ptr(current, cpumask_of(pqtd->affinity_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |=3D PF_NO_SETAFFINITY; + mutex_lock(&pqtd->lock); + pqtd->task_pid =3D current->pid; + pqtd->task_tgid =3D current->tgid; + + while (!kthread_should_stop()) { + if (test_bit(NVMET_PQ_THREAD_SHOULD_STOP, &pqtd->state)) + break; + + int ret =3D __nvmet_pq_thread(pqtd, false); + if (ret > 0 || !time_after(jiffies, timeout)) { + cond_resched(); + if (ret > 0) + timeout =3D jiffies + pqtd->thread_idle; + continue; + } + prepare_to_wait(&pqtd->wait_head, &wait, TASK_INTERRUPTIBLE); + mutex_unlock(&pqtd->lock); + schedule(); + mutex_lock(&pqtd->lock); + finish_wait(&pqtd->wait_head, &wait); + timeout =3D jiffies + pqtd->thread_idle; + } + pqtd->thread =3D NULL; + pqtd->task_pid =3D -1; + pqtd->task_tgid =3D -1; + mutex_unlock(&pqtd->lock); + 
kthread_complete_and_exit(NULL, 0); +} + +bool nvmet_pqt_enabled(void) +{ + if (!test_bit(NVMET_PQ_THREAD_SHOULD_STOP, &nvmet_pqt_data.state) && + test_bit(NVMET_PQ_THREAD_ENABLE, &nvmet_pqt_data.state)) + return true; + else + return false; +} + +static void nvmet_pqt_compl_bio_req_func(struct work_struct *work) +{ + struct nvmet_pq_thread_data *pqtd =3D container_of(work, + struct nvmet_pq_thread_data, compl_bio.work); + __nvmet_pq_thread(pqtd, true); +} + +int nvmet_init_pq_thread(u32 thread_idle, int affinity_cpu, u32 ring_size) +{ + struct task_struct *task; + int ret =3D 0; + + memset(&nvmet_pqt_data, 0, sizeof(struct nvmet_pq_thread_data)); + init_waitqueue_head(&nvmet_pqt_data.wait_head); + mutex_init(&nvmet_pqt_data.lock); + nvmet_pqt_data.thread_idle =3D usecs_to_jiffies(thread_idle); + nvmet_pqt_data.affinity_cpu =3D affinity_cpu; + INIT_LIST_HEAD(&nvmet_pqt_data.submit_list); + nvmet_pqt_data.ring =3D nvmet_create_pq_ring(ring_size); + if (!nvmet_pqt_data.ring) { + printk(KERN_ERR "allocate poll ring failure\n"); + return -1; + } + nvmet_pqt_data.ring_mem_size =3D sizeof(struct nvmet_pq_ring); + nvmet_pqt_data.ring_mem_size +=3D nvmet_pqt_data.ring->size * sizeof(void= **); + task =3D kthread_create(nvmet_pq_thread, (void *)&nvmet_pqt_data, "nvmet-= pqt"); + if (IS_ERR(task)) { + ret =3D PTR_ERR(task); + goto err; + } + + set_user_nice(task, -20); + mutex_lock(&nvmet_pqt_data.lock); + nvmet_pqt_data.thread =3D task; + mutex_unlock(&nvmet_pqt_data.lock); + wake_up_process(task); + set_bit(NVMET_PQ_THREAD_ENABLE, &nvmet_pqt_data.state); + return 0; +err: + nvmet_pq_mem_free(nvmet_pqt_data.ring, nvmet_pqt_data.ring_mem_size); + return ret; +} + +void nvmet_exit_pq_thread(void) +{ + set_bit(NVMET_PQ_THREAD_SHOULD_STOP, &nvmet_pqt_data.state); + clear_bit(NVMET_PQ_THREAD_ENABLE, &nvmet_pqt_data.state); + mutex_lock(&nvmet_pqt_data.lock); + if (nvmet_pqt_data.thread) { + mutex_unlock(&nvmet_pqt_data.lock); + kthread_stop(nvmet_pqt_data.thread); + } else { + mutex_unlock(&nvmet_pqt_data.lock); + } + INIT_DELAYED_WORK(&nvmet_pqt_data.compl_bio, nvmet_pqt_compl_bio_req_func= ); + schedule_delayed_work(&nvmet_pqt_data.compl_bio, 3); +} --=20 2.26.2 From nobody Wed Feb 11 22:55:01 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 2E337CA550A for ; Wed, 13 Sep 2023 08:39:18 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S239082AbjIMIjT (ORCPT ); Wed, 13 Sep 2023 04:39:19 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:60564 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S239015AbjIMIjK (ORCPT ); Wed, 13 Sep 2023 04:39:10 -0400 Received: from m12.mail.163.com (m12.mail.163.com [220.181.12.198]) by lindbergh.monkeyblade.net (Postfix) with ESMTP id 6067019AD for ; Wed, 13 Sep 2023 01:39:01 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=163.com; s=s110527; h=From:Subject:Date:Message-Id:MIME-Version; bh=EOqlq SAkn0k3xbfI47MZ3fOAdq93w0S0QRWAAFzrDC4=; b=K6t/DXXC4Dtt0OrhKO75c eE8F5UsbDcgDZMdkezi+yZw4G7sm9iJSgjptRX+takJBZ59PxiYNMd6tvFbFJw/U aa/DaMealt6ebaKBrfvlHWm2ITqEFyNMtr+J9Qqd5vbOfATl2RPDfu0+m291ObA1 A5DY3iAlcGM+8tqJ5iAtVY= Received: from localhost.localdomain (unknown [223.166.237.2]) by zwqz-smtp-mta-g1-1 (Coremail) with SMTP id _____wAnln7QdAFli8i+Bw--.56942S5; Wed, 13 Sep 2023 16:37:57 +0800 
From: Ping Gan
To: kbusch@kernel.org, axboe@kernel.dk, hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Cc: ping_gan@dell.com, jacky_gam_2001@163.com
Subject: [PATCH 3/4] nvmet: support bio polling queue request
Date: Wed, 13 Sep 2023 16:36:43 +0800

If the bio polling queue task is enabled, split and chain the bios as
needed, then put the request on the polling queue task's lossless ring.

Signed-off-by: Ping Gan
---
 drivers/nvme/target/io-cmd-bdev.c | 243 ++++++++++++++++++++++++++----
 1 file changed, 214 insertions(+), 29 deletions(-)

diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 468833675cc9..6f7d04ae6cb7 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -184,6 +184,16 @@ static void nvmet_bio_done(struct bio *bio)
 	nvmet_req_bio_put(req, bio);
 }
 
+static void nvmet_pqt_bio_done(struct bio *bio)
+{
+	struct nvmet_pqt_bio_req *req_done = bio->bi_private;
+
+	nvmet_req_complete(req_done->req, blk_to_nvme_status(req_done->req,
+				bio->bi_status));
+	nvmet_req_bio_put(req_done->req, bio);
+	req_done->io_completed = 1;
+}
+
 #ifdef CONFIG_BLK_DEV_INTEGRITY
 static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 		struct sg_mapping_iter *miter)
@@ -237,6 +247,38 @@ static int nvmet_bdev_alloc_bip(struct nvmet_req *req, struct bio *bio,
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
+#ifdef CONFIG_NVME_MULTIPATH
+extern struct block_device *nvme_mpath_get_bdev(struct block_device *bdev);
+extern const struct block_device_operations nvme_ns_head_ops;
+#endif
+
+static inline int nvmet_chain_par_bio(struct nvmet_req *req, struct bio **bio,
+	struct sg_mapping_iter *prot_miter, struct block_device *bdev,
+	sector_t sector, struct bio_list *blist)
+{
+	struct bio *parent, *child;
+	unsigned int vec_cnt;
+	int rc;
+
+	parent = *bio;
+	vec_cnt = queue_max_segments(bdev->bd_disk->queue);
+	if (req->metadata_len) {
+		rc = nvmet_bdev_alloc_bip(req, parent,
+					  prot_miter);
+		if (unlikely(rc))
+			return rc;
+	}
+	child = bio_alloc(bdev, vec_cnt, parent->bi_opf, GFP_KERNEL);
+	child->bi_iter.bi_sector = sector;
+	*bio = child;
+	bio_chain(*bio, parent);
+	parent->bi_opf |= REQ_POLLED;
+	parent->bi_opf |= REQ_NOWAIT;
+	parent->bi_opf |= REQ_NOMERGE;
+	bio_list_add(blist, parent);
+	return 0;
+}
+
 static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 {
 	unsigned int sg_cnt = req->sg_cnt;
@@ -247,8 +289,13 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
 	blk_opf_t opf;
 	int i, rc;
 	struct sg_mapping_iter prot_miter;
-	unsigned int iter_flags;
+	unsigned int iter_flags, max_sectors;
+	unsigned short vec_cnt, max_segments;
 	unsigned int total_len = nvmet_rw_data_len(req) + req->metadata_len;
+	bool pqt_enabled = nvmet_pqt_enabled();
+	unsigned int sg_len;
+	struct
nvmet_pqt_bio_req *req_done =3D NULL; + struct block_device *bdev =3D req->ns->bdev; =20 if (!nvmet_check_transfer_len(req, total_len)) return; @@ -268,6 +315,24 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *re= q) iter_flags =3D SG_MITER_FROM_SG; } =20 +#ifdef CONFIG_NVME_MULTIPATH + if (pqt_enabled && bdev->bd_disk->fops =3D=3D &nvme_ns_head_ops) { + bdev =3D nvme_mpath_get_bdev(bdev); + if (!bdev) { + nvmet_req_complete(req, 0); + return; + } + opf |=3D REQ_DRV; + } +#endif + if (pqt_enabled) { + req_done =3D kmalloc(sizeof(struct nvmet_pqt_bio_req), GFP_KERNEL); + if (!req_done) { + nvmet_req_complete(req, 0); + return; + } + } + if (is_pci_p2pdma_page(sg_page(req->sg))) opf |=3D REQ_NOMERGE; =20 @@ -278,54 +343,174 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *= req) bio_init(bio, req->ns->bdev, req->inline_bvec, ARRAY_SIZE(req->inline_bvec), opf); } else { - bio =3D bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), opf, + vec_cnt =3D bio_max_segs(sg_cnt); + if (pqt_enabled) + vec_cnt =3D queue_max_segments(bdev->bd_disk->queue); + bio =3D bio_alloc(bdev, vec_cnt, opf, GFP_KERNEL); } bio->bi_iter.bi_sector =3D sector; - bio->bi_private =3D req; - bio->bi_end_io =3D nvmet_bio_done; + if (!pqt_enabled) { + bio->bi_private =3D req; + bio->bi_end_io =3D nvmet_bio_done; + } else { + req_done->req =3D req; + bio->bi_private =3D req_done; + bio->bi_end_io =3D nvmet_pqt_bio_done; + } =20 - blk_start_plug(&plug); + if (!pqt_enabled) + blk_start_plug(&plug); if (req->metadata_len) sg_miter_start(&prot_miter, req->metadata_sg, req->metadata_sg_cnt, iter_flags); =20 - for_each_sg(req->sg, sg, req->sg_cnt, i) { - while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) - !=3D sg->length) { - struct bio *prev =3D bio; - - if (req->metadata_len) { - rc =3D nvmet_bdev_alloc_bip(req, bio, - &prot_miter); - if (unlikely(rc)) { - bio_io_error(bio); - return; + if (!pqt_enabled) { + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + !=3D sg->length) { + struct bio *prev =3D bio; + + if (req->metadata_len) { + rc =3D nvmet_bdev_alloc_bip(req, bio, + &prot_miter); + if (unlikely(rc)) { + bio_io_error(bio); + return; + } } - } =20 - bio =3D bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), - opf, GFP_KERNEL); - bio->bi_iter.bi_sector =3D sector; + bio =3D bio_alloc(req->ns->bdev, bio_max_segs(sg_cnt), + opf, GFP_KERNEL); + bio->bi_iter.bi_sector =3D sector; =20 - bio_chain(bio, prev); - submit_bio(prev); - } + bio_chain(bio, prev); + submit_bio(prev); + } =20 - sector +=3D sg->length >> 9; - sg_cnt--; + sector +=3D sg->length >> 9; + sg_cnt--; + } + } else { + bio_list_init(&req_done->blist); + if (!test_bit(QUEUE_FLAG_POLL, &bdev->bd_disk->queue->queue_flags)) + goto err_bio; + max_sectors =3D bdev->bd_disk->queue->limits.max_sectors; + max_sectors <<=3D 9; + max_segments =3D queue_max_segments(bdev->bd_disk->queue); + sg_len =3D 0; + unsigned int offset, len, vec_len, i; + bool sg_start_pg =3D true, need_chain_bio =3D false; + struct page *sglist_page, *max_sector_align; + sector_t temp_sector; + + /* + * for bio's polling mode we will split bio to + * avoid low level's bio splitting when submit. 
+ */ + for_each_sg(req->sg, sg, req->sg_cnt, i) { + temp_sector =3D sector; + offset =3D (sg->offset % PAGE_SIZE); + if (offset + sg->length > PAGE_SIZE) { // need to split + len =3D sg->length; + i =3D 0; + sglist_page =3D virt_to_page(page_to_virt(sg_page(sg)) + offset); + if (offset !=3D 0) + sg_start_pg =3D false; + while (len > PAGE_SIZE) { + max_sector_align =3D virt_to_page(page_to_virt(sglist_page) + + (PAGE_SIZE*i)); + vec_len =3D sg_start_pg?PAGE_SIZE:(PAGE_SIZE - offset); + if (bio->bi_vcnt =3D=3D max_segments - 1 || + sg_len + vec_len > max_sectors) + need_chain_bio =3D true; + else { + __bio_add_page(bio, max_sector_align, + vec_len, sg_start_pg?0:offset); + temp_sector +=3D vec_len >> 9; + sg_len +=3D vec_len; + } + if (need_chain_bio) { + rc =3D nvmet_chain_par_bio(req, &bio, &prot_miter, + bdev, temp_sector, &req_done->blist); + if (unlikely(rc)) + goto err_bio; + __bio_add_page(bio, max_sector_align, vec_len, + sg_start_pg?0:(PAGE_SIZE - offset)); + temp_sector +=3D vec_len >> 9; + sg_len =3D vec_len; + need_chain_bio =3D false; + } + if (!sg_start_pg) { + len -=3D (PAGE_SIZE - offset); + sg_start_pg =3D true; + } else { + len -=3D PAGE_SIZE; + } + i++; + } + if (len > 0) { + max_sector_align =3D virt_to_page(page_to_virt(sglist_page) + + (i * PAGE_SIZE)); + if (bio->bi_vcnt =3D=3D max_segments - 1 || + sg_len + len > max_sectors) { + rc =3D nvmet_chain_par_bio(req, &bio, &prot_miter, + bdev, temp_sector, &req_done->blist); + if (unlikely(rc)) + goto err_bio; + sg_len =3D len; + } else { + sg_len +=3D len; + } + __bio_add_page(bio, max_sector_align, len, 0); + temp_sector +=3D len >> 9; + } + } else { + if (bio->bi_vcnt =3D=3D max_segments - 1 || + sg_len + sg->length > max_sectors) { + rc =3D nvmet_chain_par_bio(req, &bio, &prot_miter, + bdev, temp_sector, &req_done->blist); + if (unlikely(rc)) + goto err_bio; + sg_len =3D sg->length; + } else { + sg_len +=3D sg->length; + } + __bio_add_page(bio, sg_page(sg), sg->length, sg->offset); + } + sector +=3D sg->length >> 9; + sg_cnt--; + } } =20 if (req->metadata_len) { rc =3D nvmet_bdev_alloc_bip(req, bio, &prot_miter); if (unlikely(rc)) { - bio_io_error(bio); - return; + goto err_bio; } } =20 - submit_bio(bio); - blk_finish_plug(&plug); + if (pqt_enabled) { + bio->bi_opf |=3D REQ_POLLED; + bio->bi_opf |=3D REQ_NOWAIT; + bio->bi_opf |=3D REQ_NOMERGE; + bio_list_add(&req_done->blist, bio); + req_done->io_completed =3D 0; + rc =3D nvmet_pqt_ring_enqueue(req_done); + if (rc < 0) + goto err_bio; + nvmet_wakeup_pq_thread(); + } else { + submit_bio(bio); + } + if (!pqt_enabled) + blk_finish_plug(&plug); + return; +err_bio: + bio_io_error(bio); + if (pqt_enabled) + kfree(req_done); + return; } =20 static void nvmet_bdev_execute_flush(struct nvmet_req *req) --=20 2.26.2 From nobody Wed Feb 11 22:55:01 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 1564DCA550C for ; Wed, 13 Sep 2023 08:38:54 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S239005AbjIMIi4 (ORCPT ); Wed, 13 Sep 2023 04:38:56 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:39254 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S239020AbjIMIir (ORCPT ); Wed, 13 Sep 2023 04:38:47 -0400 Received: from m12.mail.163.com (m12.mail.163.com [220.181.12.217]) by lindbergh.monkeyblade.net (Postfix) with 
From: Ping Gan
To: kbusch@kernel.org, axboe@kernel.dk, hch@lst.de, sagi@grimberg.me, kch@nvidia.com, linux-kernel@vger.kernel.org, linux-nvme@lists.infradead.org
Cc: ping_gan@dell.com, jacky_gam_2001@163.com
Subject: [PATCH 4/4] nvme-core: Get lowlevel disk for target polling queue task
Date: Wed, 13 Sep 2023 16:36:44 +0800
Message-Id: <006b6aefe94d73ee64931c769af4a908616439ad.1694592708.git.jacky_gam_2001@163.com>

When multipath is enabled and the nvmet block device is an nvme_ns_head
disk, we should get the low-level block device to do the bio split.

Signed-off-by: Ping Gan
---
 drivers/nvme/host/multipath.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 0a88d7bdc5e3..f6063600e06e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -371,6 +371,25 @@ static bool nvme_available_path(struct nvme_ns_head *head)
 	return false;
 }
 
+/* for the polling queue task to get the lowlevel block device */
+struct block_device *nvme_mpath_get_bdev(struct block_device *bdev)
+{
+	struct nvme_ns_head *head = bdev->bd_disk->private_data;
+	int srcu_idx;
+	struct nvme_ns *ns;
+	struct block_device *ret = NULL;
+
+	if (!multipath)
+		return NULL;
+	srcu_idx = srcu_read_lock(&head->srcu);
+	ns = nvme_find_path(head);
+	if (likely(ns))
+		ret = ns->disk->part0;
+	srcu_read_unlock(&head->srcu, srcu_idx);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(nvme_mpath_get_bdev);
+
 static void nvme_ns_head_submit_bio(struct bio *bio)
 {
 	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
@@ -452,6 +471,7 @@ const struct block_device_operations nvme_ns_head_ops = {
 	.report_zones	= nvme_ns_head_report_zones,
 	.pr_ops		= &nvme_pr_ops,
 };
+EXPORT_SYMBOL_GPL(nvme_ns_head_ops);
 
 static inline struct nvme_ns_head *cdev_to_ns_head(struct cdev *cdev)
 {
-- 
2.26.2
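
The request ring that patches 1-3 rely on is a bufring-style, fixed-capacity
array of pointers indexed by free-running head/tail counters. The sketch
below is a single-threaded userspace model of just the index arithmetic, for
illustration only; the actual polling-queue-thread.c code adds cmpxchg-based
head reservation, memory barriers and multi-producer support, and the names
used here (ring_enqueue, ring_dequeue) are invented for the example.

#include <stdio.h>
#include <stdint.h>

#define RING_SLOTS 8			/* must be a power of two */

struct ring {
	uint32_t head;			/* advanced by the producer */
	uint32_t tail;			/* advanced by the consumer */
	void *slot[RING_SLOTS];
};

static int ring_enqueue(struct ring *r, void *obj)
{
	/* keep one slot unused so a full ring is distinguishable from empty */
	if (r->head - r->tail == RING_SLOTS - 1)
		return -1;		/* full; the patch returns -ENOBUFS */
	r->slot[r->head & (RING_SLOTS - 1)] = obj;
	r->head++;
	return 0;
}

static void *ring_dequeue(struct ring *r)
{
	void *obj;

	if (r->head == r->tail)
		return NULL;		/* empty; the patch returns -ENOENT */
	obj = r->slot[r->tail & (RING_SLOTS - 1)];
	r->tail++;
	return obj;
}

int main(void)
{
	struct ring r = { 0 };
	int values[3] = { 1, 2, 3 };
	void *p;

	for (int i = 0; i < 3; i++)
		ring_enqueue(&r, &values[i]);
	while ((p = ring_dequeue(&r)))
		printf("dequeued %d\n", *(int *)p);
	return 0;
}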