When the ctrl is not in LIVE state, a hardware queue can be in the
INACTIVE state due to CPU hotplug offlining operations. In this case,
the driver will freeze and quiesce the request queue and doesn't expect
new requests entering via queue_rq. Though such a request would fail
eventually anyway, shortcut it and fail it earlier.
Check if a request is targeted at an inactive hardware queue and, if so,
use nvme_failover_req to hand it back to the block layer.
Signed-off-by: Daniel Wagner <wagi@kernel.org>
---
drivers/nvme/host/core.c | 55 ++++++++++++++++++++++++++++++++++++++++++-
drivers/nvme/host/multipath.c | 43 ---------------------------------
drivers/nvme/host/nvme.h | 1 -
3 files changed, 54 insertions(+), 45 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f5ebcaa2f859..e84df1a2d321 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -454,6 +454,51 @@ void nvme_end_req(struct request *req)
blk_mq_end_request(req, status);
}
+static void nvme_failover_req(struct request *req)
+{
+ struct nvme_ns *ns = req->q->queuedata;
+ u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
+ unsigned long flags;
+ struct bio *bio;
+
+ if (nvme_ns_head_multipath(ns->head))
+ nvme_mpath_clear_current_path(ns);
+
+ /*
+ * If we got back an ANA error, we know the controller is alive but not
+ * ready to serve this namespace. Kick of a re-read of the ANA
+ * information page, and just try any other available path for now.
+ */
+ if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
+ set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+ queue_work(nvme_wq, &ns->ctrl->ana_work);
+ }
+
+ spin_lock_irqsave(&ns->head->requeue_lock, flags);
+ for (bio = req->bio; bio; bio = bio->bi_next) {
+ if (nvme_ns_head_multipath(ns->head))
+ bio_set_dev(bio, ns->head->disk->part0);
+ if (bio->bi_opf & REQ_POLLED) {
+ bio->bi_opf &= ~REQ_POLLED;
+ bio->bi_cookie = BLK_QC_T_NONE;
+ }
+ /*
+ * The alternate request queue that we may end up submitting
+ * the bio to may be frozen temporarily, in this case REQ_NOWAIT
+ * will fail the I/O immediately with EAGAIN to the issuer.
+ * We are not in the issuer context which cannot block. Clear
+ * the flag to avoid spurious EAGAIN I/O failures.
+ */
+ bio->bi_opf &= ~REQ_NOWAIT;
+ }
+ blk_steal_bios(&ns->head->requeue_list, req);
+ spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
+
+ nvme_req(req)->status = 0;
+ nvme_end_req(req);
+ kblockd_schedule_work(&ns->head->requeue_work);
+}
+
void nvme_complete_rq(struct request *req)
{
struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
@@ -762,8 +807,13 @@ blk_status_t nvme_fail_nonready_command(struct nvme_ctrl *ctrl,
state != NVME_CTRL_DELETING &&
state != NVME_CTRL_DEAD &&
!test_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags) &&
- !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH))
+ !blk_noretry_request(rq) && !(rq->cmd_flags & REQ_NVME_MPATH)) {
+ if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state)) {
+ nvme_failover_req(rq);
+ return BLK_STS_OK;
+ }
return BLK_STS_RESOURCE;
+ }
if (!(rq->rq_flags & RQF_DONTPREP))
nvme_clear_nvme_request(rq);
@@ -809,6 +859,9 @@ bool __nvme_check_ready(struct nvme_ctrl *ctrl, struct request *rq,
}
}
+ if (test_bit(BLK_MQ_S_INACTIVE, &rq->mq_hctx->state))
+ return false;
+
return queue_live;
}
EXPORT_SYMBOL_GPL(__nvme_check_ready);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 174027d1cc19..cce3a23f6de5 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -134,49 +134,6 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
blk_freeze_queue_start(h->disk->queue);
}
-void nvme_failover_req(struct request *req)
-{
- struct nvme_ns *ns = req->q->queuedata;
- u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
- unsigned long flags;
- struct bio *bio;
-
- nvme_mpath_clear_current_path(ns);
-
- /*
- * If we got back an ANA error, we know the controller is alive but not
- * ready to serve this namespace. Kick of a re-read of the ANA
- * information page, and just try any other available path for now.
- */
- if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
- set_bit(NVME_NS_ANA_PENDING, &ns->flags);
- queue_work(nvme_wq, &ns->ctrl->ana_work);
- }
-
- spin_lock_irqsave(&ns->head->requeue_lock, flags);
- for (bio = req->bio; bio; bio = bio->bi_next) {
- bio_set_dev(bio, ns->head->disk->part0);
- if (bio->bi_opf & REQ_POLLED) {
- bio->bi_opf &= ~REQ_POLLED;
- bio->bi_cookie = BLK_QC_T_NONE;
- }
- /*
- * The alternate request queue that we may end up submitting
- * the bio to may be frozen temporarily, in this case REQ_NOWAIT
- * will fail the I/O immediately with EAGAIN to the issuer.
- * We are not in the issuer context which cannot block. Clear
- * the flag to avoid spurious EAGAIN I/O failures.
- */
- bio->bi_opf &= ~REQ_NOWAIT;
- }
- blk_steal_bios(&ns->head->requeue_list, req);
- spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-
- nvme_req(req)->status = 0;
- nvme_end_req(req);
- kblockd_schedule_work(&ns->head->requeue_work);
-}
-
void nvme_mpath_start_request(struct request *rq)
{
struct nvme_ns *ns = rq->q->queuedata;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a5f28c5103c..dbd063413da9 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -967,7 +967,6 @@ void nvme_mpath_unfreeze(struct nvme_subsystem *subsys);
void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
-void nvme_failover_req(struct request *req);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_sysfs_link(struct nvme_ns_head *ns);
--
2.53.0
Hi Daniel,
kernel test robot noticed the following build errors:
[auto build test ERROR on 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f]
url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Wagner/nvme-failover-requests-for-inactive-hctx/20260226-224213
base: 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f
patch link: https://lore.kernel.org/r/20260226-revert-cpu-read-lock-v1-1-eb005072566e%40kernel.org
patch subject: [PATCH 1/3] nvme: failover requests for inactive hctx
config: x86_64-randconfig-r071-20260227 (https://download.01.org/0day-ci/archive/20260227/202602270720.cugNS3m1-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
smatch version: v0.5.0-8994-gd50c5a4c
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260227/202602270720.cugNS3m1-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602270720.cugNS3m1-lkp@intel.com/
All errors (new ones prefixed by >>):
>> drivers/nvme/host/core.c:457:13: error: redefinition of 'nvme_failover_req'
457 | static void nvme_failover_req(struct request *req)
| ^~~~~~~~~~~~~~~~~
In file included from drivers/nvme/host/core.c:27:
drivers/nvme/host/nvme.h:1020:20: note: previous definition of 'nvme_failover_req' with type 'void(struct request *)'
1020 | static inline void nvme_failover_req(struct request *req)
| ^~~~~~~~~~~~~~~~~
drivers/nvme/host/core.c: In function 'nvme_failover_req':
>> drivers/nvme/host/core.c:472:50: error: 'struct nvme_ctrl' has no member named 'ana_log_buf'
472 | if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
| ^~
>> drivers/nvme/host/core.c:474:48: error: 'struct nvme_ctrl' has no member named 'ana_work'; did you mean 'ka_work'?
474 | queue_work(nvme_wq, &ns->ctrl->ana_work);
| ^~~~~~~~
| ka_work
In file included from include/linux/sched.h:37,
from include/linux/ratelimit.h:6,
from include/linux/dev_printk.h:16,
from include/linux/device.h:15,
from include/linux/async.h:14,
from drivers/nvme/host/core.c:7:
>> drivers/nvme/host/core.c:477:36: error: 'struct nvme_ns_head' has no member named 'requeue_lock'
477 | spin_lock_irqsave(&ns->head->requeue_lock, flags);
| ^~
include/linux/spinlock.h:244:48: note: in definition of macro 'raw_spin_lock_irqsave'
244 | flags = _raw_spin_lock_irqsave(lock); \
| ^~~~
drivers/nvme/host/core.c:477:9: note: in expansion of macro 'spin_lock_irqsave'
477 | spin_lock_irqsave(&ns->head->requeue_lock, flags);
| ^~~~~~~~~~~~~~~~~
>> drivers/nvme/host/core.c:494:33: error: 'struct nvme_ns_head' has no member named 'requeue_list'
494 | blk_steal_bios(&ns->head->requeue_list, req);
| ^~
drivers/nvme/host/core.c:495:41: error: 'struct nvme_ns_head' has no member named 'requeue_lock'
495 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
| ^~
>> drivers/nvme/host/core.c:499:40: error: 'struct nvme_ns_head' has no member named 'requeue_work'
499 | kblockd_schedule_work(&ns->head->requeue_work);
| ^~
vim +/nvme_failover_req +457 drivers/nvme/host/core.c
456
> 457 static void nvme_failover_req(struct request *req)
458 {
459 struct nvme_ns *ns = req->q->queuedata;
460 u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
461 unsigned long flags;
462 struct bio *bio;
463
464 if (nvme_ns_head_multipath(ns->head))
465 nvme_mpath_clear_current_path(ns);
466
467 /*
468 * If we got back an ANA error, we know the controller is alive but not
469 * ready to serve this namespace. Kick of a re-read of the ANA
470 * information page, and just try any other available path for now.
471 */
> 472 if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
473 set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> 474 queue_work(nvme_wq, &ns->ctrl->ana_work);
475 }
476
> 477 spin_lock_irqsave(&ns->head->requeue_lock, flags);
478 for (bio = req->bio; bio; bio = bio->bi_next) {
479 if (nvme_ns_head_multipath(ns->head))
480 bio_set_dev(bio, ns->head->disk->part0);
481 if (bio->bi_opf & REQ_POLLED) {
482 bio->bi_opf &= ~REQ_POLLED;
483 bio->bi_cookie = BLK_QC_T_NONE;
484 }
485 /*
486 * The alternate request queue that we may end up submitting
487 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
488 * will fail the I/O immediately with EAGAIN to the issuer.
489 * We are not in the issuer context which cannot block. Clear
490 * the flag to avoid spurious EAGAIN I/O failures.
491 */
492 bio->bi_opf &= ~REQ_NOWAIT;
493 }
> 494 blk_steal_bios(&ns->head->requeue_list, req);
495 spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
496
497 nvme_req(req)->status = 0;
498 nvme_end_req(req);
> 499 kblockd_schedule_work(&ns->head->requeue_work);
500 }
501
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Hi Daniel,
kernel test robot noticed the following build errors:
[auto build test ERROR on 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f]
url: https://github.com/intel-lab-lkp/linux/commits/Daniel-Wagner/nvme-failover-requests-for-inactive-hctx/20260226-224213
base: 6de23f81a5e08be8fbf5e8d7e9febc72a5b5f27f
patch link: https://lore.kernel.org/r/20260226-revert-cpu-read-lock-v1-1-eb005072566e%40kernel.org
patch subject: [PATCH 1/3] nvme: failover requests for inactive hctx
config: riscv-defconfig (https://download.01.org/0day-ci/archive/20260227/202602270348.j0MMNhUj-lkp@intel.com/config)
compiler: clang version 23.0.0git (https://github.com/llvm/llvm-project 9a109fbb6e184ec9bcce10615949f598f4c974a9)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260227/202602270348.j0MMNhUj-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202602270348.j0MMNhUj-lkp@intel.com/
All errors (new ones prefixed by >>):
drivers/nvme/host/core.c:457:13: error: redefinition of 'nvme_failover_req'
457 | static void nvme_failover_req(struct request *req)
| ^
drivers/nvme/host/nvme.h:1020:20: note: previous definition is here
1020 | static inline void nvme_failover_req(struct request *req)
| ^
>> drivers/nvme/host/core.c:472:45: error: no member named 'ana_log_buf' in 'struct nvme_ctrl'
472 | if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
| ~~~~~~~~ ^
>> drivers/nvme/host/core.c:474:34: error: no member named 'ana_work' in 'struct nvme_ctrl'
474 | queue_work(nvme_wq, &ns->ctrl->ana_work);
| ~~~~~~~~ ^
>> drivers/nvme/host/core.c:477:31: error: no member named 'requeue_lock' in 'struct nvme_ns_head'
477 | spin_lock_irqsave(&ns->head->requeue_lock, flags);
| ~~~~~~~~ ^
include/linux/spinlock.h:376:39: note: expanded from macro 'spin_lock_irqsave'
376 | raw_spin_lock_irqsave(spinlock_check(lock), flags); \
| ^~~~
include/linux/spinlock.h:244:34: note: expanded from macro 'raw_spin_lock_irqsave'
244 | flags = _raw_spin_lock_irqsave(lock); \
| ^~~~
>> drivers/nvme/host/core.c:494:28: error: no member named 'requeue_list' in 'struct nvme_ns_head'
494 | blk_steal_bios(&ns->head->requeue_list, req);
| ~~~~~~~~ ^
drivers/nvme/host/core.c:495:36: error: no member named 'requeue_lock' in 'struct nvme_ns_head'
495 | spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
| ~~~~~~~~ ^
>> drivers/nvme/host/core.c:499:35: error: no member named 'requeue_work' in 'struct nvme_ns_head'
499 | kblockd_schedule_work(&ns->head->requeue_work);
| ~~~~~~~~ ^
7 errors generated.
vim +472 drivers/nvme/host/core.c
456
457 static void nvme_failover_req(struct request *req)
458 {
459 struct nvme_ns *ns = req->q->queuedata;
460 u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
461 unsigned long flags;
462 struct bio *bio;
463
464 if (nvme_ns_head_multipath(ns->head))
465 nvme_mpath_clear_current_path(ns);
466
467 /*
468 * If we got back an ANA error, we know the controller is alive but not
469 * ready to serve this namespace. Kick of a re-read of the ANA
470 * information page, and just try any other available path for now.
471 */
> 472 if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
473 set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> 474 queue_work(nvme_wq, &ns->ctrl->ana_work);
475 }
476
> 477 spin_lock_irqsave(&ns->head->requeue_lock, flags);
478 for (bio = req->bio; bio; bio = bio->bi_next) {
479 if (nvme_ns_head_multipath(ns->head))
480 bio_set_dev(bio, ns->head->disk->part0);
481 if (bio->bi_opf & REQ_POLLED) {
482 bio->bi_opf &= ~REQ_POLLED;
483 bio->bi_cookie = BLK_QC_T_NONE;
484 }
485 /*
486 * The alternate request queue that we may end up submitting
487 * the bio to may be frozen temporarily, in this case REQ_NOWAIT
488 * will fail the I/O immediately with EAGAIN to the issuer.
489 * We are not in the issuer context which cannot block. Clear
490 * the flag to avoid spurious EAGAIN I/O failures.
491 */
492 bio->bi_opf &= ~REQ_NOWAIT;
493 }
> 494 blk_steal_bios(&ns->head->requeue_list, req);
495 spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
496
497 nvme_req(req)->status = 0;
498 nvme_end_req(req);
> 499 kblockd_schedule_work(&ns->head->requeue_work);
500 }
501
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.