[PATCH RFC 3/3] nvme: delay failover by command quiesce timeout

Daniel Wagner posted 3 patches 8 months, 3 weeks ago
[PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months, 3 weeks ago
TP4129 mandates that the failover should be delayed by CQT. Thus, when
nvme_decide_disposition returns FAILOVER, do not immediately re-queue the
request at the namespace level; instead, queue it on the ctrl's failover_list
and move it later to the namespace's requeue_list.

Signed-off-by: Daniel Wagner <wagi@kernel.org>
---
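For context, a rough sketch of where the FAILOVER disposition is taken and
what changes with this patch (simplified from nvme_complete_rq(); the wrapper
name is made up and this sketch is not part of the diff):

static void nvme_handle_disposition(struct request *req)
{
	switch (nvme_decide_disposition(req)) {
	case COMPLETE:
		nvme_end_req(req);
		break;
	case RETRY:
		nvme_retry_req(req);
		break;
	case FAILOVER:
		/*
		 * With this patch: the bios are still stolen onto the
		 * head's requeue_list, but the request itself is parked
		 * on ctrl->failover_list and only ended by
		 * nvme_failover_work() after the CQT/KATO delay.
		 */
		nvme_failover_req(req);
		break;
	default:
		/* AUTHENTICATE handling omitted for brevity */
		break;
	}
}
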
 drivers/nvme/host/core.c      | 19 ++++++++++++++++
 drivers/nvme/host/fc.c        |  4 ++++
 drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
 drivers/nvme/host/nvme.h      | 15 +++++++++++++
 drivers/nvme/host/rdma.c      |  2 ++
 drivers/nvme/host/tcp.c       |  1 +
 6 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
 
 	flush_work(&ctrl->reset_work);
 	nvme_stop_ctrl(ctrl);
+	nvme_flush_failover(ctrl);
 	nvme_remove_namespaces(ctrl);
 	ctrl->ops->delete_ctrl(ctrl);
 	nvme_uninit_ctrl(ctrl);
@@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
 	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
 }
 
+void nvme_schedule_failover(struct nvme_ctrl *ctrl)
+{
+	unsigned long delay;
+
+	if (ctrl->cqt)
+		delay = msecs_to_jiffies(ctrl->cqt);
+	else
+		delay = ctrl->kato * HZ;
+
+	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
+}
+EXPORT_SYMBOL_GPL(nvme_schedule_failover);
+
 static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
 						 blk_status_t status)
 {
@@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
 		dev_err(ctrl->device,
 			"failed nvme_keep_alive_end_io error=%d\n",
 				status);
+
+		nvme_schedule_failover(ctrl);
 		return RQ_END_IO_NONE;
 	}
 
@@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
 
 void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
 {
+	nvme_schedule_failover(ctrl);
 	nvme_mpath_stop(ctrl);
 	nvme_auth_stop(ctrl);
 	nvme_stop_failfast_work(ctrl);
@@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 
 	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
 	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
+	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
+	INIT_LIST_HEAD(&ctrl->failover_list);
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 	ctrl->ka_last_check_time = jiffies;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
 {
 	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
 
+	nvme_schedule_failover(&ctrl->ctrl);
+
 	/*
 	 * if an error (io timeout, etc) while (re)connecting, the remote
 	 * port requested terminating of the association (disconnect_ls)
@@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
 	/* will block will waiting for io to terminate */
 	nvme_fc_delete_association(ctrl);
 
+	nvme_schedule_failover(&ctrl->ctrl);
+
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
 		dev_err(ctrl->ctrl.device,
 			"NVME-FC{%d}: error_recovery: Couldn't change state "
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
+	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
 	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
 	unsigned long flags;
 	struct bio *bio;
+	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 
 	nvme_mpath_clear_current_path(ns);
 
@@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 
-	nvme_req(req)->status = 0;
-	nvme_end_req(req);
-	kblockd_schedule_work(&ns->head->requeue_work);
+	spin_lock_irqsave(&ctrl->lock, flags);
+	list_add_tail(&req->queuelist, &ctrl->failover_list);
+	spin_unlock_irqrestore(&ctrl->lock, flags);
+
+	if (state == NVME_CTRL_DELETING) {
+		/*
+	 * Requests which fail over in the DELETING state were
+	 * canceled, and blk_mq_tagset_wait_completed_request will
+	 * block until they have been processed, though
+	 * nvme_failover_work is already stopped. Thus schedule
+	 * a failover; it's still necessary to delay these commands
+	 * by CQT.
+		 */
+		nvme_schedule_failover(ctrl);
+	}
+}
+
+void nvme_flush_failover(struct nvme_ctrl *ctrl)
+{
+	LIST_HEAD(failover_list);
+	struct request *rq;
+	bool kick = false;
+
+	spin_lock_irq(&ctrl->lock);
+	list_splice_init(&ctrl->failover_list, &failover_list);
+	spin_unlock_irq(&ctrl->lock);
+
+	while (!list_empty(&failover_list)) {
+		rq = list_entry(failover_list.next,
+				struct request, queuelist);
+		list_del_init(&rq->queuelist);
+
+		nvme_req(rq)->status = 0;
+		nvme_end_req(rq);
+		kick = true;
+	}
+
+	if (kick)
+		nvme_kick_requeue_lists(ctrl);
+}
+
+void nvme_failover_work(struct work_struct *work)
+{
+	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
+					struct nvme_ctrl, failover_work);
+
+	nvme_flush_failover(ctrl);
 }
 
 void nvme_mpath_start_request(struct request *rq)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7563332b5b7b76fc6165ec8c6f2d144737d4fe85..10eb323bdaf139526959180c1e66ab4579bb145d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -411,6 +411,9 @@ struct nvme_ctrl {
 
 	enum nvme_ctrl_type cntrltype;
 	enum nvme_dctype dctype;
+
+	struct delayed_work failover_work;
+	struct list_head failover_list;
 };
 
 static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
@@ -954,6 +957,9 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
 void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
 void nvme_failover_req(struct request *req);
+void nvme_failover_work(struct work_struct *work);
+void nvme_schedule_failover(struct nvme_ctrl *ctrl);
+void nvme_flush_failover(struct nvme_ctrl *ctrl);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
@@ -996,6 +1002,15 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_failover_work(struct work_struct *work)
+{
+}
+static inline void nvme_schedule_failover(struct nvme_ctrl *ctrl)
+{
+}
+static inline void nvme_flush_failover(struct nvme_ctrl *ctrl)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 86a2891d9bcc7a990cd214a7fe93fa5c55b292c7..9bee376f881b4c3ebe5502abf23a8e76829780ff 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1127,6 +1127,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
 
 	nvme_stop_keep_alive(&ctrl->ctrl);
 	flush_work(&ctrl->ctrl.async_event_work);
+	nvme_schedule_failover(&ctrl->ctrl);
 	nvme_rdma_teardown_io_queues(ctrl, false);
 	nvme_unquiesce_io_queues(&ctrl->ctrl);
 	nvme_rdma_teardown_admin_queue(ctrl, false);
@@ -2153,6 +2154,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
 
 static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 {
+	nvme_schedule_failover(&ctrl->ctrl);
 	nvme_rdma_teardown_io_queues(ctrl, shutdown);
 	nvme_quiesce_admin_queue(&ctrl->ctrl);
 	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index d0023bcfd8a79a193adf2807a24481c8c164a174..3a6c1d3febaf233996e4dcf684793327b5d1412f 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 
 	nvme_stop_keep_alive(ctrl);
 	flush_work(&ctrl->async_event_work);
+	nvme_schedule_failover(ctrl);
 	nvme_tcp_teardown_io_queues(ctrl, false);
 	/* unquiesce to fail fast pending requests */
 	nvme_unquiesce_io_queues(ctrl);

-- 
2.48.1
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-03-24 13:07:58 +0100, Daniel Wagner wrote:
> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
> the namespace level instead queue it on the ctrl's request_list and
> moved later to the namespace's requeue_list.
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>  drivers/nvme/host/core.c      | 19 ++++++++++++++++
>  drivers/nvme/host/fc.c        |  4 ++++
>  drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>  drivers/nvme/host/nvme.h      | 15 +++++++++++++
>  drivers/nvme/host/rdma.c      |  2 ++
>  drivers/nvme/host/tcp.c       |  1 +
>  6 files changed, 90 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>  
>  	flush_work(&ctrl->reset_work);
>  	nvme_stop_ctrl(ctrl);
> +	nvme_flush_failover(ctrl);
>  	nvme_remove_namespaces(ctrl);
>  	ctrl->ops->delete_ctrl(ctrl);
>  	nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>  	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>  }
>  
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long delay;
> +
> +	if (ctrl->cqt)
> +		delay = msecs_to_jiffies(ctrl->cqt);
> +	else
> +		delay = ctrl->kato * HZ;
> +
> +	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
>  static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  						 blk_status_t status)
>  {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  		dev_err(ctrl->device,
>  			"failed nvme_keep_alive_end_io error=%d\n",
>  				status);
> +
> +		nvme_schedule_failover(ctrl);
>  		return RQ_END_IO_NONE;
>  	}
>  
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>  
>  void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>  {
> +	nvme_schedule_failover(ctrl);
>  	nvme_mpath_stop(ctrl);
>  	nvme_auth_stop(ctrl);
>  	nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  
>  	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
>  	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> +	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> +	INIT_LIST_HEAD(&ctrl->failover_list);
>  	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
>  	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
>  	ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>  {
>  	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	/*
>  	 * if an error (io timeout, etc) while (re)connecting, the remote
>  	 * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>  	/* will block will waiting for io to terminate */
>  	nvme_fc_delete_association(ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>  		dev_err(ctrl->ctrl.device,
>  			"NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>  void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
>  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
>  	unsigned long flags;
>  	struct bio *bio;
> +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>  
>  	nvme_mpath_clear_current_path(ns);
>  
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
>  	blk_steal_bios(&ns->head->requeue_list, req);
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>  
> -	nvme_req(req)->status = 0;
> -	nvme_end_req(req);
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> +	spin_unlock_irqrestore(&ctrl->lock, flags);
> +

In case the delay in nvme_schedule_failover() is larger than the request
timeout, is it possible for the timeout callback to be called while a
request is sitting in failover_list?

Is there any guarantee to prevent this from happening? I understand from
the patch that we do not want this to happen, right?
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 15, 2025 at 05:23:24PM -0700, Mohamed Khalfella wrote:
> > -	nvme_req(req)->status = 0;
> > -	nvme_end_req(req);
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +	spin_lock_irqsave(&ctrl->lock, flags);
> > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> > +
> 
> In case the delay in nvme_schedule_failover() is larget than request
> timeout, is it possible for timeout callback to be called while a
> request is sitting in failover_list?

Yes this can happen.

> Is there any guarantee to prevent this from happening? I understand from
> the patch that we do not want this to happen, right?

As already said, I think with the failover queue we need to handle timeouts
on a per-command basis.
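
A purely illustrative sketch of what handling timeouts on a per-command basis
could look like (not part of this RFC; the failover_deadline field is
hypothetical): give each parked request its own deadline and only complete
the ones whose quiesce window has elapsed.

static void nvme_flush_expired_failover(struct nvme_ctrl *ctrl)
{
	struct request *rq, *next;
	LIST_HEAD(expired);

	spin_lock_irq(&ctrl->lock);
	list_for_each_entry_safe(rq, next, &ctrl->failover_list, queuelist) {
		/* failover_deadline would be set in nvme_failover_req() */
		if (time_after_eq(jiffies, nvme_req(rq)->failover_deadline))
			list_move_tail(&rq->queuelist, &expired);
	}
	spin_unlock_irq(&ctrl->lock);

	while (!list_empty(&expired)) {
		rq = list_first_entry(&expired, struct request, queuelist);
		list_del_init(&rq->queuelist);
		nvme_req(rq)->status = 0;
		nvme_end_req(rq);
	}
}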
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago

On 24/03/2025 14:07, Daniel Wagner wrote:
> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
> the namespace level instead queue it on the ctrl's request_list and
> moved later to the namespace's requeue_list.
>
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>   drivers/nvme/host/core.c      | 19 ++++++++++++++++
>   drivers/nvme/host/fc.c        |  4 ++++
>   drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>   drivers/nvme/host/nvme.h      | 15 +++++++++++++
>   drivers/nvme/host/rdma.c      |  2 ++
>   drivers/nvme/host/tcp.c       |  1 +
>   6 files changed, 90 insertions(+), 3 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>   
>   	flush_work(&ctrl->reset_work);
>   	nvme_stop_ctrl(ctrl);
> +	nvme_flush_failover(ctrl);

This will terminate all pending failover commands - is this the intended
behavior?

>   	nvme_remove_namespaces(ctrl);
>   	ctrl->ops->delete_ctrl(ctrl);
>   	nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>   	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>   }
>   
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long delay;
> +
> +	if (ctrl->cqt)
> +		delay = msecs_to_jiffies(ctrl->cqt);
> +	else
> +		delay = ctrl->kato * HZ;

This delay was there before?

> +
> +	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
>   static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>   						 blk_status_t status)
>   {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>   		dev_err(ctrl->device,
>   			"failed nvme_keep_alive_end_io error=%d\n",
>   				status);
> +
> +		nvme_schedule_failover(ctrl);
>   		return RQ_END_IO_NONE;
>   	}
>   
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>   
>   void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>   {
> +	nvme_schedule_failover(ctrl);
>   	nvme_mpath_stop(ctrl);
>   	nvme_auth_stop(ctrl);
>   	nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>   
>   	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
>   	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> +	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> +	INIT_LIST_HEAD(&ctrl->failover_list);
>   	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
>   	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
>   	ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>   {
>   	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>   
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>   	/*
>   	 * if an error (io timeout, etc) while (re)connecting, the remote
>   	 * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>   	/* will block will waiting for io to terminate */
>   	nvme_fc_delete_association(ctrl);
>   
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>   	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>   		dev_err(ctrl->ctrl.device,
>   			"NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
>   	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
>   	unsigned long flags;
>   	struct bio *bio;
> +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>   
>   	nvme_mpath_clear_current_path(ns);
>   
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
>   	blk_steal_bios(&ns->head->requeue_list, req);
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   
> -	nvme_req(req)->status = 0;
> -	nvme_end_req(req);
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> +	spin_unlock_irqrestore(&ctrl->lock, flags);

So these requests - whose bios we have stripped - are now placed
on a failover queue?

> +
> +	if (state == NVME_CTRL_DELETING) {
> +		/*
> +		 * request which fail over in the DELETING state were
> +		 * canceled and blk_mq_tagset_wait_completed_request will
> +		 * block until they have been proceed. Though
> +		 * nvme_failover_work is already stopped. Thus schedule
> +		 * a failover; it's still necessary to delay these commands
> +		 * by CQT.
> +		 */
> +		nvme_schedule_failover(ctrl);

This condition is unclear. Shouldn't any request that fails over do this?
Something is unclear here.

> +	}
> +}
> +
> +void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +	LIST_HEAD(failover_list);
> +	struct request *rq;
> +	bool kick = false;
> +
> +	spin_lock_irq(&ctrl->lock);
> +	list_splice_init(&ctrl->failover_list, &failover_list);
> +	spin_unlock_irq(&ctrl->lock);
> +
> +	while (!list_empty(&failover_list)) {
> +		rq = list_entry(failover_list.next,
> +				struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +
> +		nvme_req(rq)->status = 0;
> +		nvme_end_req(rq);
> +		kick = true;
> +	}
> +
> +	if (kick)
> +		nvme_kick_requeue_lists(ctrl);
> +}
> +
> +void nvme_failover_work(struct work_struct *work)
> +{
> +	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
> +					struct nvme_ctrl, failover_work);
> +
> +	nvme_flush_failover(ctrl);
>   }
>   
>   void nvme_mpath_start_request(struct request *rq)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 7563332b5b7b76fc6165ec8c6f2d144737d4fe85..10eb323bdaf139526959180c1e66ab4579bb145d 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -411,6 +411,9 @@ struct nvme_ctrl {
>   
>   	enum nvme_ctrl_type cntrltype;
>   	enum nvme_dctype dctype;
> +
> +	struct delayed_work failover_work;
> +	struct list_head failover_list;
>   };
>   
>   static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
> @@ -954,6 +957,9 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
>   void nvme_failover_req(struct request *req);
> +void nvme_failover_work(struct work_struct *work);
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl);
> +void nvme_flush_failover(struct nvme_ctrl *ctrl);
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
>   int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
>   void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
> @@ -996,6 +1002,15 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   static inline void nvme_failover_req(struct request *req)
>   {
>   }
> +static inline void nvme_failover_work(struct work_struct *work)
> +{
> +}
> +static inline void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +}
> +static inline void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +}
>   static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>   {
>   }
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 86a2891d9bcc7a990cd214a7fe93fa5c55b292c7..9bee376f881b4c3ebe5502abf23a8e76829780ff 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1127,6 +1127,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
>   
>   	nvme_stop_keep_alive(&ctrl->ctrl);
>   	flush_work(&ctrl->ctrl.async_event_work);
> +	nvme_schedule_failover(&ctrl->ctrl);
>   	nvme_rdma_teardown_io_queues(ctrl, false);
>   	nvme_unquiesce_io_queues(&ctrl->ctrl);
>   	nvme_rdma_teardown_admin_queue(ctrl, false);
> @@ -2153,6 +2154,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
>   
>   static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
>   {
> +	nvme_schedule_failover(&ctrl->ctrl);
>   	nvme_rdma_teardown_io_queues(ctrl, shutdown);
>   	nvme_quiesce_admin_queue(&ctrl->ctrl);
>   	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index d0023bcfd8a79a193adf2807a24481c8c164a174..3a6c1d3febaf233996e4dcf684793327b5d1412f 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>   
>   	nvme_stop_keep_alive(ctrl);
>   	flush_work(&ctrl->async_event_work);
> +	nvme_schedule_failover(ctrl);
>   	nvme_tcp_teardown_io_queues(ctrl, false);
>   	/* unquiesce to fail fast pending requests */
>   	nvme_unquiesce_io_queues(ctrl);
>

What is the reason for rdma and tcp not being identical?
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Mon, Apr 14, 2025 at 01:03:57AM +0300, Sagi Grimberg wrote:
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
> >   	flush_work(&ctrl->reset_work);
> >   	nvme_stop_ctrl(ctrl);
> > +	nvme_flush_failover(ctrl);
> 
> This will terminate all pending failvoer commands - this is the intended
> behavior?

Yes, it will; I don't think it should. From the feedback I gathered so far, it
seems the correct way (per the spec) is to handle each command individually.

FWIW, we are shipping

  https://lore.kernel.org/linux-nvme/20230908100049.80809-3-hare@suse.de/

as a stop-gap solution for a while and our customer wasn't able to
reproduce the ghost writes anymore. And as far as I can tell it also
fails over all pending commands.

> 
> >   	nvme_remove_namespaces(ctrl);
> >   	ctrl->ops->delete_ctrl(ctrl);
> >   	nvme_uninit_ctrl(ctrl);
> > @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
> >   	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
> >   }
> > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > +{
> > +	unsigned long delay;
> > +
> > +	if (ctrl->cqt)
> > +		delay = msecs_to_jiffies(ctrl->cqt);
> > +	else
> > +		delay = ctrl->kato * HZ;
> 
> This delay was there before?

No, this function returns the additional time the host should wait
before it starts to fail over. So this is the CQT value, which applies after
the KATO timeout protocol, and this timeout is not present in the nvme
subsystem yet.

> >   void nvme_failover_req(struct request *req)
> >   {
> >   	struct nvme_ns *ns = req->q->queuedata;
> > +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
> >   	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
> >   	unsigned long flags;
> >   	struct bio *bio;
> > +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> >   	nvme_mpath_clear_current_path(ns);
> > @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
> >   	blk_steal_bios(&ns->head->requeue_list, req);
> >   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> > -	nvme_req(req)->status = 0;
> > -	nvme_end_req(req);
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +	spin_lock_irqsave(&ctrl->lock, flags);
> > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> 
> So these request - which we stripped their bios, are now placed
> on a failover queue?

Yes, this is the idea.

> > +
> > +	if (state == NVME_CTRL_DELETING) {
> > +		/*
> > +		 * request which fail over in the DELETING state were
> > +		 * canceled and blk_mq_tagset_wait_completed_request will
> > +		 * block until they have been proceed. Though
> > +		 * nvme_failover_work is already stopped. Thus schedule
> > +		 * a failover; it's still necessary to delay these commands
> > +		 * by CQT.
> > +		 */
> > +		nvme_schedule_failover(ctrl);
> 
> This condition is unclear. Isn't any request failing over should do this?
> Something is unclear here.

Yes, the comment is not very clear. Sorry about that. I observed that
blk_mq_tagset_wait_completed_request would block when the controller was
already in the DELETING state and requests were canceled, IIRC. The commands
would be queued on the failover queue but never processed by
nvme_failover_work.
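
For reference, the wait mentioned above happens via the existing
nvme_cancel_tagset() helper, which looks roughly like this in current
mainline (shown only for context):

void nvme_cancel_tagset(struct nvme_ctrl *ctrl)
{
	if (ctrl->tagset) {
		/* marks pending requests with NVME_SC_HOST_ABORTED_CMD */
		blk_mq_tagset_busy_iter(ctrl->tagset,
				nvme_cancel_request, ctrl);
		/* blocks until every canceled request has been completed */
		blk_mq_tagset_wait_completed_request(ctrl->tagset);
	}
}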

> > +	nvme_schedule_failover(&ctrl->ctrl);
> >   	nvme_rdma_teardown_io_queues(ctrl, shutdown);
> >   	nvme_quiesce_admin_queue(&ctrl->ctrl);
> >   	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index d0023bcfd8a79a193adf2807a24481c8c164a174..3a6c1d3febaf233996e4dcf684793327b5d1412f 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
> >   	nvme_stop_keep_alive(ctrl);
> >   	flush_work(&ctrl->async_event_work);
> > +	nvme_schedule_failover(ctrl);
> >   	nvme_tcp_teardown_io_queues(ctrl, false);
> >   	/* unquiesce to fail fast pending requests */
> >   	nvme_unquiesce_io_queues(ctrl);
> > 
> 
> What is the reason for rdma and tcp not being identical?

Sorry, can't remember. But I really want to start moving this code into
one place.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Jiewei Ke 8 months, 1 week ago
Hi Daniel,

I just noticed that your patchset addresses a similar issue to the one I'm
trying to solve with my recently submitted patchset [1]. Compared to your
approach, mine differs in a few key aspects:

1. Only aborted requests are delayed for retry. In the current implementation,
nvmf_complete_timed_out_request and nvme_cancel_request set the request status
to NVME_SC_HOST_ABORTED_CMD. These requests are usually already sent to the
target, but may have timed out or been canceled before a response is received.
Since the target may still be processing them, the host needs to delay retrying
to ensure the target has completed or cleaned up the stale requests. On the
other hand, requests that are not aborted - such as those that never got
submitted due to no usable path (e.g., from nvme_ns_head_submit_bio), or those
that already received an ANA error from the target - do not need delayed retry
(see the sketch further below).

2. The host explicitly disconnects and stops KeepAlive before scheduling the
delayed retry of requests. This aligns with Section 9.6 "Communication Loss Handling"
of the NVMe Base Specification 2.1. Once the host disconnects, the target may
take up to the KATO interval to detect the lost connection and begin cleaning
up any remaining requests.

@@ -2178,6 +2180,7 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
 	nvme_quiesce_admin_queue(&ctrl->ctrl);
 	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
 	nvme_rdma_teardown_admin_queue(ctrl, shutdown);
+	nvme_delay_kick_retry_lists(&ctrl->ctrl); <<< delay kick retry after teardown all queues
 }

3. Delayed retry of aborted requests is supported across multiple scenarios,
including error recovery work, controller reset, controller deletion, and
controller reconnect failure handling. Here's the relevant code for reference.

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9109d5476417..f07b3960df7c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2449,6 +2449,7 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
 destroy_admin:
 	nvme_stop_keep_alive(ctrl);
 	nvme_tcp_teardown_admin_queue(ctrl, new);
+	nvme_delay_kick_retry_lists(ctrl); <<< requests may be timed out when ctrl reconnects
 	return ret;
 }

@@ -2494,6 +2495,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
 	nvme_tcp_teardown_admin_queue(ctrl, false);
 	nvme_unquiesce_admin_queue(ctrl);
 	nvme_auth_stop(ctrl);
+	nvme_delay_kick_retry_lists(ctrl); <<< retry_lists may contain timed out or cancelled requests

 	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
 		/* state change failure is ok if we started ctrl delete */

@@ -2513,6 +2515,7 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 	nvme_quiesce_admin_queue(ctrl);
 	nvme_disable_ctrl(ctrl, shutdown);
 	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
+	nvme_delay_kick_retry_lists(ctrl); <<< retry_lists may contain timed out or cancelled requests when ctrl reset or delete
 }

Besides, in nvme_tcp_error_recovery_work, the delayed retry must occur after
nvme_tcp_teardown_io_queues, because the teardown cancels requests that may need
to be retried too.
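
A minimal sketch of the distinction drawn in point 1 above (illustrative only,
not taken from the patchset; the helper name is hypothetical):

/*
 * Only commands the host aborted (timed out or canceled while the target
 * may still be processing them) need the delayed retry; everything else
 * can be requeued immediately.
 */
static bool nvme_req_needs_delayed_retry(struct request *req)
{
	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;

	/* set by nvmf_complete_timed_out_request()/nvme_cancel_request() */
	return status == NVME_SC_HOST_ABORTED_CMD;
}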

One limitation of my patchset is that it does not yet include full CQT support,
and due to testing environment constraints, only nvme_tcp and nvme_rdma are
currently covered.

I'd be happy to discuss the pros and cons of both approaches - perhaps we can
combine the best aspects.

Looking forward to your thoughts.

Thanks,
Jiewei

[1] https://lore.kernel.org/linux-nvme/20250410122054.2526358-1-jiewei@smartx.com/
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months, 1 week ago
On 2025-03-24 13:07:58 +0100, Daniel Wagner wrote:
> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
> the namespace level instead queue it on the ctrl's request_list and
> moved later to the namespace's requeue_list.
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>  drivers/nvme/host/core.c      | 19 ++++++++++++++++
>  drivers/nvme/host/fc.c        |  4 ++++
>  drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>  drivers/nvme/host/nvme.h      | 15 +++++++++++++
>  drivers/nvme/host/rdma.c      |  2 ++
>  drivers/nvme/host/tcp.c       |  1 +
>  6 files changed, 90 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>  
>  	flush_work(&ctrl->reset_work);
>  	nvme_stop_ctrl(ctrl);
> +	nvme_flush_failover(ctrl);
>  	nvme_remove_namespaces(ctrl);
>  	ctrl->ops->delete_ctrl(ctrl);
>  	nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>  	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>  }
>  
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long delay;
> +
> +	if (ctrl->cqt)
> +		delay = msecs_to_jiffies(ctrl->cqt);
> +	else
> +		delay = ctrl->kato * HZ;

I thought that delay = m * ctrl->kato + ctrl->cqt
where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
no?

> +
> +	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
>  static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  						 blk_status_t status)
>  {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  		dev_err(ctrl->device,
>  			"failed nvme_keep_alive_end_io error=%d\n",
>  				status);
> +
> +		nvme_schedule_failover(ctrl);
>  		return RQ_END_IO_NONE;
>  	}
>  
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>  
>  void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>  {
> +	nvme_schedule_failover(ctrl);
>  	nvme_mpath_stop(ctrl);
>  	nvme_auth_stop(ctrl);
>  	nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  
>  	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
>  	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> +	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> +	INIT_LIST_HEAD(&ctrl->failover_list);
>  	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
>  	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
>  	ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>  {
>  	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	/*
>  	 * if an error (io timeout, etc) while (re)connecting, the remote
>  	 * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>  	/* will block will waiting for io to terminate */
>  	nvme_fc_delete_association(ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>  		dev_err(ctrl->ctrl.device,
>  			"NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>  void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
>  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
>  	unsigned long flags;
>  	struct bio *bio;
> +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>  
>  	nvme_mpath_clear_current_path(ns);
>  
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
>  	blk_steal_bios(&ns->head->requeue_list, req);
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>  
> -	nvme_req(req)->status = 0;
> -	nvme_end_req(req);
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> +	spin_unlock_irqrestore(&ctrl->lock, flags);

I see this is the only place where held requests are added to
failover_list.

- Will this hold admin requests in failover_list?

- What about requests that do not go through nvme_failover_req(), like
  passthrough requests, do we not want to hold these requests until it
  is safe for them to be retried?

- In case of controller reset or delete if nvme_disable_ctrl()
  successfully disables the controller, then we do not want to add
  canceled requests to failover_list, right? Does this implementation
  consider this case?

> +
> +	if (state == NVME_CTRL_DELETING) {
> +		/*
> +		 * request which fail over in the DELETING state were
> +		 * canceled and blk_mq_tagset_wait_completed_request will
> +		 * block until they have been proceed. Though
> +		 * nvme_failover_work is already stopped. Thus schedule
> +		 * a failover; it's still necessary to delay these commands
> +		 * by CQT.
> +		 */
> +		nvme_schedule_failover(ctrl);
> +	}
> +}
> +
> +void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +	LIST_HEAD(failover_list);
> +	struct request *rq;
> +	bool kick = false;
> +
> +	spin_lock_irq(&ctrl->lock);
> +	list_splice_init(&ctrl->failover_list, &failover_list);
> +	spin_unlock_irq(&ctrl->lock);
> +
> +	while (!list_empty(&failover_list)) {
> +		rq = list_entry(failover_list.next,
> +				struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +
> +		nvme_req(rq)->status = 0;
> +		nvme_end_req(rq);
> +		kick = true;
> +	}
> +
> +	if (kick)
> +		nvme_kick_requeue_lists(ctrl);
> +}
> +
> +void nvme_failover_work(struct work_struct *work)
> +{
> +	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
> +					struct nvme_ctrl, failover_work);
> +
> +	nvme_flush_failover(ctrl);
>  }
>  
>  void nvme_mpath_start_request(struct request *rq)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 7563332b5b7b76fc6165ec8c6f2d144737d4fe85..10eb323bdaf139526959180c1e66ab4579bb145d 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -411,6 +411,9 @@ struct nvme_ctrl {
>  
>  	enum nvme_ctrl_type cntrltype;
>  	enum nvme_dctype dctype;
> +
> +	struct delayed_work failover_work;
> +	struct list_head failover_list;
>  };
>  
>  static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
> @@ -954,6 +957,9 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>  void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
>  void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
>  void nvme_failover_req(struct request *req);
> +void nvme_failover_work(struct work_struct *work);
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl);
> +void nvme_flush_failover(struct nvme_ctrl *ctrl);
>  void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
>  int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
>  void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
> @@ -996,6 +1002,15 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>  static inline void nvme_failover_req(struct request *req)
>  {
>  }
> +static inline void nvme_failover_work(struct work_struct *work)
> +{
> +}
> +static inline void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +}
> +static inline void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +}
>  static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>  {
>  }
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 86a2891d9bcc7a990cd214a7fe93fa5c55b292c7..9bee376f881b4c3ebe5502abf23a8e76829780ff 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1127,6 +1127,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
>  
>  	nvme_stop_keep_alive(&ctrl->ctrl);
>  	flush_work(&ctrl->ctrl.async_event_work);
> +	nvme_schedule_failover(&ctrl->ctrl);
>  	nvme_rdma_teardown_io_queues(ctrl, false);
>  	nvme_unquiesce_io_queues(&ctrl->ctrl);
>  	nvme_rdma_teardown_admin_queue(ctrl, false);
> @@ -2153,6 +2154,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
>  
>  static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
>  {
> +	nvme_schedule_failover(&ctrl->ctrl);
>  	nvme_rdma_teardown_io_queues(ctrl, shutdown);
>  	nvme_quiesce_admin_queue(&ctrl->ctrl);
>  	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index d0023bcfd8a79a193adf2807a24481c8c164a174..3a6c1d3febaf233996e4dcf684793327b5d1412f 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>  
>  	nvme_stop_keep_alive(ctrl);
>  	flush_work(&ctrl->async_event_work);
> +	nvme_schedule_failover(ctrl);
>  	nvme_tcp_teardown_io_queues(ctrl, false);
>  	/* unquiesce to fail fast pending requests */
>  	nvme_unquiesce_io_queues(ctrl);
> 
> -- 
> 2.48.1
>
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Thu, Apr 10, 2025 at 01:51:37AM -0700, Mohamed Khalfella wrote:
> > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > +{
> > +	unsigned long delay;
> > +
> > +	if (ctrl->cqt)
> > +		delay = msecs_to_jiffies(ctrl->cqt);
> > +	else
> > +		delay = ctrl->kato * HZ;
> 
> I thought that delay = m * ctrl->kato + ctrl->cqt
> where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> no?

The failover schedule delay is the additional amount of time we have to
wait for the target to clean up (CQT). If the CQT is not valid I thought
the spec said to wait for a KATO. Possibly I got that wrong.

The factor 3 or 2 is relevant for the timeout value of the KATO command
we schedule. The failover schedule timeout is on top of the command
timeout value.
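
Just to spell out the two readings side by side (illustrative only, not from
the patch):

	/* RFC as posted: CQT, or one KATO if CQT is zero, as the extra
	 * grace period before failing over */
	delay = ctrl->cqt ? msecs_to_jiffies(ctrl->cqt) : ctrl->kato * HZ;

	/* Mohamed's reading: KATO scaled by the TBKAS factor, plus CQT */
	delay = (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2) *
			ctrl->kato * HZ + msecs_to_jiffies(ctrl->cqt);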

> > --- a/drivers/nvme/host/multipath.c
> > +++ b/drivers/nvme/host/multipath.c
> > @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> >  void nvme_failover_req(struct request *req)
> >  {
> >  	struct nvme_ns *ns = req->q->queuedata;
> > +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
> >  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
> >  	unsigned long flags;
> >  	struct bio *bio;
> > +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> >  
> >  	nvme_mpath_clear_current_path(ns);
> >  
> > @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
> >  	blk_steal_bios(&ns->head->requeue_list, req);
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  
> > -	nvme_req(req)->status = 0;
> > -	nvme_end_req(req);
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +	spin_lock_irqsave(&ctrl->lock, flags);
> > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> 
> I see this is the only place where held requests are added to
> failover_list.
> 
> - Will this hold admin requests in failover_list?

Yes.

> - What about requests that do not go through nvme_failover_req(), like
>   passthrough requests, do we not want to hold these requests until it
>   is safe for them to be retried?

Passthrough commands should fail immediately. Userland is in charge here,
not the kernel. At least this is what should happen here.

> - In case of controller reset or delete if nvme_disable_ctrl()
>   successfully disables the controller, then we do not want to add
>   canceled requests to failover_list, right? Does this implementation
>   consider this case?

Not sure. I've tested a few things but I am pretty sure this RFC is far
from being complete.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
> On Thu, Apr 10, 2025 at 01:51:37AM -0700, Mohamed Khalfella wrote:
> > > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > > +{
> > > +	unsigned long delay;
> > > +
> > > +	if (ctrl->cqt)
> > > +		delay = msecs_to_jiffies(ctrl->cqt);
> > > +	else
> > > +		delay = ctrl->kato * HZ;
> > 
> > I thought that delay = m * ctrl->kato + ctrl->cqt
> > where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> > no?
> 
> The failover schedule delay is the additional amount of time we have to
> wait for the target to cleanup (CQT). If the CTQ is not valid I thought
> the spec said to wait for a KATO. Possible I got that wrong.
> 
> The factor 3 or 2 is relavant for the timeout value for the KATO command
> we schedule. The failover schedule timeout is ontop of the command
> timeout value.
> 
> > > --- a/drivers/nvme/host/multipath.c
> > > +++ b/drivers/nvme/host/multipath.c
> > > @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> > >  void nvme_failover_req(struct request *req)
> > >  {
> > >  	struct nvme_ns *ns = req->q->queuedata;
> > > +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
> > >  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
> > >  	unsigned long flags;
> > >  	struct bio *bio;
> > > +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> > >  
> > >  	nvme_mpath_clear_current_path(ns);
> > >  
> > > @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
> > >  	blk_steal_bios(&ns->head->requeue_list, req);
> > >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> > >  
> > > -	nvme_req(req)->status = 0;
> > > -	nvme_end_req(req);
> > > -	kblockd_schedule_work(&ns->head->requeue_work);
> > > +	spin_lock_irqsave(&ctrl->lock, flags);
> > > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> > 
> > I see this is the only place where held requests are added to
> > failover_list.
> > 
> > - Will this hold admin requests in failover_list?
> 
> Yes.
> 
> > - What about requests that do not go through nvme_failover_req(), like
> >   passthrough requests, do we not want to hold these requests until it
> >   is safe for them to be retried?
> 
> Pasthrough commands should fail immediately. Userland is in charge here,
> not the kernel. At least this what should happen here.

I see your point. Unless I am missing something, these requests should be
held equally to bio requests from the multipath layer. Let us say an app
submitted a write request that got canceled immediately; how does the app
know when it is safe to retry the write request?

Holding requests like write until it is safe to be retried is the whole
point of this work, right?

> 
> > - In case of controller reset or delete if nvme_disable_ctrl()
> >   successfully disables the controller, then we do not want to add
> >   canceled requests to failover_list, right? Does this implementation
> >   consider this case?
> 
> Not sure. I've tested a few things but I am pretty sure this RFC is far
> from being complete.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 15, 2025 at 05:40:16PM -0700, Mohamed Khalfella wrote:
> On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
> > Pasthrough commands should fail immediately. Userland is in charge here,
> > not the kernel. At least this what should happen here.
> 
> I see your point. Unless I am missing something these requests should be
> held equally to bio requests from multipath layer. Let us say app
> submitted write a request that got canceled immediately, how does the app
> know when it is safe to retry the write request?

Good question, but nothing new as far as I can tell. If the kernel doesn't
start to retry passthru IO commands, we have to figure out how to pass
additional information to userland.

> Holding requests like write until it is safe to be retried is the whole
> point of this work, right?

My first goal was to address the IO commands submitted via the block
layer. I didn't have the IO passthru interface on my radar. I agree,
getting the IO passthru path correct is also a good idea.

Anyway, I don't have a lot of experience with the IO passthru interface.
A quick test to figure out what the status quo is: 'nvme read'
results in an EINTR or 'Resource temporarily unavailable' (EAGAIN or
EWOULDBLOCK) when a tcp connection between host and target is blocked
(*), though it comes from an admin command. It looks like I have to dive
into this a bit more.

(*) I think this might be the same problem as the systemd folks have
reported.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-04-16 10:30:11 +0200, Daniel Wagner wrote:
> On Tue, Apr 15, 2025 at 05:40:16PM -0700, Mohamed Khalfella wrote:
> > On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
> > > Pasthrough commands should fail immediately. Userland is in charge here,
> > > not the kernel. At least this what should happen here.
> > 
> > I see your point. Unless I am missing something these requests should be
> > held equally to bio requests from multipath layer. Let us say app
> > submitted write a request that got canceled immediately, how does the app
> > know when it is safe to retry the write request?
> 
> Good question, but nothing new as far I can tell. If the kernel doesn't
> start to retry passthru IO commands, we have to figure out how to pass
> additional information to the userland.
> 

nvme multipath does not retry passthru commands. That said, there is
nothing preventing userspace from retrying a canceled command immediately,
resulting in the unwanted behavior these very patches try to address.

> > Holding requests like write until it is safe to be retried is the whole
> > point of this work, right?
> 
> My first goal was to address the IO commands submitted via the block
> layer. I didn't had the IO passthru interface on my radar. I agree,
> getting the IO passthru path correct is also good idea.

Okay. This will be addressed in the next revision, right?

> 
> Anyway, I don't have a lot of experience with the IO passthru interface.
> A quick test to fitgure out what the status quo is: 'nvme read'
> results in an EINTR or 'Resource temporarily unavailable' (EAGAIN or
> EWOULDBLOCK) when a tcp connection between host and target is blocked
> (*), though it comes from an admin command. It looks like I have to dive
> into this a bit more.
> 
> (*) I think this might be the same problem as the systemd folks have
> reported.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago

On 16/04/2025 16:53, Mohamed Khalfella wrote:
> On 2025-04-16 10:30:11 +0200, Daniel Wagner wrote:
>> On Tue, Apr 15, 2025 at 05:40:16PM -0700, Mohamed Khalfella wrote:
>>> On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
>>>> Pasthrough commands should fail immediately. Userland is in charge here,
>>>> not the kernel. At least this what should happen here.
>>> I see your point. Unless I am missing something these requests should be
>>> held equally to bio requests from multipath layer. Let us say app
>>> submitted write a request that got canceled immediately, how does the app
>>> know when it is safe to retry the write request?
>> Good question, but nothing new as far I can tell. If the kernel doesn't
>> start to retry passthru IO commands, we have to figure out how to pass
>> additional information to the userland.
>>
> nvme multipath does not retry passthru commands. That is said, there is
> nothing prevents userspace from retrying canceled command immediately
> resulting in the unwanted behavior these very patches try to address.

userspace can read the controller CQT and implement the retry logic on
its own. If it doesn't/can't, it should use normal fs IO; the driver
does not handle passthru retries.

>
>>> Holding requests like write until it is safe to be retried is the whole
>>> point of this work, right?
>> My first goal was to address the IO commands submitted via the block
>> layer. I didn't had the IO passthru interface on my radar. I agree,
>> getting the IO passthru path correct is also good idea.
> Okay. This will be addressed in the next revision, right?

I don't think it should. Passthru IO requires the issuer to understand
the nvme device, and CQT falls under this definition.
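
A minimal userspace sketch of that retry logic, assuming the application
has already obtained the controller's CQT in milliseconds (e.g. from
Identify Controller); issue_io_passthru() is a hypothetical helper, not
an existing API:

	#include <errno.h>
	#include <time.h>

	int issue_io_passthru(int fd);	/* hypothetical: submits the passthru command */

	/* Wait out the controller's CQT before retrying once. */
	static int retry_passthru(int fd, unsigned int cqt_ms)
	{
		int ret = issue_io_passthru(fd);

		if (ret < 0 && (errno == EAGAIN || errno == EINTR)) {
			struct timespec ts = {
				.tv_sec = cqt_ms / 1000,
				.tv_nsec = (cqt_ms % 1000) * 1000000L,
			};

			nanosleep(&ts, NULL);	/* let the target quiesce */
			ret = issue_io_passthru(fd);
		}
		return ret;
	}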
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-04-17 01:21:08 +0300, Sagi Grimberg wrote:
> 
> 
> On 16/04/2025 16:53, Mohamed Khalfella wrote:
> > On 2025-04-16 10:30:11 +0200, Daniel Wagner wrote:
> >> On Tue, Apr 15, 2025 at 05:40:16PM -0700, Mohamed Khalfella wrote:
> >>> On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
> >>>> Pasthrough commands should fail immediately. Userland is in charge here,
> >>>> not the kernel. At least this what should happen here.
> >>> I see your point. Unless I am missing something these requests should be
> >>> held equally to bio requests from multipath layer. Let us say app
> >>> submitted write a request that got canceled immediately, how does the app
> >>> know when it is safe to retry the write request?
> >> Good question, but nothing new as far I can tell. If the kernel doesn't
> >> start to retry passthru IO commands, we have to figure out how to pass
> >> additional information to the userland.
> >>
> > nvme multipath does not retry passthru commands. That is said, there is
> > nothing prevents userspace from retrying canceled command immediately
> > resulting in the unwanted behavior these very patches try to address.
> 
> userspace can read the controller cqt and implement the retry logic on 
> its own.
> If it doesn't/can't, it should use normal fs io. the driver does not 
> handle passthru retries.

passthru requests are not very different from normal IO. If the driver
holds normal IO requests to prevent corruption, it should hold passthru
requests too, for the same reason, no?

IMO, keeping the request-holding logic in the driver makes more sense
than implementing it in userspace. One reason is that CCR can help
release held requests faster.

> 
> >
> >>> Holding requests like write until it is safe to be retried is the whole
> >>> point of this work, right?
> >> My first goal was to address the IO commands submitted via the block
> >> layer. I didn't had the IO passthru interface on my radar. I agree,
> >> getting the IO passthru path correct is also good idea.
> > Okay. This will be addressed in the next revision, right?
> 
> I don't think it should. passthru IO requires the issuer to understand 
> the nvme
> device, and CQT falls under this definition.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Hannes Reinecke 8 months ago
On 4/17/25 00:59, Mohamed Khalfella wrote:
> On 2025-04-17 01:21:08 +0300, Sagi Grimberg wrote:
>>
>>
>> On 16/04/2025 16:53, Mohamed Khalfella wrote:
>>> On 2025-04-16 10:30:11 +0200, Daniel Wagner wrote:
>>>> On Tue, Apr 15, 2025 at 05:40:16PM -0700, Mohamed Khalfella wrote:
>>>>> On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
>>>>>> Pasthrough commands should fail immediately. Userland is in charge here,
>>>>>> not the kernel. At least this what should happen here.
>>>>> I see your point. Unless I am missing something these requests should be
>>>>> held equally to bio requests from multipath layer. Let us say app
>>>>> submitted write a request that got canceled immediately, how does the app
>>>>> know when it is safe to retry the write request?
>>>> Good question, but nothing new as far I can tell. If the kernel doesn't
>>>> start to retry passthru IO commands, we have to figure out how to pass
>>>> additional information to the userland.
>>>>
>>> nvme multipath does not retry passthru commands. That is said, there is
>>> nothing prevents userspace from retrying canceled command immediately
>>> resulting in the unwanted behavior these very patches try to address.
>>
>> userspace can read the controller cqt and implement the retry logic on
>> its own.
>> If it doesn't/can't, it should use normal fs io. the driver does not
>> handle passthru retries.
> 
> passthru requests are not very different from normal IO. If the driver
> holds normal IO requests to prevent corruption, it should hold passthru
> requests too, for the same reason, no?
> 
> IMO, keeping the request holding logic in the driver makes more sense
> than implementing it in userspace. One reason is that CCR can help
> release requests held requests faster.
> 
One thing to keep in mind: We cannot hold requests during controller
reset. A request is an index into a statically allocated array belonging
to the request queue, which gets deleted when the request queue is
removed during controller teardown.

So I _really_ would like to exclude handling of admin and passthrough
commands for now, as there are extremely few commands which are not
idempotent. If we really care we can just error them out upon submission
until error recovery is done.
But I'm not sure if it's worth the hassle; at this time we don't even
handle admin commands correctly (admin commands should not be affected
by the ANA status, yet they are).
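
A minimal sketch of that "error them out upon submission" idea (an
assumption, not part of the posted series), as a check in the passthru
submission path:

	/*
	 * Reject passthru submissions while the controller is recovering,
	 * instead of holding them until it is safe to retry.
	 */
	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE)
		return -EAGAIN;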

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-04-15 14:17:48 +0200, Daniel Wagner wrote:
> On Thu, Apr 10, 2025 at 01:51:37AM -0700, Mohamed Khalfella wrote:
> > > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > > +{
> > > +	unsigned long delay;
> > > +
> > > +	if (ctrl->cqt)
> > > +		delay = msecs_to_jiffies(ctrl->cqt);
> > > +	else
> > > +		delay = ctrl->kato * HZ;
> > 
> > I thought that delay = m * ctrl->kato + ctrl->cqt
> > where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> > no?
> 
> The failover schedule delay is the additional amount of time we have to
> wait for the target to cleanup (CQT). If the CTQ is not valid I thought
> the spec said to wait for a KATO. Possible I got that wrong.
> 
> The factor 3 or 2 is relavant for the timeout value for the KATO command
> we schedule. The failover schedule timeout is ontop of the command
> timeout value.
> 
> > > --- a/drivers/nvme/host/multipath.c
> > > +++ b/drivers/nvme/host/multipath.c
> > > @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
> > >  void nvme_failover_req(struct request *req)
> > >  {
> > >  	struct nvme_ns *ns = req->q->queuedata;
> > > +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
> > >  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
> > >  	unsigned long flags;
> > >  	struct bio *bio;
> > > +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> > >  
> > >  	nvme_mpath_clear_current_path(ns);
> > >  
> > > @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
> > >  	blk_steal_bios(&ns->head->requeue_list, req);
> > >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> > >  
> > > -	nvme_req(req)->status = 0;
> > > -	nvme_end_req(req);
> > > -	kblockd_schedule_work(&ns->head->requeue_work);
> > > +	spin_lock_irqsave(&ctrl->lock, flags);
> > > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> > 
> > I see this is the only place where held requests are added to
> > failover_list.
> > 
> > - Will this hold admin requests in failover_list?
> 
> Yes.

Help me see this:

- nvme_failover_req() is the only place reqs are added to failover_list.
- nvme_decide_disposition() returns FAILOVER only if req has REQ_NVME_MPATH set.

How/where do admin requests get REQ_NVME_MPATH set?

> 
> > - What about requests that do not go through nvme_failover_req(), like
> >   passthrough requests, do we not want to hold these requests until it
> >   is safe for them to be retried?
> 
> Pasthrough commands should fail immediately. Userland is in charge here,
> not the kernel. At least this what should happen here.
> 
> > - In case of controller reset or delete if nvme_disable_ctrl()
> >   successfully disables the controller, then we do not want to add
> >   canceled requests to failover_list, right? Does this implementation
> >   consider this case?
> 
> Not sure. I've tested a few things but I am pretty sure this RFC is far
> from being complete.

I think it does not, and maybe it should honor this. Otherwise every
controller reset/delete will end up holding requests unnecessarily.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 15, 2025 at 05:17:38PM -0700, Mohamed Khalfella wrote:
> Help me see this:
> 
> - nvme_failover_req() is the only place reqs are added to failover_list.
> - nvme_decide_disposition() returns FAILOVER only if req has REQ_NVME_MPATH set.
> 
> How/where do admin requests get REQ_NVME_MPATH set?

Admin commands don't set REQ_NVME_MPATH. This is what the current code
does and I have deliberately decided not to touch this with this RFC.

Given how much discussion the CQT/CCR feature triggers, I don't think
it's a good idea to add this topic to this discussion.

> > > - What about requests that do not go through nvme_failover_req(), like
> > >   passthrough requests, do we not want to hold these requests until it
> > >   is safe for them to be retried?
> > 
> > Pasthrough commands should fail immediately. Userland is in charge here,
> > not the kernel. At least this what should happen here.
> > 
> > > - In case of controller reset or delete if nvme_disable_ctrl()
> > >   successfully disables the controller, then we do not want to add
> > >   canceled requests to failover_list, right? Does this implementation
> > >   consider this case?
> > 
> > Not sure. I've tested a few things but I am pretty sure this RFC is far
> > from being complete.
> 
> I think it does not, and maybe it should honor this. Otherwise every
> controller reset/delete will end up holding requests unnecessarily.

Yes, this is one of the problems with the failover queue. It could be
solved by really starting to track the delay timeout for each command.
But this is a lot of logic and complexity. Thus during the discussion at
LSFMM everyone, including me, said the failover queue idea should not be
our first choice.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Mohamed Khalfella 8 months ago
On 2025-04-16 08:57:19 +0200, Daniel Wagner wrote:
> On Tue, Apr 15, 2025 at 05:17:38PM -0700, Mohamed Khalfella wrote:
> > Help me see this:
> > 
> > - nvme_failover_req() is the only place reqs are added to failover_list.
> > - nvme_decide_disposition() returns FAILOVER only if req has REQ_NVME_MPATH set.
> > 
> > How/where do admin requests get REQ_NVME_MPATH set?
> 
> Admin commands don't set REQ_NVME_MPATH. This is what the current code
> does and I have deliberately decided not to touch this with this RFC.
> 
> Given how much discussion the CQT/CCR feature triggers, I don't think
> it's a good idea to add this topic to this discussion.
> 

The point is that holding requests at nvme_failover_req() does not cover
admin requests. Do you plan to add support for holding admin requests in
the next revision of these patches?

> > > > - What about requests that do not go through nvme_failover_req(), like
> > > >   passthrough requests, do we not want to hold these requests until it
> > > >   is safe for them to be retried?
> > > 
> > > Pasthrough commands should fail immediately. Userland is in charge here,
> > > not the kernel. At least this what should happen here.
> > > 
> > > > - In case of controller reset or delete if nvme_disable_ctrl()
> > > >   successfully disables the controller, then we do not want to add
> > > >   canceled requests to failover_list, right? Does this implementation
> > > >   consider this case?
> > > 
> > > Not sure. I've tested a few things but I am pretty sure this RFC is far
> > > from being complete.
> > 
> > I think it does not, and maybe it should honor this. Otherwise every
> > controller reset/delete will end up holding requests unnecessarily.
> 
> Yes, this is one of the problems with the failover queue. It could be
> solved by really starting to track the delay timeout for each commands.
> But this is a lot of logic code and complexity. Thus during the
> discussion at LSFMM everyone including me, said failover queue idea
> should not be our first choice.

Got it. I assume this will be addressed in the next revision?
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Randy Jennings 8 months ago
On Tue, Apr 15, 2025 at 5:17 AM Daniel Wagner <dwagner@suse.de> wrote:
> On Thu, Apr 10, 2025 at 01:51:37AM -0700, Mohamed Khalfella wrote:
> > > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > > +   if (ctrl->cqt)
> > > +           delay = msecs_to_jiffies(ctrl->cqt);
> > > +   else
> > > +           delay = ctrl->kato * HZ;
> >
> > I thought that delay = m * ctrl->kato + ctrl->cqt
> > where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> > no?
>
> The failover schedule delay is the additional amount of time we have to
> wait for the target to cleanup (CQT). If the CTQ is not valid I thought
> the spec said to wait for a KATO. Possible I got that wrong.
This is correct (according to the spec, if CQT is 0, wait for KATO
instead of 0).  I would treat that as a suggestion, though.

> The factor 3 or 2 is relavant for the timeout value for the KATO command
> we schedule. The failover schedule timeout is ontop of the command
> timeout value.
3 or 2 is not related to the timeout value of the KATO command.  The
timeout value of the KATO command is whatever the host wants it to be
(with some values being more productive than others).  The target is
supposed to respond to the KATO command as soon as the target receives
it (roughly).  The host timeout value should account for network
delays and getting-around-to-it delays on the target.

2*/3*KATO is from when the host has detected a loss of communication
to when the host knows (given some assumptions) that the target has
detected a loss of communication.  A command timeout on the host is a
fine time for the host to decide that it has lost communication with
the target, but there are other events.  At the time the host has
detected a loss of communication, it needs to tear down the
association (which includes stopping KATO & starting disconnect on
_all_ the connections in the association).  CQT does not start until
the host knows that the target has detected a loss of communication.
So, Mohamed's delay is correct.
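
A sketch of that delay computation (an assumption about how it might
look in nvme_schedule_failover, not what the posted patch does):

	unsigned int m = (ctrl->ctratt & NVME_CTRL_ATTR_TBKAS) ? 3 : 2;
	unsigned long delay;

	/* loss-of-communication window (2x/3x KATO) plus the target's CQT */
	delay = m * ctrl->kato * HZ + msecs_to_jiffies(ctrl->cqt);
	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);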

> > - What about requests that do not go through nvme_failover_req(), like
> >   passthrough requests, do we not want to hold these requests until it
> >   is safe for them to be retried?
>
> Pasthrough commands should fail immediately. Userland is in charge here,
> not the kernel. At least this what should happen here.
This is not correct according to the spec, and, I believe, not a good
implementation choice.  The driver (in the spec) is instructed _not_
to return an error for any request until it knows (given some
assumptions) that the target is no longer processing the request (or
that processing does not matter; as in the case of a READ).

Sincerely,
Randy Jennings
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 15, 2025 at 03:56:13PM -0700, Randy Jennings wrote:
> 2*/3*KATO is from when the host has detected a loss of communication
> to when the host knows (given some assumptions) that the target has
> detected a loss of communication.  A command timeout on the host is a
> fine time for the host to decide that it has lost communication with
> the target, but there are other events.  At the time the host has
> detected a loss of communication, it needs to tear down the
> association (which includes stopping KATO & starting disconnect on
> _all_ the connections in the association).  CQT does not start until
> the host knows that the target has detected a loss of communication.
> So, Mohamed's delay is correct.

I was answering the question of why I chose the values in
nvme_schedule_failover and wanted to point out that these values are not
related to timeout detection.

> > > - What about requests that do not go through nvme_failover_req(), like
> > >   passthrough requests, do we not want to hold these requests until it
> > >   is safe for them to be retried?
> >
> > Pasthrough commands should fail immediately. Userland is in charge here,
> > not the kernel. At least this what should happen here.
> This is not correct according to the spec, and, I believe, not a good
> implementation choice.  The driver (in the spec) is instructed _not_
> to return an error for any request until it knows (given some
> assumptions) that the target is no longer processing the request (or
> that processing does not matter; as in the case of a READ).

I was not precise enough: admin commands should fail.

I suggest keeping this topic separate for the time being. It makes
this discussion even more complex.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago

On 10/04/2025 11:51, Mohamed Khalfella wrote:
> On 2025-03-24 13:07:58 +0100, Daniel Wagner wrote:
>> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
>> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
>> the namespace level instead queue it on the ctrl's request_list and
>> moved later to the namespace's requeue_list.
>>
>> Signed-off-by: Daniel Wagner <wagi@kernel.org>
>> ---
>>   drivers/nvme/host/core.c      | 19 ++++++++++++++++
>>   drivers/nvme/host/fc.c        |  4 ++++
>>   drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>>   drivers/nvme/host/nvme.h      | 15 +++++++++++++
>>   drivers/nvme/host/rdma.c      |  2 ++
>>   drivers/nvme/host/tcp.c       |  1 +
>>   6 files changed, 90 insertions(+), 3 deletions(-)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
>> --- a/drivers/nvme/host/core.c
>> +++ b/drivers/nvme/host/core.c
>> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>>   
>>   	flush_work(&ctrl->reset_work);
>>   	nvme_stop_ctrl(ctrl);
>> +	nvme_flush_failover(ctrl);
>>   	nvme_remove_namespaces(ctrl);
>>   	ctrl->ops->delete_ctrl(ctrl);
>>   	nvme_uninit_ctrl(ctrl);
>> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>>   	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>>   }
>>   
>> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
>> +{
>> +	unsigned long delay;
>> +
>> +	if (ctrl->cqt)
>> +		delay = msecs_to_jiffies(ctrl->cqt);
>> +	else
>> +		delay = ctrl->kato * HZ;
> I thought that delay = m * ctrl->kato + ctrl->cqt
> where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> no?

This was said before, but if we are going to always start waiting for
kato for failover purposes, we first need a patch that prevents kato
from being arbitrarily long.

Let's cap kato to something like 10 seconds (which is 2x the default,
which apparently no one is touching).
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 15, 2025 at 01:28:15AM +0300, Sagi Grimberg wrote:
> > > +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> > > +{
> > > +	unsigned long delay;
> > > +
> > > +	if (ctrl->cqt)
> > > +		delay = msecs_to_jiffies(ctrl->cqt);
> > > +	else
> > > +		delay = ctrl->kato * HZ;
> > I thought that delay = m * ctrl->kato + ctrl->cqt
> > where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> > no?
> 
> This was said before, but if we are going to always start waiting for kato
> for failover purposes,
> we first need a patch that prevent kato from being arbitrarily long.

That should be addressed with the cross controller reset (CCR). The KATO*n
+ CQT is the upper limit for the target recovery. As soon as we have CCR,
the recovery delay is reduced to the time the CCR exchange takes.

> Lets cap kato to something like 10 seconds (which is 2x the default which
> apparently no one is touching).

If I understood TP4129 correctly, the upper limit is now defined, so we
don't have to define our own upper limit.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago

On 15/04/2025 15:11, Daniel Wagner wrote:
> On Tue, Apr 15, 2025 at 01:28:15AM +0300, Sagi Grimberg wrote:
>>>> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
>>>> +{
>>>> +	unsigned long delay;
>>>> +
>>>> +	if (ctrl->cqt)
>>>> +		delay = msecs_to_jiffies(ctrl->cqt);
>>>> +	else
>>>> +		delay = ctrl->kato * HZ;
>>> I thought that delay = m * ctrl->kato + ctrl->cqt
>>> where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
>>> no?
>> This was said before, but if we are going to always start waiting for kato
>> for failover purposes,
>> we first need a patch that prevent kato from being arbitrarily long.
> That should be addressed with the cross controller reset (CCR).

CCR is a better solution as it is explicit, and faster.

>   The KATO*n
> + CQT is the upper limit for the target recovery. As soon we have CCR,
> the recovery delay is reduced to the time the CCR exchange takes.

What I meant was that the user can no longer set kato to be arbitrarily
long when we now introduce a failover dependency on it.

We need to set a sane maximum value that will fail over in a reasonable
timeframe. In other words, kato cannot be allowed to be set by the user
to 60 minutes. While we didn't care about it before, now it means that
failover may take 60+ minutes.

Hence my request to cap kato at an absolute maximum value in seconds. My
vote was 10 (2x the default), but we can also go with 30.
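
A rough sketch of such a cap, assuming it ends up in the fabrics option
parsing; the constant name and value are placeholders, not part of the
series:

	#define NVME_KATO_MAX_SECS	30

	if (opts->kato > NVME_KATO_MAX_SECS) {
		pr_warn("kato %us is too long, capping to %us\n",
			opts->kato, NVME_KATO_MAX_SECS);
		opts->kato = NVME_KATO_MAX_SECS;
	}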
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Randy Jennings 8 months ago
On Tue, Apr 15, 2025 at 2:07 PM Sagi Grimberg <sagi@grimberg.me> wrote:
> On 15/04/2025 15:11, Daniel Wagner wrote:
> > On Tue, Apr 15, 2025 at 01:28:15AM +0300, Sagi Grimberg wrote:
> >>>> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> >>>> +{
> >>>> +  unsigned long delay;
> >>>> +
> >>>> +  if (ctrl->cqt)
> >>>> +          delay = msecs_to_jiffies(ctrl->cqt);
> >>>> +  else
> >>>> +          delay = ctrl->kato * HZ;
> >>> I thought that delay = m * ctrl->kato + ctrl->cqt
> >>> where m = ctrl->ctratt & NVME_CTRL_ATTR_TBKAS ? 3 : 2
> >>> no?
> >> This was said before, but if we are going to always start waiting for kato
> >> for failover purposes,
> >> we first need a patch that prevent kato from being arbitrarily long.
> > That should be addressed with the cross controller reset (CCR).
>
> CCR is a better solution as it is explicit, and faster.
>
> >   The KATO*n
> > + CQT is the upper limit for the target recovery. As soon we have CCR,
> > the recovery delay is reduced to the time the CCR exchange takes.
>
> What I meant was that the user can no longer set kato to be arbitrarily
> long when we
> now introduce failover dependency on it.
>
> We need to set a sane maximum value that will failover in a reasonable
> timeframe.
> In other words, kato cannot be allowed to be set by the user to 60
> minutes. While we didn't
> care about it before, now it means that failover may take 60+ minutes.
>
> Hence, my request to set kato to a max absolute value of seconds. My
> vote was 10 (2x of the default),
> but we can also go with 30.
Adding a maximum value for KATO makes a lot of sense to me.  This will
help keep us away from a hung task timeout when the full delay is
taken into account.  30 makes sense to me from the perspective that
the maximum should be long enough to handle non-ideal situations
functionally, but not a value that you expect people to use regularly.

I think CQT should have a maximum allowed value for similar reasons.
If we do clamp down on the CQT, we could be opening ourselves to the
target not completely cleaning up, but it keeps us from a hung task
timeout, and _any_ delay will help most of the time.

CCR will not address arbitrarily long times for either because:
1. It is optional.
2. It may fail.
3. We still need a ceiling on the recovery time we can handle.

Sincerely,
Randy Jennings
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago
>> What I meant was that the user can no longer set kato to be arbitrarily
>> long when we
>> now introduce failover dependency on it.
>>
>> We need to set a sane maximum value that will failover in a reasonable
>> timeframe.
>> In other words, kato cannot be allowed to be set by the user to 60
>> minutes. While we didn't
>> care about it before, now it means that failover may take 60+ minutes.
>>
>> Hence, my request to set kato to a max absolute value of seconds. My
>> vote was 10 (2x of the default),
>> but we can also go with 30.
> Adding a maximum value for KATO makes a lot of sense to me.  This will
> help keep us away from a hung task timeout when the full delay is
> taken into account.  30 makes sense to me from the perspective that
> the maximum should be long enough to handle non-ideal situations
> functionally, but not a value that you expect people to use regularly.
>
> I think CQT should have a maximum allowed value for similar reasons.
> If we do clamp down on the CQT, we could be opening ourselves to the
> target not completely cleaning up, but it keeps us from a hung task
> timeout, and _any_ delay will help most of the time.

CQT comes from the controller, and if it is high, it effectively means
that the controller cannot handle faster failover reliably. So I think
we should leave it as is. It is the vendor's problem.

>
> CCR will not address arbitrarily long times for either because:
> 1. It is optional.
> 2. It may fail.
> 3. We still need a ceiling on the recovery time we can handle.

Yes, makes sense. If it fails, we need to wait until something expires,
which would be CQT.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Randy Jennings 8 months ago
On Tue, Apr 15, 2025 at 4:35 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
> >> What I meant was that the user can no longer set kato to be arbitrarily
> >> long when we
> >> now introduce failover dependency on it.
> >>
> >> We need to set a sane maximum value that will failover in a reasonable
> >> timeframe.
> >> In other words, kato cannot be allowed to be set by the user to 60
> >> minutes. While we didn't
> >> care about it before, now it means that failover may take 60+ minutes.
> >>
> >> Hence, my request to set kato to a max absolute value of seconds. My
> >> vote was 10 (2x of the default),
> >> but we can also go with 30.
> > Adding a maximum value for KATO makes a lot of sense to me.  This will
> > help keep us away from a hung task timeout when the full delay is
> > taken into account.  30 makes sense to me from the perspective that
> > the maximum should be long enough to handle non-ideal situations
> > functionally, but not a value that you expect people to use regularly.
> >
> > I think CQT should have a maximum allowed value for similar reasons.
> > If we do clamp down on the CQT, we could be opening ourselves to the
> > target not completely cleaning up, but it keeps us from a hung task
> > timeout, and _any_ delay will help most of the time.
>
> CQT comes from the controller, and if it is high, it effectively means
> that the
> controller cannot handle faster failover reliably. So I think we should
> leave it
> as is. It is the vendor problem.
Okay, that is one way to approach it.  However, because of the hung
task issue, we would be allowing the vendor to panic the initiator
with a hung task.  Until CCR, and without implementing other checks
(for events which might not happen), this hung task would happen on
every messy disconnect with that vendor/array.

Sincerely,
Randy Jennings
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Sagi Grimberg 8 months ago
>> CQT comes from the controller, and if it is high, it effectively means
>> that the
>> controller cannot handle faster failover reliably. So I think we should
>> leave it
>> as is. It is the vendor problem.
> Okay, that is one way to approach it.  However, because of the hung
> task issue, we would be allowing the vendor to panic the initiator
> with a hung task.  Until CCR, and without implementing other checks
> (for events which might not happen), this hung task would happen on
> every messy disconnect with that vendor/array.

It's kind of a pick-your-poison situation, I guess.
We can log an error for controllers that expose an overly long CQT...

Not sure we'll see a hung task here though; it's not like there is a
kthread blocking on this, it's a delayed work item, so I think the
watchdog won't complain about it...
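
Something along these lines, perhaps (a sketch; the 30 second threshold
is an arbitrary assumption):

	/*
	 * Warn if the controller reports an unusually long CQT, since it
	 * directly delays failover after a messy disconnect.
	 */
	if (ctrl->cqt > 30 * MSEC_PER_SEC)
		dev_warn(ctrl->device,
			 "long command quiesce time (%u ms), failover will be delayed\n",
			 ctrl->cqt);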
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Randy Jennings 8 months ago
On Wed, Apr 16, 2025 at 3:15 PM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
> >> CQT comes from the controller, and if it is high, it effectively means
> >> that the
> >> controller cannot handle faster failover reliably. So I think we should
> >> leave it
> >> as is. It is the vendor problem.
> > Okay, that is one way to approach it.  However, because of the hung
> > task issue, we would be allowing the vendor to panic the initiator
> > with a hung task.  Until CCR, and without implementing other checks
> > (for events which might not happen), this hung task would happen on
> > every messy disconnect with that vendor/array.
>
> Its kind of pick your poison situation I guess.
> We can log an error for controllers that expose overly long CQT...
That sounds like a good idea.

> Not sure we'll see a hung task here tho, its not like there is a kthread
> blocking
> on this, its a delayed work so I think the watchdog won't complain about
> it...
That is probably true with this patch set.

I believe controller reset (for instance, requested through sysfs) is
not supposed to finish until all the requests are no longer being
processed (at least if it should have the semantics of a controller
level reset from the spec).  This patch set may not tie the two
together on a disconnected controller, but I think it should.  Also,
if reconnection in error recovery is tied to this delay, as it is in
the patches Mohamed posted (https://lkml.org/lkml/2025/3/24/1136),
there were other things waiting on error recovery finishing.  Delaying
reconnection in error recovery until the requests are dead makes a lot
of sense to me.

Sincerely,
Randy Jennings
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Nilay Shroff 8 months, 2 weeks ago

On 3/24/25 5:37 PM, Daniel Wagner wrote:
> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
> the namespace level instead queue it on the ctrl's request_list and
> moved later to the namespace's requeue_list.
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>  drivers/nvme/host/core.c      | 19 ++++++++++++++++
>  drivers/nvme/host/fc.c        |  4 ++++
>  drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>  drivers/nvme/host/nvme.h      | 15 +++++++++++++
>  drivers/nvme/host/rdma.c      |  2 ++
>  drivers/nvme/host/tcp.c       |  1 +
>  6 files changed, 90 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>  
>  	flush_work(&ctrl->reset_work);
>  	nvme_stop_ctrl(ctrl);
> +	nvme_flush_failover(ctrl);
>  	nvme_remove_namespaces(ctrl);
>  	ctrl->ops->delete_ctrl(ctrl);
>  	nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>  	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>  }
>  
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long delay;
> +
> +	if (ctrl->cqt)
> +		delay = msecs_to_jiffies(ctrl->cqt);
> +	else
> +		delay = ctrl->kato * HZ;
> +
> +	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
>  static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  						 blk_status_t status)
>  {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>  		dev_err(ctrl->device,
>  			"failed nvme_keep_alive_end_io error=%d\n",
>  				status);
> +
> +		nvme_schedule_failover(ctrl);
>  		return RQ_END_IO_NONE;
>  	}
>  
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>  
>  void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>  {
> +	nvme_schedule_failover(ctrl);
>  	nvme_mpath_stop(ctrl);
>  	nvme_auth_stop(ctrl);
>  	nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>  
>  	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
>  	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> +	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> +	INIT_LIST_HEAD(&ctrl->failover_list);
>  	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
>  	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
>  	ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>  {
>  	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	/*
>  	 * if an error (io timeout, etc) while (re)connecting, the remote
>  	 * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>  	/* will block will waiting for io to terminate */
>  	nvme_fc_delete_association(ctrl);
>  
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>  	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>  		dev_err(ctrl->ctrl.device,
>  			"NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>  void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
>  	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
>  	unsigned long flags;
>  	struct bio *bio;
> +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>  
>  	nvme_mpath_clear_current_path(ns);
>  
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
>  	blk_steal_bios(&ns->head->requeue_list, req);
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>  
> -	nvme_req(req)->status = 0;
> -	nvme_end_req(req);
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> +	spin_unlock_irqrestore(&ctrl->lock, flags);
> +

Why do we need to wait until error recovery to schedule the failover?
Can't we schedule the failover as soon as we get a path error? Also,
can't we avoid the failover_list?

Thanks,
--Nilay
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 01, 2025 at 07:02:11PM +0530, Nilay Shroff wrote:
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +	spin_lock_irqsave(&ctrl->lock, flags);
> > +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> > +	spin_unlock_irqrestore(&ctrl->lock, flags);
> > +
> 
> Why do we need to wait until error_recovery for scheduling failover?

This is where the delay is added to the processing. The failed requests
(timeouts) are held back by the delay here, and after the wait they
immediately fail over.

> Can't we schedule failover as soon as we get path error? Also can't
> we avoid failover_list? 

Then we have exactly what we have now: a failed request is rescheduled
to the next path immediately.
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Hannes Reinecke 8 months, 2 weeks ago
On 3/24/25 13:07, Daniel Wagner wrote:
> The TP4129 mendates that the failover should be delayed by CQT.  Thus when
> nvme_decide_disposition returns FAILOVER do not immediately re-queue it on
> the namespace level instead queue it on the ctrl's request_list and
> moved later to the namespace's requeue_list.
> 
> Signed-off-by: Daniel Wagner <wagi@kernel.org>
> ---
>   drivers/nvme/host/core.c      | 19 ++++++++++++++++
>   drivers/nvme/host/fc.c        |  4 ++++
>   drivers/nvme/host/multipath.c | 52 ++++++++++++++++++++++++++++++++++++++++---
>   drivers/nvme/host/nvme.h      | 15 +++++++++++++
>   drivers/nvme/host/rdma.c      |  2 ++
>   drivers/nvme/host/tcp.c       |  1 +
>   6 files changed, 90 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 135045528ea1c79eac0d6d47d5f7f05a7c98acc4..f3155c7735e75e06c4359c26db8931142c067e1d 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -239,6 +239,7 @@ static void nvme_do_delete_ctrl(struct nvme_ctrl *ctrl)
>   
>   	flush_work(&ctrl->reset_work);
>   	nvme_stop_ctrl(ctrl);
> +	nvme_flush_failover(ctrl);
>   	nvme_remove_namespaces(ctrl);
>   	ctrl->ops->delete_ctrl(ctrl);
>   	nvme_uninit_ctrl(ctrl);
> @@ -1310,6 +1311,19 @@ static void nvme_queue_keep_alive_work(struct nvme_ctrl *ctrl)
>   	queue_delayed_work(nvme_wq, &ctrl->ka_work, delay);
>   }
>   
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long delay;
> +
> +	if (ctrl->cqt)
> +		delay = msecs_to_jiffies(ctrl->cqt);
> +	else
> +		delay = ctrl->kato * HZ;
> +
> +	queue_delayed_work(nvme_wq, &ctrl->failover_work, delay);
> +}
> +EXPORT_SYMBOL_GPL(nvme_schedule_failover);
> +
>   static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>   						 blk_status_t status)
>   {
> @@ -1336,6 +1350,8 @@ static enum rq_end_io_ret nvme_keep_alive_end_io(struct request *rq,
>   		dev_err(ctrl->device,
>   			"failed nvme_keep_alive_end_io error=%d\n",
>   				status);
> +
> +		nvme_schedule_failover(ctrl);
>   		return RQ_END_IO_NONE;
>   	}
>   
> @@ -4716,6 +4732,7 @@ EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
>   
>   void nvme_stop_ctrl(struct nvme_ctrl *ctrl)
>   {
> +	nvme_schedule_failover(ctrl);
>   	nvme_mpath_stop(ctrl);
>   	nvme_auth_stop(ctrl);
>   	nvme_stop_failfast_work(ctrl);
> @@ -4842,6 +4859,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
>   
>   	INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work);
>   	INIT_DELAYED_WORK(&ctrl->failfast_work, nvme_failfast_work);
> +	INIT_DELAYED_WORK(&ctrl->failover_work, nvme_failover_work);
> +	INIT_LIST_HEAD(&ctrl->failover_list);
>   	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
>   	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
>   	ctrl->ka_last_check_time = jiffies;
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index cdc1ba277a5c23ef1afd26e6911b082f3d12b215..bd897b29cd286008b781bbcb4230e08019da6b6b 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -2553,6 +2553,8 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>   {
>   	enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>   
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>   	/*
>   	 * if an error (io timeout, etc) while (re)connecting, the remote
>   	 * port requested terminating of the association (disconnect_ls)
> @@ -3378,6 +3380,8 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>   	/* will block will waiting for io to terminate */
>   	nvme_fc_delete_association(ctrl);
>   
> +	nvme_schedule_failover(&ctrl->ctrl);
> +
>   	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>   		dev_err(ctrl->ctrl.device,
>   			"NVME-FC{%d}: error_recovery: Couldn't change state "
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 2a7635565083046c575efe1793362ae10581defd..a14b055796b982df96609f53174a5d1334c1c0c4 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -86,9 +86,11 @@ void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> +	struct nvme_ctrl *ctrl = nvme_req(req)->ctrl;
>   	u16 status = nvme_req(req)->status & NVME_SCT_SC_MASK;
>   	unsigned long flags;
>   	struct bio *bio;
> +	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
>   
>   	nvme_mpath_clear_current_path(ns);
>   
> @@ -121,9 +123,53 @@ void nvme_failover_req(struct request *req)
>   	blk_steal_bios(&ns->head->requeue_list, req);
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   
> -	nvme_req(req)->status = 0;
> -	nvme_end_req(req);
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +	spin_lock_irqsave(&ctrl->lock, flags);
> +	list_add_tail(&req->queuelist, &ctrl->failover_list);
> +	spin_unlock_irqrestore(&ctrl->lock, flags);
> +
> +	if (state == NVME_CTRL_DELETING) {
> +		/*
> +		 * request which fail over in the DELETING state were
> +		 * canceled and blk_mq_tagset_wait_completed_request will
> +		 * block until they have been proceed. Though
> +		 * nvme_failover_work is already stopped. Thus schedule
> +		 * a failover; it's still necessary to delay these commands
> +		 * by CQT.
> +		 */
> +		nvme_schedule_failover(ctrl);
> +	}
> +}
> +
> +void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +	LIST_HEAD(failover_list);
> +	struct request *rq;
> +	bool kick = false;
> +
> +	spin_lock_irq(&ctrl->lock);
> +	list_splice_init(&ctrl->failover_list, &failover_list);
> +	spin_unlock_irq(&ctrl->lock);
> +
> +	while (!list_empty(&failover_list)) {
> +		rq = list_entry(failover_list.next,
> +				struct request, queuelist);
> +		list_del_init(&rq->queuelist);
> +
> +		nvme_req(rq)->status = 0;
> +		nvme_end_req(rq);
> +		kick = true;
> +	}
> +
> +	if (kick)
> +		nvme_kick_requeue_lists(ctrl);
> +}
> +
> +void nvme_failover_work(struct work_struct *work)
> +{
> +	struct nvme_ctrl *ctrl = container_of(to_delayed_work(work),
> +					struct nvme_ctrl, failover_work);
> +
> +	nvme_flush_failover(ctrl);
>   }
>   
>   void nvme_mpath_start_request(struct request *rq)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 7563332b5b7b76fc6165ec8c6f2d144737d4fe85..10eb323bdaf139526959180c1e66ab4579bb145d 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -411,6 +411,9 @@ struct nvme_ctrl {
>   
>   	enum nvme_ctrl_type cntrltype;
>   	enum nvme_dctype dctype;
> +
> +	struct delayed_work failover_work;
> +	struct list_head failover_list;
>   };
>   
>   static inline enum nvme_ctrl_state nvme_ctrl_state(struct nvme_ctrl *ctrl)
> @@ -954,6 +957,9 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
>   void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys);
>   void nvme_failover_req(struct request *req);
> +void nvme_failover_work(struct work_struct *work);
> +void nvme_schedule_failover(struct nvme_ctrl *ctrl);
> +void nvme_flush_failover(struct nvme_ctrl *ctrl);
>   void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
>   int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
>   void nvme_mpath_add_disk(struct nvme_ns *ns, __le32 anagrpid);
> @@ -996,6 +1002,15 @@ static inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   static inline void nvme_failover_req(struct request *req)
>   {
>   }
> +static inline void nvme_failover_work(struct work_struct *work)
> +{
> +}
> +static inline void nvme_schedule_failover(struct nvme_ctrl *ctrl)
> +{
> +}
> +static inline void nvme_flush_failover(struct nvme_ctrl *ctrl)
> +{
> +}
>   static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
>   {
>   }
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 86a2891d9bcc7a990cd214a7fe93fa5c55b292c7..9bee376f881b4c3ebe5502abf23a8e76829780ff 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1127,6 +1127,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
>   
>   	nvme_stop_keep_alive(&ctrl->ctrl);
>   	flush_work(&ctrl->ctrl.async_event_work);
> +	nvme_schedule_failover(&ctrl->ctrl);
>   	nvme_rdma_teardown_io_queues(ctrl, false);
>   	nvme_unquiesce_io_queues(&ctrl->ctrl);
>   	nvme_rdma_teardown_admin_queue(ctrl, false);
> @@ -2153,6 +2154,7 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
>   
>   static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
>   {
> +	nvme_schedule_failover(&ctrl->ctrl);
>   	nvme_rdma_teardown_io_queues(ctrl, shutdown);
>   	nvme_quiesce_admin_queue(&ctrl->ctrl);
>   	nvme_disable_ctrl(&ctrl->ctrl, shutdown);
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index d0023bcfd8a79a193adf2807a24481c8c164a174..3a6c1d3febaf233996e4dcf684793327b5d1412f 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>   
>   	nvme_stop_keep_alive(ctrl);
>   	flush_work(&ctrl->async_event_work);
> +	nvme_schedule_failover(ctrl);
>   	nvme_tcp_teardown_io_queues(ctrl, false);
>   	/* unquiesce to fail fast pending requests */
>   	nvme_unquiesce_io_queues(ctrl);
> 
Hmm. Rather not.

Why do we have to have a separate failover queue?
Can't we simply delay the error recovery by the cqt value?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH RFC 3/3] nvme: delay failover by command quiesce timeout
Posted by Daniel Wagner 8 months ago
On Tue, Apr 01, 2025 at 11:37:29AM +0200, Hannes Reinecke wrote:
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -2345,6 +2345,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
> >   	nvme_stop_keep_alive(ctrl);
> >   	flush_work(&ctrl->async_event_work);
> > +	nvme_schedule_failover(ctrl);
> >   	nvme_tcp_teardown_io_queues(ctrl, false);
> >   	/* unquiesce to fail fast pending requests */
> >   	nvme_unquiesce_io_queues(ctrl);
> > 
> Hmm. Rather not.
> 
> Why do we have to have a separate failover queue?

This RFC plays with the idea of handling the requests which time out at
the ctrl level. The main point is to avoid touching every single
transport.

An additional failover queue is likely to introduce new problems and
additional complexity. There is no free lunch. So I don't think it's a
good concept after all.

And as discussed during LSFMM, I think it would be best to focus on
factoring out the common code (the project I wanted to work on for a
while...) from the transports first and then figure out how to get the
CQT and CCR working.

> Can't we simply delay the error recovery by the cqt value?

Yes and no. As we know from our in-house testing, it fixes the problem
for customers, but it is not spec compliant.