[v1] TP8028 Rapid Path Failure Recovery

[RFC PATCH 13/14] nvme-fc: Use CCR to recover controller that hits an error

Posted by Mohamed Khalfella 2 months, 2 weeks ago

A live nvme controller that hits an error will now move to RECOVERING
state instead of RESETTING state. In RECOVERING state, ctrl->ioerr_work
will attempt to use cross-controller recovery to terminate inflight IOs
on the controller. If CCR succeeds, then switch to RESETTING state and
continue error recovery as usual by tearing down the controller and
attempting to reconnect to the target. If CCR fails, the behavior of recovery
depends on whether CQT is supported or not. If CQT is supported, switch
to time-based recovery by holding inflight IOs until it is safe for them
to be retried. If CQT is not supported, proceed to retry requests
immediately, as the code currently does.

Currently, inflight IOs can get completed during time-based recovery.
This will be addressed in the next patch.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/host/fc.c | 52 ++++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 8b6a7c80015c..0e4d271bb4b6 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -166,7 +166,7 @@ struct nvme_fc_ctrl {
 	struct blk_mq_tag_set	admin_tag_set;
 	struct blk_mq_tag_set	tag_set;
 
-	struct work_struct	ioerr_work;
+	struct delayed_work	ioerr_work;
 	struct delayed_work	connect_work;
 
 	struct kref		ref;
@@ -1862,11 +1862,48 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
 	}
 }
 
+static int nvme_fc_recover_ctrl(struct nvme_ctrl *ctrl)
+{
+	unsigned long rem;
+
+	if (test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags)) {
+		dev_info(ctrl->device, "completed time-based recovery\n");
+		goto done;
+	}
+
+	rem = nvme_recover_ctrl(ctrl);
+	if (!rem)
+		goto done;
+
+	if (!ctrl->cqt) {
+		dev_info(ctrl->device,
+			  "CCR failed, CQT not supported, skip time-based recovery\n");
+		goto done;
+	}
+
+	dev_info(ctrl->device,
+		 "CCR failed, switch to time-based recovery, timeout = %ums\n",
+		 jiffies_to_msecs(rem));
+
+	set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
+	queue_delayed_work(nvme_reset_wq, &to_fc_ctrl(ctrl)->ioerr_work, rem);
+	return -EAGAIN;
+
+done:
+	nvme_end_ctrl_recovery(ctrl);
+	return 0;
+}
+
 static void
 nvme_fc_ctrl_ioerr_work(struct work_struct *work)
 {
-	struct nvme_fc_ctrl *ctrl =
-			container_of(work, struct nvme_fc_ctrl, ioerr_work);
+	struct nvme_fc_ctrl *ctrl = container_of(to_delayed_work(work),
+				struct nvme_fc_ctrl, ioerr_work);
+
+	if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_RECOVERING) {
+		if (nvme_fc_recover_ctrl(&ctrl->ctrl))
+			return;
+	}
 
 	nvme_fc_error_recovery(ctrl);
 }
@@ -1892,7 +1929,8 @@ EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
 static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
 					 char *errmsg)
 {
-	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
+	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECOVERING) &&
+	    !nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
 		return;
 
 	dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
@@ -3227,7 +3265,7 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
 {
 	struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl);
 
-	cancel_work_sync(&ctrl->ioerr_work);
+	cancel_delayed_work_sync(&ctrl->ioerr_work);
 	cancel_delayed_work_sync(&ctrl->connect_work);
 	/*
 	 * kill the association on the link side.  this will block
@@ -3465,7 +3503,7 @@ nvme_fc_alloc_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_fc_reset_ctrl_work);
 	INIT_DELAYED_WORK(&ctrl->connect_work, nvme_fc_connect_ctrl_work);
-	INIT_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work);
+	INIT_DELAYED_WORK(&ctrl->ioerr_work, nvme_fc_ctrl_ioerr_work);
 	spin_lock_init(&ctrl->lock);
 
 	/* io queue count */
@@ -3563,7 +3601,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
 
 fail_ctrl:
 	nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING);
-	cancel_work_sync(&ctrl->ioerr_work);
+	cancel_delayed_work_sync(&ctrl->ioerr_work);
 	cancel_work_sync(&ctrl->ctrl.reset_work);
 	cancel_delayed_work_sync(&ctrl->connect_work);
 
-- 
2.51.2

Re: [RFC PATCH 13/14] nvme-fc: Use CCR to recover controller that hits an error

Posted by Randy Jennings 1 month, 3 weeks ago

On Tue, Nov 25, 2025 at 6:13 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> An alive nvme controller that hits an error will now move to RECOVERING
> state instead of RESETTING state. In RECOVERING state, ctrl->err_work
> will attempt to use cross-controller recovery to terminate inflight IOs
> on the controller. If CCR succeeds, then switch to RESETTING state and
> continue error recovery as usuall by tearing down the controller, and
> attempting reconnect to target. If CCR fails, the behavior of recovery
"usuall" -> "usual"
"attempting reconnect" -> "attempting to reconnect"

it would read better with "the" added:
"reconnect to the target"

> depends on whether CQT is supported or not. If CQT is supported, switch
> to time-based recovery by holding inflight IOs until it is safe for them
> to be retried. If CQT is not supported proceed to retry requests
> immediately, as the code currently does.

> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c

> @@ -1862,11 +1862,48 @@ __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,

> +static int nvme_fc_recover_ctrl(struct nvme_ctrl *ctrl)

> +       queue_delayed_work(nvme_reset_wq, &to_fc_ctrl(ctrl)->ioerr_work, rem);
Just like nvme_rdma_recover_ctrl,
nvme_fc_recover_ctrl is exactly the same as
nvme_tcp_recover_ctrl. Seems like a core.c function
nvme_recover_ctrl could take a delayed work queue,
unifying the code.

>  nvme_fc_ctrl_ioerr_work(struct work_struct *work)
>  {

> +       if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_RECOVERING) {
> +               if (nvme_fc_recover_ctrl(&ctrl->ctrl))
> +                       return;
> +       }
>
>         nvme_fc_error_recovery(ctrl);
Inside of nvme_fc_error_recovery(), we call nvme_stop_keep_alive().
The state of the controller should not be LIVE while waiting for
recovery, so I do not think we will succeed in sending keep alives,
but I think this should move to before (or inside of)
nvme_fc_recover_ctrl().  You have replaced all the calls to
nvme_fc_error_recovery() with nvme_fc_start_ioerr_recovery(),
so that might be okay.

Sincerely,
Randy Jennings