nvme_fc_error_recovery() called from nvme_fc_timeout() while the
controller is in CONNECTING state results in the deadlock reported in
the link below. Update nvme_fc_timeout() to schedule error recovery
instead, avoiding the deadlock.

Prior to this change, if the controller was LIVE, error recovery reset
the controller, which does not match nvme-tcp and nvme-rdma. Decouple
error recovery from controller reset to match the other fabric
transports.
Link: https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
drivers/nvme/host/fc.c | 94 ++++++++++++++++++------------------------
1 file changed, 41 insertions(+), 53 deletions(-)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6948de3f438a..f8f6071b78ed 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -227,6 +227,8 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
static struct device *fc_udev_device;
static void nvme_fc_complete_rq(struct request *rq);
+static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
+ char *errmsg);
/* *********************** FC-NVME Port Management ************************ */
@@ -788,7 +790,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
"Reconnect", ctrl->cnum);
set_bit(ASSOC_FAILED, &ctrl->flags);
- nvme_reset_ctrl(&ctrl->ctrl);
+ nvme_fc_start_ioerr_recovery(ctrl, "Connectivity Loss");
}
/**
@@ -985,7 +987,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
-static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
+static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl);
static void
__nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
@@ -1567,9 +1569,8 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
* for the association have been ABTS'd by
* nvme_fc_delete_association().
*/
-
- /* fail the association */
- nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
+ nvme_fc_start_ioerr_recovery(ctrl,
+ "Disconnect Association LS received");
/* release the reference taken by nvme_fc_match_disconn_ls() */
nvme_fc_ctrl_put(ctrl);
@@ -1871,7 +1872,7 @@ nvme_fc_ctrl_ioerr_work(struct work_struct *work)
struct nvme_fc_ctrl *ctrl =
container_of(work, struct nvme_fc_ctrl, ioerr_work);
- nvme_fc_error_recovery(ctrl, "transport detected io error");
+ nvme_fc_error_recovery(ctrl);
}
/*
@@ -1892,6 +1893,17 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
}
EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
+static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
+ char *errmsg)
+{
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
+ return;
+
+ dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
+ ctrl->cnum, errmsg);
+ queue_work(nvme_reset_wq, &ctrl->ioerr_work);
+}
+
static void
nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
{
@@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
nvme_fc_complete_rq(rq);
check_error:
- if (terminate_assoc &&
- nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
- queue_work(nvme_reset_wq, &ctrl->ioerr_work);
+ if (terminate_assoc)
+ nvme_fc_start_ioerr_recovery(ctrl, "io error");
}
static int
@@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
nvme_unquiesce_admin_queue(&ctrl->ctrl);
}
-static void
-nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
-{
- enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
-
- /*
- * if an error (io timeout, etc) while (re)connecting, the remote
- * port requested terminating of the association (disconnect_ls)
- * or an error (timeout or abort) occurred on an io while creating
- * the controller. Abort any ios on the association and let the
- * create_association error path resolve things.
- */
- if (state == NVME_CTRL_CONNECTING) {
- __nvme_fc_abort_outstanding_ios(ctrl, true);
- dev_warn(ctrl->ctrl.device,
- "NVME-FC{%d}: transport error during (re)connect\n",
- ctrl->cnum);
- return;
- }
-
- /* Otherwise, only proceed if in LIVE state - e.g. on first error */
- if (state != NVME_CTRL_LIVE)
- return;
-
- dev_warn(ctrl->ctrl.device,
- "NVME-FC{%d}: transport association event: %s\n",
- ctrl->cnum, errmsg);
- dev_warn(ctrl->ctrl.device,
- "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
-
- nvme_reset_ctrl(&ctrl->ctrl);
-}
-
static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
{
struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
@@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
struct nvme_command *sqe = &cmdiu->sqe;
- /*
- * Attempt to abort the offending command. Command completion
- * will detect the aborted io and will fail the connection.
- */
dev_info(ctrl->ctrl.device,
"NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
"x%08x/x%08x\n",
ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
nvme_fabrics_opcode_str(qnum, sqe),
sqe->common.cdw10, sqe->common.cdw11);
- if (__nvme_fc_abort_op(ctrl, op))
- nvme_fc_error_recovery(ctrl, "io timeout abort failed");
- /*
- * the io abort has been initiated. Have the reset timer
- * restarted and the abort completion will complete the io
- * shortly. Avoids a synchronous wait while the abort finishes.
- */
+ nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
return BLK_EH_RESET_TIMER;
}
@@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
}
}
+static void
+nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
+{
+ nvme_stop_keep_alive(&ctrl->ctrl);
+ nvme_stop_ctrl(&ctrl->ctrl);
+
+ /* will block while waiting for io to terminate */
+ nvme_fc_delete_association(ctrl);
+
+ /* Do not reconnect if controller is being deleted */
+ if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
+ return;
+
+ if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
+ queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
+ return;
+ }
+
+ nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
+}
static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
.name = "fc",
--
2.52.0
On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
> nvme_fc_error_recovery() called from nvme_fc_timeout() while controller
> in CONNECTING state results in deadlock reported in link below. Update
> nvme_fc_timeout() to schedule error recovery to avoid the deadlock.
>
> Previous to this change if controller was LIVE error recovery resets
> the controller and this does not match nvme-tcp and nvme-rdma.
It is not intended to match tcp/rdma. Using the reset path was done to
avoid duplicating the paths that tear down the association. FC, given
we interact with an HBA for device and io state and have a lot of async
io completions, requires a lot more work than the straight data
structure teardown in rdma/tcp.

I agree with wanting to change up the execution thread for the deadlock.
> Decouple
> error recovery from controller reset to match other fabric transports.
>
> Link: https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/fc.c | 94 ++++++++++++++++++------------------------
> 1 file changed, 41 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 6948de3f438a..f8f6071b78ed 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -227,6 +227,8 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
> static struct device *fc_udev_device;
>
> static void nvme_fc_complete_rq(struct request *rq);
> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> + char *errmsg);
>
> /* *********************** FC-NVME Port Management ************************ */
>
> @@ -788,7 +790,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> "Reconnect", ctrl->cnum);
>
> set_bit(ASSOC_FAILED, &ctrl->flags);
> - nvme_reset_ctrl(&ctrl->ctrl);
> + nvme_fc_start_ioerr_recovery(ctrl, "Connectivity Loss");
> }
>
> /**
> @@ -985,7 +987,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
> static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
>
> -static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
> +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl);
>
> static void
> __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
> @@ -1567,9 +1569,8 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
> * for the association have been ABTS'd by
> * nvme_fc_delete_association().
> */
> -
> - /* fail the association */
> - nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
> + nvme_fc_start_ioerr_recovery(ctrl,
> + "Disconnect Association LS received");
>
> /* release the reference taken by nvme_fc_match_disconn_ls() */
> nvme_fc_ctrl_put(ctrl);
> @@ -1871,7 +1872,7 @@ nvme_fc_ctrl_ioerr_work(struct work_struct *work)
> struct nvme_fc_ctrl *ctrl =
> container_of(work, struct nvme_fc_ctrl, ioerr_work);
>
> - nvme_fc_error_recovery(ctrl, "transport detected io error");
> + nvme_fc_error_recovery(ctrl);
hmm.. not sure how I feel about this. There is at least a break in reset
processing that is no longer present - e.g. previously ioerr_work was
queued, which would then queue reset_work. This effectively calls the
reset_work handler directly. I assume it should be ok.
> }
>
> /*
> @@ -1892,6 +1893,17 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
> }
> EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
>
> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> + char *errmsg)
> +{
> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> + return;
> +
> + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
> + ctrl->cnum, errmsg);
> + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> +}
> +
Disagree with this.

The clause in error_recovery around the CONNECTING state is pretty
important to terminate io occurring during connect/reconnect, where the
ctrl state should not change. We don't want start_ioerr making it
RESETTING.

This should be reworked.
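Roughly the shape I'd expect (untested sketch, reusing the names in this
patch): keep the CONNECTING clause at the top of the queued handler, and
have start_ioerr still queue the work in that case rather than bailing
when the RESETTING transition fails:

static void
nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
{
        /*
         * Error while (re)connecting: abort the ios and let the
         * create_association error path resolve things. The ctrl
         * state must stay CONNECTING.
         */
        if (nvme_ctrl_state(&ctrl->ctrl) == NVME_CTRL_CONNECTING) {
                __nvme_fc_abort_outstanding_ios(ctrl, true);
                dev_warn(ctrl->ctrl.device,
                        "NVME-FC{%d}: transport error during (re)connect\n",
                        ctrl->cnum);
                return;
        }

        /* otherwise the teardown/reconnect flow added by this patch */
        nvme_stop_keep_alive(&ctrl->ctrl);
        nvme_stop_ctrl(&ctrl->ctrl);
        nvme_fc_delete_association(ctrl);

        /* Do not reconnect if the controller is being deleted */
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
                return;

        if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE)
                queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
        else
                nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
}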
> static void
> nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> {
> @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> nvme_fc_complete_rq(rq);
>
> check_error:
> - if (terminate_assoc &&
> - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
> - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> + if (terminate_assoc)
> + nvme_fc_start_ioerr_recovery(ctrl, "io error");
This is ok. The ioerr_recovery will bounce the RESETTING state change if
it's already in that state. So this is a little cleaner.
> }
>
> static int
> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
> nvme_unquiesce_admin_queue(&ctrl->ctrl);
> }
>
> -static void
> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> -{
> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> -
> - /*
> - * if an error (io timeout, etc) while (re)connecting, the remote
> - * port requested terminating of the association (disconnect_ls)
> - * or an error (timeout or abort) occurred on an io while creating
> - * the controller. Abort any ios on the association and let the
> - * create_association error path resolve things.
> - */
> - if (state == NVME_CTRL_CONNECTING) {
> - __nvme_fc_abort_outstanding_ios(ctrl, true);
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: transport error during (re)connect\n",
> - ctrl->cnum);
> - return;
> - }
This logic needs to be preserved. It's no longer part of
nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
"fenced". They should fail immediately.
> -
> - /* Otherwise, only proceed if in LIVE state - e.g. on first error */
> - if (state != NVME_CTRL_LIVE)
> - return;
This was to filter out multiple requests of the reset. I guess that is
what happens now in start_ioerr when attempting to set the state to
RESETTING while already RESETTING.

There is a small difference here in that the existing code avoids doing
the ctrl reset if the controller is NEW. start_ioerr will change the
ctrl to RESETTING. I'm not sure how much of an impact that is.
> -
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: transport association event: %s\n",
> - ctrl->cnum, errmsg);
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
I haven't paid much attention to this, but keeping the transport
messages for these cases is very, very useful for diagnosis.
> -
> - nvme_reset_ctrl(&ctrl->ctrl);
> -}
> -
> static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> {
> struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
> struct nvme_command *sqe = &cmdiu->sqe;
>
> - /*
> - * Attempt to abort the offending command. Command completion
> - * will detect the aborted io and will fail the connection.
> - */
> dev_info(ctrl->ctrl.device,
> "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
> "x%08x/x%08x\n",
> ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
> nvme_fabrics_opcode_str(qnum, sqe),
> sqe->common.cdw10, sqe->common.cdw11);
> - if (__nvme_fc_abort_op(ctrl, op))
> - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
>
> - /*
> - * the io abort has been initiated. Have the reset timer
> - * restarted and the abort completion will complete the io
> - * shortly. Avoids a synchronous wait while the abort finishes.
> - */
> + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
Why get rid of the abort logic?

Note: the error recovery/controller reset is only called when the abort
failed.

I believe you should continue to abort the op. The fence logic will
kick in when the op completes later (along with the other io
completions). If nothing else, it allows a hw resource to be freed up.
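I.e., keep the abort in the timeout handler and only kick recovery when
it fails - something like this (untested, names from this patch):

        if (__nvme_fc_abort_op(ctrl, op))
                nvme_fc_start_ioerr_recovery(ctrl, "io timeout abort failed");

        /* abort initiated; its completion will drive the recovery */
        return BLK_EH_RESET_TIMER;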
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> }
> }
>
> +static void
> +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
> +{
> + nvme_stop_keep_alive(&ctrl->ctrl);
Curious, why did the stop_keep_alive() call get added to this ?
Doesn't hurt.
I assume it was due to other transports having it as they originally
were calling stop_ctrl, but then moved to stop_keep_alive. Shouldn't
this be followed by flush_work((&ctrl->ctrl.async_event_work) ?
> + nvme_stop_ctrl(&ctrl->ctrl);
> +
> + /* will block while waiting for io to terminate */
> + nvme_fc_delete_association(ctrl);
> +
> + /* Do not reconnect if controller is being deleted */
> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> + return;
> +
> + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
> + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
> + return;
> + }
> +
> + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
> +}
This code and that in nvme_fc_reset_ctrl_work() need to be collapsed
into a common helper function invoked by the two routines. That also
addresses the missing flush_delayed_work() in this routine.
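E.g. a shared tail helper along these lines (untested sketch, helper
name made up), called by both nvme_fc_reset_ctrl_work() and the new
error recovery path after nvme_fc_delete_association():

static void
nvme_fc_reconnect_ctrl(struct nvme_fc_ctrl *ctrl)
{
        /* Do not reconnect if the controller is being deleted */
        if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
                return;

        if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
                if (queue_delayed_work(nvme_wq, &ctrl->connect_work, 0))
                        flush_delayed_work(&ctrl->connect_work);
                return;
        }

        nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
}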
>
> static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
> .name = "fc",
-- James
(new email address. can always reach me at james.smart@broadcom.com as well)
On Tue 2026-02-03 11:19:28 -0800, James Smart wrote:
> On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
> > nvme_fc_error_recovery() called from nvme_fc_timeout() while controller
> > in CONNECTING state results in deadlock reported in link below. Update
> > nvme_fc_timeout() to schedule error recovery to avoid the deadlock.
> >
> > Previous to this change if controller was LIVE error recovery resets
> > the controller and this does not match nvme-tcp and nvme-rdma.
>
> It is not intended to match tcp/rda. Using the reset path was done to
> avoid code duplication of paths to teardown the association. FC, given
> we interact with an HBA for device and io state and have a lot of async
> io completions, requires a lot more work than straight data structure
> teardown in rdma/tcp.
>
> I agree with wanting to changeup the execution thread for the deadlock.
>
>
> > Decouple
> > error recovery from controller reset to match other fabric transports.
> >
> > Link: https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> > drivers/nvme/host/fc.c | 94 ++++++++++++++++++------------------------
> > 1 file changed, 41 insertions(+), 53 deletions(-)
> >
> > diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> > index 6948de3f438a..f8f6071b78ed 100644
> > --- a/drivers/nvme/host/fc.c
> > +++ b/drivers/nvme/host/fc.c
> > @@ -227,6 +227,8 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
> > static struct device *fc_udev_device;
> >
> > static void nvme_fc_complete_rq(struct request *rq);
> > +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> > + char *errmsg);
> >
> > /* *********************** FC-NVME Port Management ************************ */
> >
> > @@ -788,7 +790,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> > "Reconnect", ctrl->cnum);
> >
> > set_bit(ASSOC_FAILED, &ctrl->flags);
> > - nvme_reset_ctrl(&ctrl->ctrl);
> > + nvme_fc_start_ioerr_recovery(ctrl, "Connectivity Loss");
> > }
> >
> > /**
> > @@ -985,7 +987,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
> > static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> > static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
> >
> > -static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
> > +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl);
> >
> > static void
> > __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
> > @@ -1567,9 +1569,8 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
> > * for the association have been ABTS'd by
> > * nvme_fc_delete_association().
> > */
> > -
> > - /* fail the association */
> > - nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
> > + nvme_fc_start_ioerr_recovery(ctrl,
> > + "Disconnect Association LS received");
> >
> > /* release the reference taken by nvme_fc_match_disconn_ls() */
> > nvme_fc_ctrl_put(ctrl);
> > @@ -1871,7 +1872,7 @@ nvme_fc_ctrl_ioerr_work(struct work_struct *work)
> > struct nvme_fc_ctrl *ctrl =
> > container_of(work, struct nvme_fc_ctrl, ioerr_work);
> >
> > - nvme_fc_error_recovery(ctrl, "transport detected io error");
> > + nvme_fc_error_recovery(ctrl);
>
> hmm.. not sure how I feel about this. There is at least a break in reset
> processing that is no longer present - e.g. prior queued ioerr_work,
> which would then queue reset_work. This effectively calls the reset_work
> handler directly. I assume it should be ok.
>
> > }
> >
> > /*
> > @@ -1892,6 +1893,17 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
> > }
> > EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
> >
> > +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> > + char *errmsg)
> > +{
> > + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> > + return;
> > +
> > + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
> > + ctrl->cnum, errmsg);
> > + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> > +}
> > +
>
> Disagree with this.
>
> The clause in error_recovery around the CONNECTING state is pretty
> important to terminate io occurring during connect/reconnect where the
> ctrl state should not change. we don't want start_ioerr making it RESETTING.
>
> This should be reworked.
Like you pointed out, this changes the current behavior for the
CONNECTING state.

Before this change, as you pointed out, the controller state stays in
CONNECTING while all IOs are aborted. Aborting the IOs causes
nvme_fc_create_association() to fail and reconnect might be attempted
again.

The new behavior switches to RESETTING and queues ctrl->ioerr_work.
ioerr_work will abort outstanding IOs, switch back to CONNECTING and
attempt reconnect.
nvme_fc_error_recovery() ->
nvme_stop_keep_alive() /* should not make a difference */
nvme_stop_ctrl() /* should be okay to run */
nvme_fc_delete_association() ->
__nvme_fc_abort_outstanding_ios(ctrl, false)
nvme_unquiesce_admin_queue()
nvme_unquiesce_io_queues()
nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)
if (port_state == ONLINE)
queue_work(ctrl->connect)
else
nvme_fc_reconnect_or_delete();
Yes, this is a different behavior. IMO it is simpler to follow and
closer to what the other transports do, keeping in mind the async abort
nature of fc.

Aside from it being different, what is wrong with it?
>
> > static void
> > nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> > {
> > @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> > nvme_fc_complete_rq(rq);
> >
> > check_error:
> > - if (terminate_assoc &&
> > - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
> > - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> > + if (terminate_assoc)
> > + nvme_fc_start_ioerr_recovery(ctrl, "io error");
>
> this is ok. the ioerr_recovery will bounce the RESETTING state if it's
> already in the state. So this is a little cleaner.
>
> > }
> >
> > static int
> > @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
> > nvme_unquiesce_admin_queue(&ctrl->ctrl);
> > }
> >
> > -static void
> > -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> > -{
> > - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> > -
> > - /*
> > - * if an error (io timeout, etc) while (re)connecting, the remote
> > - * port requested terminating of the association (disconnect_ls)
> > - * or an error (timeout or abort) occurred on an io while creating
> > - * the controller. Abort any ios on the association and let the
> > - * create_association error path resolve things.
> > - */
> > - if (state == NVME_CTRL_CONNECTING) {
> > - __nvme_fc_abort_outstanding_ios(ctrl, true);
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: transport error during (re)connect\n",
> > - ctrl->cnum);
> > - return;
> > - }
>
> This logic needs to be preserved. Its no longer part of
> nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
> "fenced". They should fail immediately.
I think this is similar to the point above.
>
> > -
> > - /* Otherwise, only proceed if in LIVE state - e.g. on first error */
> > - if (state != NVME_CTRL_LIVE)
> > - return;
>
> This was to filter out multiple requests of the reset. I guess that is
> what happens now in start_ioerr when attempting to set state to
> RESETTING and already RESETTING.
Yes. In this case nvme_fc_start_ioerr_recovery() will do nothing.
>
> There is a small difference here in that The existing code avoids doing
> the ctrl reset if the controller is NEW. start_ioerr will change the
> ctrl to RESETTING. I'm not sure how much of an impact that is.
>
I think there is little done while the controller is in NEW state.
Let me know if I am missing something.
>
> > -
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: transport association event: %s\n",
> > - ctrl->cnum, errmsg);
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
>
> I haven't paid much attention, but keeping the transport messages for
> these cases is very very useful for diagnosis.
>
> > -
> > - nvme_reset_ctrl(&ctrl->ctrl);
> > -}
> > -
> > static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> > {
> > struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> > @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> > struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
> > struct nvme_command *sqe = &cmdiu->sqe;
> >
> > - /*
> > - * Attempt to abort the offending command. Command completion
> > - * will detect the aborted io and will fail the connection.
> > - */
> > dev_info(ctrl->ctrl.device,
> > "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
> > "x%08x/x%08x\n",
> > ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
> > nvme_fabrics_opcode_str(qnum, sqe),
> > sqe->common.cdw10, sqe->common.cdw11);
> > - if (__nvme_fc_abort_op(ctrl, op))
> > - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
> >
> > - /*
> > - * the io abort has been initiated. Have the reset timer
> > - * restarted and the abort completion will complete the io
> > - * shortly. Avoids a synchronous wait while the abort finishes.
> > - */
> > + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
>
> Why get rid of the abort logic ?
> Note: the error recovery/controller reset is only called when the abort
> failed.
>
> I believe you should continue to abort the op. The fence logic will
> kick in when the op completes later (along with other io completions).
> If nothing else, it allows a hw resource to be freed up.
The abort logic from nvme_fc_timeout() is problematic and it does not
play well with the abort initiated from ioerr_work or reset_work. The
problem is that an op aborted from nvme_fc_timeout() is not accounted
for when the controller is reset.
Here is an example scenario.

The first time a request times out it gets aborted and we see this
codepath:
nvme_fc_timeout() ->
__nvme_fc_abort_op() ->
atomic_xchg(&op->state, FCPOP_STATE_ABORTED)
ops->abort()
return 0;
nvme_fc_timeout() always returns BLK_EH_RESET_TIMER so the same request
can time out again. If the same request hits the timeout again then
__nvme_fc_abort_op() returns -ECANCELED and nvme_fc_error_recovery()
gets called. Assuming the controller is LIVE, it will be reset.
nvme_fc_reset_ctrl_work() ->
nvme_fc_delete_association() ->
__nvme_fc_abort_outstanding_ios() ->
nvme_fc_terminate_exchange() ->
__nvme_fc_abort_op()
__nvme_fc_abort_op() finds that the op is already aborted. As a result,
ctrl->iocnt will not be incremented for this op. This means that
nvme_fc_delete_association() will not wait for this op to be aborted.
I do not think we want this behavior.

To continue the scenario above: the controller switches to CONNECTING
and the request times out again. This time we hit the deadlock described
in [1].

I think the first abort is the cause of the issue here. With this change
we should not hit the scenario described above.
1 - https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
>
>
> > return BLK_EH_RESET_TIMER;
> > }
> >
> > @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> > }
> > }
> >
> > +static void
> > +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
> > +{
> > + nvme_stop_keep_alive(&ctrl->ctrl);
>
> Curious, why did the stop_keep_alive() call get added to this ?
> Doesn't hurt.
>
> I assume it was due to other transports having it as they originally
> were calling stop_ctrl, but then moved to stop_keep_alive. Shouldn't
> this be followed by flush_work((&ctrl->ctrl.async_event_work) ?
Yes. I added it because it matches what other transports do.
nvme_fc_error_recovery() ->
nvme_fc_delete_association() ->
nvme_fc_abort_aen_ops() ->
nvme_fc_term_aen_ops() ->
cancel_work_sync(&ctrl->ctrl.async_event_work);
The above codepath takes care of async_event_work.
>
> > + nvme_stop_ctrl(&ctrl->ctrl);
> > +
> > + /* will block while waiting for io to terminate */
> > + nvme_fc_delete_association(ctrl);
> > +
> > + /* Do not reconnect if controller is being deleted */
> > + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> > + return;
> > +
> > + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
> > + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
> > + return;
> > + }
> > +
> > + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
> > +}
>
> This code and that in nvme_fc_reset_ctrl_work() need to be collapsed
> into a common helper function invoked by the 2 routines. Also addresses
> the missing flush_delayed work in this routine.
>
Agree, nvme_fc_error_recovery() and nvme_fc_reset_ctrl_work() have
common code that can be refactored. However, I do not plan to do this
as part of this change. I will take a look after I get the CCR work
done.
> >
> > static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
> > .name = "fc",
>
>
> -- James
>
> (new email address. can always reach me at james.smart@broadcom.com as well)
>
On 2/3/2026 4:11 PM, Mohamed Khalfella wrote:
> On Tue 2026-02-03 11:19:28 -0800, James Smart wrote:
>> On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
...
>>>
>>> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
>>> + char *errmsg)
>>> +{
>>> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
>>> + return;
>>> +
>>> + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
>>> + ctrl->cnum, errmsg);
>>> + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
>>> +}
>>> +
>>
>> Disagree with this.
>>
>> The clause in error_recovery around the CONNECTING state is pretty
>> important to terminate io occurring during connect/reconnect where the
>> ctrl state should not change. we don't want start_ioerr making it RESETTING.
>>
>> This should be reworked.
>
> Like you pointed out this changes the current behavior for CONNECTING
> state.
>
> Before this change, as you pointed out the controller state stays in
> CONNECTING while all IOs are aborted. Aborting the IOs causes
> nvme_fc_create_association() to fail and reconnect might be attempted
> again.
> The new behavior switches to RESETTING and queues ctr->ioerr_work.
> ioerr_work will abort oustanding IOs, swich back to CONNECING and
> attempt reconnect.
Well, it won't actually switch to RESETTING, as CONNECTING->RESETTING is
not a valid transition. So things will silently stop in
start_ioerr_recovery when the state transition fails (also a reason I
dislike silent state transition failures).
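For reference, the RESETTING arm of nvme_change_ctrl_state() only allows
the transition from NEW or LIVE (paraphrasing the nvme core from memory,
so double check the tree):

        case NVME_CTRL_RESETTING:
                switch (old_state) {
                case NVME_CTRL_NEW:
                case NVME_CTRL_LIVE:
                        changed = true;
                        break;
                default:
                        break;
                }
                break;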
When I look a little further into patch 13, I see the change to FENCING
added. But that state transition will also fail for CONNECTING->FENCING.
It will then fall into the resetting state change, which will silently
fail, and we're stopped. It says to me there was no consideration or
testing of failures while CONNECTING with this patch set. Even if
RESETTING were allowed, it's injecting a new flow into the code paths.
The CONNECTING issue also applies to tcp and rdma transports. I don't
know if they call the error_recovery routines in the same way.
To be honest I'm not sure I remember the original reasons this loop was
put in, but I do remember the pain I went through when generating it and
the number of test cases that were needed to cover it. It may well be
because I couldn't invoke the reset due to the CONNECTING->RESETTING
block. I'm being pedantic as I still feel residual pain from it.
>
> nvme_fc_error_recovery() ->
> nvme_stop_keep_alive() /* should not make a difference */
> nvme_stop_ctrl() /* should be okay to run */
> nvme_fc_delete_association() ->
> __nvme_fc_abort_outstanding_ios(ctrl, false)
> nvme_unquiesce_admin_queue()
> nvme_unquiesce_io_queues()
> nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)
> if (port_state == ONLINE)
> queue_work(ctrl->connect)
> else
> nvme_fc_reconnect_or_delete();
>
> Yes, this is a different behavior. IMO it is simpler to follow and
> closer to what other transports do, keeping in mind async abort nature
> of fc.
>
> Aside from it is different, what is wrong with it?
See above.
...
>>> static int
>>> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
>>> nvme_unquiesce_admin_queue(&ctrl->ctrl);
>>> }
>>>
>>> -static void
>>> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>>> -{
>>> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>>> -
>>> - /*
>>> - * if an error (io timeout, etc) while (re)connecting, the remote
>>> - * port requested terminating of the association (disconnect_ls)
>>> - * or an error (timeout or abort) occurred on an io while creating
>>> - * the controller. Abort any ios on the association and let the
>>> - * create_association error path resolve things.
>>> - */
>>> - if (state == NVME_CTRL_CONNECTING) {
>>> - __nvme_fc_abort_outstanding_ios(ctrl, true);
>>> - dev_warn(ctrl->ctrl.device,
>>> - "NVME-FC{%d}: transport error during (re)connect\n",
>>> - ctrl->cnum);
>>> - return;
>>> - }
>>
>> This logic needs to be preserved. Its no longer part of
>> nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
>> "fenced". They should fail immediately.
>
> I think this is similar to the point above.
Forgetting whether or not the above "works", what I'm pointing out is
that when in CONNECTING I don't believe you should be enacting the
FENCED state and delaying. For CONNECTING, the cleanup should be
immediate with no delay and no CCR attempt. Only LIVE should transition
to FENCED.
Looking at patch 14, fencing_work calls nvme_fence_ctrl() which
unconditionally delays and tries to do CCR. We only want this if LIVE.
I'll comment on that patch.
>> There is a small difference here in that The existing code avoids doing
>> the ctrl reset if the controller is NEW. start_ioerr will change the
>> ctrl to RESETTING. I'm not sure how much of an impact that is.
>>
>
> I think there is little done while controller in NEW state.
> Let me know if I am missing something.
No - I had to update my understanding, I was really out of date. It used
to be that NEW is what initial controller create was done under.
Everybody does it now under CONNECTING.
...
>>> static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
>>> {
>>> struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
>>> @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
>>> struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
>>> struct nvme_command *sqe = &cmdiu->sqe;
>>>
>>> - /*
>>> - * Attempt to abort the offending command. Command completion
>>> - * will detect the aborted io and will fail the connection.
>>> - */
>>> dev_info(ctrl->ctrl.device,
>>> "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
>>> "x%08x/x%08x\n",
>>> ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
>>> nvme_fabrics_opcode_str(qnum, sqe),
>>> sqe->common.cdw10, sqe->common.cdw11);
>>> - if (__nvme_fc_abort_op(ctrl, op))
>>> - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
>>>
>>> - /*
>>> - * the io abort has been initiated. Have the reset timer
>>> - * restarted and the abort completion will complete the io
>>> - * shortly. Avoids a synchronous wait while the abort finishes.
>>> - */
>>> + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
>>
>> Why get rid of the abort logic ?
>> Note: the error recovery/controller reset is only called when the abort
>> failed.
>>
>> I believe you should continue to abort the op. The fence logic will
>> kick in when the op completes later (along with other io completions).
>> If nothing else, it allows a hw resource to be freed up.
>
> The abort logic from nvme_fc_timeout() is problematic and it does not
> play well with abort initiatored from ioerr_work or reset_work. The
> problem is that op aborted from nvme_fc_timeout() is not accounted for
> when the controller is reset.
Note: I'll wait to be shown otherwise, but if this were true it would
have been horribly broken for a long time.
>
> Here is an example scenario.
>
> The first time a request times out it gets aborted we see this codepath
>
> nvme_fc_timeout() ->
> __nvme_fc_abort_op() ->
> atomic_xchg(&op->state, FCPOP_STATE_ABORTED)
> ops->abort()
> return 0;
There's more than this in the code:

It changes the op state to ABORTED, saving the old opstate.
If the opstate wasn't active - it means something else changed and it
restores the old state (e.g. the aborts for the reset may have hit it).
If it was active (e.g. the aborts from the reset haven't hit it yet) it
checks the ctrl flag to see if the controller is being reset and
tracking io termination (the TERMIO flag) and, if so, increments the
iocnt. So it is "included" in the reset.

If the old state was active, it then sends the ABTS.
If the old state wasn't active (we've been here before or the io was
terminated by the reset) it returns -ECANCELED, which will cause a
controller reset to be attempted if there's not already one in process.
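For reference, the relevant part of __nvme_fc_abort_op() as I remember
it (paraphrased from the tree, so check the source):

static int
__nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
{
        unsigned long flags;
        int opstate;

        spin_lock_irqsave(&ctrl->lock, flags);
        opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED);
        if (opstate != FCPOP_STATE_ACTIVE)
                atomic_set(&op->state, opstate);
        else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
                op->flags |= FCOP_FLAGS_TERMIO;
                ctrl->iocnt++;
        }
        spin_unlock_irqrestore(&ctrl->lock, flags);

        if (opstate != FCPOP_STATE_ACTIVE)
                return -ECANCELED;

        /* op was still active: kick off the ABTS via the LLDD */
        ctrl->lport->ops->fcp_abort(&ctrl->lport->localport,
                                &ctrl->rport->remoteport,
                                op->queue->lldd_handle, &op->fcp_req);

        return 0;
}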
>
> nvme_fc_timeout() always return BLK_EH_RESET_TIMER so the same request
> can timeout again. If the same request hits timeout again then
> __nvme_fc_abort_op() returns -ECANCELED and nvme_fc_error_recovery()
> gets called. Assuming the controller is LIVE it will be reset.
The normal case is that the timeout generates an ABTS. The ABTS usually
completes quickly, with the io completing and the io callback to iodone,
which sees the abort error status and resets the controller. It's very
typical for the ABTS to complete long before the 2nd EH timer times out.

The abnormal case is that the ABTS takes longer to complete than the 2nd
EH timer. Yes, that forces the controller reset. I am aware that some
arrays will delay the ABTS ACC while they terminate the back end, but
there are also frame drop conditions to consider.

If the controller is already resetting, all the above is largely n/a.

I see no reason to avoid the ABTS and wait for a 2nd EH timer to fire.
>
> nvme_fc_reset_ctrl_work() ->
> nvme_fc_delete_association() ->
> __nvme_fc_abort_outstanding_ios() ->
> nvme_fc_terminate_exchange() ->
> __nvme_fc_abort_op()
>
> __nvme_fc_abort_op() finds that op already aborted. As a result of that
> ctrl->iocnt will not be incrmented for this op. This means that
> nvme_fc_delete_association() will not wait for this op to be aborted.
see missing code stmt above.
>
> I do not think we wait this behavior.
>
> To continue the scenario above. The controller switches to CONNECTING
> and the request times out again. This time we hit the deadlock described
> in [1].
>
> I think the first abort is the cause of the issue here. with this change
> we should not hit the scenario described above.
>
> 1 - https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
Something else happened here. You can't get to CONNECTING state unless
all outstanding io was reaped in delete association. What is also harder
to understand is how there was an io to time out if they've all been
reaped and the queues haven't been restarted. A timeout on one of the
ios to instantiate/init the controller maybe, but it shouldn't have been
one of those in the blk layer.
>
>>
>>
>>> return BLK_EH_RESET_TIMER;
>>> }
>>>
>>> @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
>>> }
>>> }
>>>
>>> +static void
>>> +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
>>> +{
>>> + nvme_stop_keep_alive(&ctrl->ctrl);
>>
>> Curious, why did the stop_keep_alive() call get added to this ?
>> Doesn't hurt.
>>
>> I assume it was due to other transports having it as they originally
>> were calling stop_ctrl, but then moved to stop_keep_alive. Shouldn't
>> this be followed by flush_work((&ctrl->ctrl.async_event_work) ?
>
> Yes. I added it because it matches what other transports do.
>
> nvme_fc_error_recovery() ->
> nvme_fc_delete_association() ->
> nvme_fc_abort_aen_ops() ->
> nvme_fc_term_aen_ops() ->
> cancel_work_sync(&ctrl->ctrl.async_event_work);
>
> The above codepath takes care of async_event_work.
True, but the flush_works were added to the other transports for a
reason, so I'm guessing timing matters. So waiting till the later
term_aen call isn't great. But I also guess we haven't had an issue
prior, and since we did take care of it in the aen routines, it's likely
unneeded now. Ok to add it, but if so, we should keep the flush_work as
well. Also good to look the same as the other transports.
>
>>
>>> + nvme_stop_ctrl(&ctrl->ctrl);
>>> +
>>> + /* will block while waiting for io to terminate */
>>> + nvme_fc_delete_association(ctrl);
>>> +
>>> + /* Do not reconnect if controller is being deleted */
>>> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
>>> + return;
>>> +
>>> + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
>>> + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
>>> + return;
>>> + }
>>> +
>>> + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
>>> +}
>>
>> This code and that in nvme_fc_reset_ctrl_work() need to be collapsed
>> into a common helper function invoked by the 2 routines. Also addresses
>> the missing flush_delayed work in this routine.
>>
>
> Agree, nvme_fc_error_recovery() and nvme_fc_reset_ctrl_work() have
> common code that can be refactored. However, I do not plan to do this
> part of this change. I will take a look after I get CCR work done.
Don't put it off. You are adding as much code as the refactoring would
be. Just make the change.
-- james
On Wed 2026-02-04 16:08:12 -0800, James Smart wrote:
> On 2/3/2026 4:11 PM, Mohamed Khalfella wrote:
> > On Tue 2026-02-03 11:19:28 -0800, James Smart wrote:
> >> On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
> ...
> >>>
> >>> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> >>> + char *errmsg)
> >>> +{
> >>> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> >>> + return;
> >>> +
> >>> + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
> >>> + ctrl->cnum, errmsg);
> >>> + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> >>> +}
> >>> +
> >>
> >> Disagree with this.
> >>
> >> The clause in error_recovery around the CONNECTING state is pretty
> >> important to terminate io occurring during connect/reconnect where the
> >> ctrl state should not change. we don't want start_ioerr making it RESETTING.
> >>
> >> This should be reworked.
> >
> > Like you pointed out this changes the current behavior for CONNECTING
> > state.
> >
> > Before this change, as you pointed out the controller state stays in
> > CONNECTING while all IOs are aborted. Aborting the IOs causes
> > nvme_fc_create_association() to fail and reconnect might be attempted
> > again.
> > The new behavior switches to RESETTING and queues ctr->ioerr_work.
> > ioerr_work will abort oustanding IOs, swich back to CONNECING and
> > attempt reconnect.
>
> Well, it won't actually switch to RESETTING, as CONNECTING->RESETTING is
> not a valid transition. So things will silently stop in
> start_ioerr_recovery when the state transition fails (also a reason I
> dislike silent state transition failures).
You are right. I missed the fact that there is no transition from
CONNECTING to RESETTING. I need to go back and revisit this part.
>
> When I look a little further into patch 13, I see the change to FENCING
> added. But that state transition will also fail for CONNECTING->FENCING.
> It will then fall into the resetting state change, which will silently
> fail, and we're stopped. It says to me there was no consideration or
> testing of failures while CONNECTING with this patch set. Even if
> RESETTING were allowed, its injecting a new flow into the code paths.
I tested dropping ADMIN commands on the target side to see CONNECTING
failures. I have not seen issues, but I will revisit this part.
>
> The CONNECTING issue also applies to tcp and rdma transports. I don't
> know if they call the error_recovery routines in the same way.
>
> To be honest I'm not sure I remember the original reasons this loop was
> put in, but I do remember pain I went through when generating it and the
> number of test cases that were needed to cover testing. It may well be
> because I couldn't invoke the reset due to the CONNECTING->RESETTING
> block. I'm being pedantic as I still feel residual pain for it.
>
>
> >
> > nvme_fc_error_recovery() ->
> > nvme_stop_keep_alive() /* should not make a difference */
> > nvme_stop_ctrl() /* should be okay to run */
> > nvme_fc_delete_association() ->
> > __nvme_fc_abort_outstanding_ios(ctrl, false)
> > nvme_unquiesce_admin_queue()
> > nvme_unquiesce_io_queues()
> > nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)
> > if (port_state == ONLINE)
> > queue_work(ctrl->connect)
> > else
> > nvme_fc_reconnect_or_delete();
> >
> > Yes, this is a different behavior. IMO it is simpler to follow and
> > closer to what other transports do, keeping in mind async abort nature
> > of fc.
> >
> > Aside from it is different, what is wrong with it?
>
> See above.
>
> ...
> >>> static int
> >>> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
> >>> nvme_unquiesce_admin_queue(&ctrl->ctrl);
> >>> }
> >>>
> >>> -static void
> >>> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> >>> -{
> >>> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> >>> -
> >>> - /*
> >>> - * if an error (io timeout, etc) while (re)connecting, the remote
> >>> - * port requested terminating of the association (disconnect_ls)
> >>> - * or an error (timeout or abort) occurred on an io while creating
> >>> - * the controller. Abort any ios on the association and let the
> >>> - * create_association error path resolve things.
> >>> - */
> >>> - if (state == NVME_CTRL_CONNECTING) {
> >>> - __nvme_fc_abort_outstanding_ios(ctrl, true);
> >>> - dev_warn(ctrl->ctrl.device,
> >>> - "NVME-FC{%d}: transport error during (re)connect\n",
> >>> - ctrl->cnum);
> >>> - return;
> >>> - }
> >>
> >> This logic needs to be preserved. Its no longer part of
> >> nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
> >> "fenced". They should fail immediately.
> >
> > I think this is similar to the point above.
>
> Forgetting whether or not the above "works", what I'm pointing out is
> that when in CONNECTING I don't believe you should be enacting the
> FENCED state and delaying. For CONNECTING, the cleanup should be
> immediate with no delay and no CCR attempt. Only LIVE should transition
> to FENCED.
>
> Looking at patch 14, fencing_work calls nvme_fence_ctrl() which
> unconditionally delays and tries to do CCR. We only want this if LIVE.
> I'll comment on that patch.
>
>
> >> There is a small difference here in that The existing code avoids doing
> >> the ctrl reset if the controller is NEW. start_ioerr will change the
> >> ctrl to RESETTING. I'm not sure how much of an impact that is.
> >>
> >
> > I think there is little done while controller in NEW state.
> > Let me know if I am missing something.
>
> No - I had to update my understanding I was really out of date. Used to
> be NEW is what initial controller create was done under. Everybody does
> it now under CONNECTING.
>
> ...
> >>> static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> >>> {
> >>> struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> >>> @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> >>> struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
> >>> struct nvme_command *sqe = &cmdiu->sqe;
> >>>
> >>> - /*
> >>> - * Attempt to abort the offending command. Command completion
> >>> - * will detect the aborted io and will fail the connection.
> >>> - */
> >>> dev_info(ctrl->ctrl.device,
> >>> "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
> >>> "x%08x/x%08x\n",
> >>> ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
> >>> nvme_fabrics_opcode_str(qnum, sqe),
> >>> sqe->common.cdw10, sqe->common.cdw11);
> >>> - if (__nvme_fc_abort_op(ctrl, op))
> >>> - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
> >>>
> >>> - /*
> >>> - * the io abort has been initiated. Have the reset timer
> >>> - * restarted and the abort completion will complete the io
> >>> - * shortly. Avoids a synchronous wait while the abort finishes.
> >>> - */
> >>> + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
> >>
> >> Why get rid of the abort logic ?
> >> Note: the error recovery/controller reset is only called when the abort
> >> failed.
> >>
> >> I believe you should continue to abort the op. The fence logic will
> >> kick in when the op completes later (along with other io completions).
> >> If nothing else, it allows a hw resource to be freed up.
> >
> > The abort logic from nvme_fc_timeout() is problematic and it does not
> > play well with abort initiatored from ioerr_work or reset_work. The
> > problem is that op aborted from nvme_fc_timeout() is not accounted for
> > when the controller is reset.
>
> note: I'll wait to be shown otherwise, but if this were true it would be
> horribly broken for a long time.
>
> >
> > Here is an example scenario.
> >
> > The first time a request times out it gets aborted we see this codepath
> >
> > nvme_fc_timeout() ->
> > __nvme_fc_abort_op() ->
> > atomic_xchg(&op->state, FCPOP_STATE_ABORTED)
> > ops->abort()
> > return 0;
>
> there's more than this in in the code:
> it changes op state to ABORTED, saving the old opstate.
> if the opstate wasn't active - it means something else changed and it
> restores the old state (e.g. the aborts for the reset may have hit it).
> if it was active (e.g. the aborts the reset haven't hit it yet) it
> checks the ctlr flag to see if the controller is being reset and
> tracking io termination (the TERMIO flag) and if so, increments the
> iocnt. So it is "included" in the reset.
>
> if old state was active, it then sends the ABTS.
> if old state wasn't active (we've been here before or io terminated by
> reset) it returns -ECANCELED, which will cause a controller reset to be
> attempted if there's not already one in process.
>
>
> >
> > nvme_fc_timeout() always return BLK_EH_RESET_TIMER so the same request
> > can timeout again. If the same request hits timeout again then
> > __nvme_fc_abort_op() returns -ECANCELED and nvme_fc_error_recovery()
> > gets called. Assuming the controller is LIVE it will be reset.
>
> The normal case is timeout generates ABTS. ABTS usually completes
> quickly with the io completing and the io callback to iodone, which sees
> abort error status and resets controller. Its very typical for the ABTS
> to complete long before the 2nd EH timer timing out.
>
> Abnormal case is ABTS takes longer to complete than the 2nd EH timer
> timing. Yes, that forces the controller reset. I am aware that some
> arrays will delay ABTS ACC while they terminate the back end, but there
> are also frame drop conditions to consider.
>
> if the controller is already resetting, all the above is largely n/a.
>
> I see no reason to avoid the ABTS and wait for a 2nd EH timer to fire.
>
> >
> > nvme_fc_reset_ctrl_work() ->
> > nvme_fc_delete_association() ->
> > __nvme_fc_abort_outstanding_ios() ->
> > nvme_fc_terminate_exchange() ->
> > __nvme_fc_abort_op()
> >
> > __nvme_fc_abort_op() finds that op already aborted. As a result of that
> > ctrl->iocnt will not be incrmented for this op. This means that
> > nvme_fc_delete_association() will not wait for this op to be aborted.
>
> see missing code stmt above.
>
> >
> > I do not think we wait this behavior.
> >
> > To continue the scenario above. The controller switches to CONNECTING
> > and the request times out again. This time we hit the deadlock described
> > in [1].
> >
> > I think the first abort is the cause of the issue here. with this change
> > we should not hit the scenario described above.
> >
> > 1 - https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
>
> Something else happened here. You can't get to CONNECTING state unless
> all outstanding io was reaped in delete association. What is also harder
> to understand is how there was an io to timeout if they've all been
> reaped and queues haven't been restarted. Timeout on one of the ios to
> instatiate/init the controller maybe, but it shouldn't have been one of
> those in the blk layer.
I will revisit this issue and hopefully provide more information.
>
> >
> >>
> >>
> >>> return BLK_EH_RESET_TIMER;
> >>> }
> >>>
> >>> @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> >>> }
> >>> }
> >>>
> >>> +static void
> >>> +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
> >>> +{
> >>> + nvme_stop_keep_alive(&ctrl->ctrl);
> >>
> >> Curious, why did the stop_keep_alive() call get added to this ?
> >> Doesn't hurt.
> >>
> >> I assume it was due to other transports having it as they originally
> >> were calling stop_ctrl, but then moved to stop_keep_alive. Shouldn't
> >> this be followed by flush_work((&ctrl->ctrl.async_event_work) ?
> >
> > Yes. I added it because it matches what other transports do.
> >
> > nvme_fc_error_recovery() ->
> > nvme_fc_delete_association() ->
> > nvme_fc_abort_aen_ops() ->
> > nvme_fc_term_aen_ops() ->
> > cancel_work_sync(&ctrl->ctrl.async_event_work);
> >
> > The above codepath takes care of async_event_work.
>
> True, but the flush_works were added for a reason to the other
> transports so I'm guessing timing matters. So waiting till ther later
> term_aen call isn't great. But I also guess, we haven't had an issue
> prior and since we did take care if it in the aen routines, its likely
> unneeded now. Ok to add it but if so, we should keep the flush_work as
> well. Also good to look same as the other transports.
It does not hurt. Maybe I am missing something. I can put it back just
to be safe.
>
> >
> >>
> >>> + nvme_stop_ctrl(&ctrl->ctrl);
> >>> +
> >>> + /* will block while waiting for io to terminate */
> >>> + nvme_fc_delete_association(ctrl);
> >>> +
> >>> + /* Do not reconnect if controller is being deleted */
> >>> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> >>> + return;
> >>> +
> >>> + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
> >>> + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
> >>> + return;
> >>> + }
> >>> +
> >>> + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
> >>> +}
> >>
> >> This code and that in nvme_fc_reset_ctrl_work() need to be collapsed
> >> into a common helper function invoked by the 2 routines. Also addresses
> >> the missing flush_delayed work in this routine.
> >>
> >
> > Agree, nvme_fc_error_recovery() and nvme_fc_reset_ctrl_work() have
> > common code that can be refactored. However, I do not plan to do this
> > part of this change. I will take a look after I get CCR work done.
>
> Don't put it off. You are adding as much code as the refactoring is.
> Just make the change.
Okay. I will revisit this change in light of the CONNECTING issue and
see if I can merge the two codepaths.
>
> -- james
>
>
On 2/3/2026 11:19 AM, James Smart wrote:
> On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
...
>> static void
>> nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
>> {
>> @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
>> nvme_fc_complete_rq(rq);
>> check_error:
>> - if (terminate_assoc &&
>> - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
>> - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
>> + if (terminate_assoc)
>> + nvme_fc_start_ioerr_recovery(ctrl, "io error");
>
> this is ok. the ioerr_recovery will bounce the RESETTING state if it's
> already in the state. So this is a little cleaner.
What is problematic here is that if the start_ioerr path includes the
CONNECTING logic that terminates i/o's, it's running in the LLDD's
context that called this iodone routine. Not good. In the existing code,
the LLDD context was swapped to the work queue where error_recovery was
called.
>
>> }
>> static int
>> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct
>> nvme_fc_ctrl *ctrl, bool start_queues)
>> nvme_unquiesce_admin_queue(&ctrl->ctrl);
>> }
>> -static void
>> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
>> -{
>> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
>> -
>> - /*
>> - * if an error (io timeout, etc) while (re)connecting, the remote
>> - * port requested terminating of the association (disconnect_ls)
>> - * or an error (timeout or abort) occurred on an io while creating
>> - * the controller. Abort any ios on the association and let the
>> - * create_association error path resolve things.
>> - */
>> - if (state == NVME_CTRL_CONNECTING) {
>> - __nvme_fc_abort_outstanding_ios(ctrl, true);
>> - dev_warn(ctrl->ctrl.device,
>> - "NVME-FC{%d}: transport error during (re)connect\n",
>> - ctrl->cnum);
>> - return;
>> - }
>
> This logic needs to be preserved. Its no longer part of
> nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
> "fenced". They should fail immediately.
this logic, if left in start_ioerr_recovery
-- james
On Tue 2026-02-03 14:49:01 -0800, James Smart wrote:
> On 2/3/2026 11:19 AM, James Smart wrote:
> > On 1/30/2026 2:34 PM, Mohamed Khalfella wrote:
> ...
> >> static void
> >> nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> >> {
> >> @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> >> nvme_fc_complete_rq(rq);
> >> check_error:
> >> - if (terminate_assoc &&
> >> - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
> >> - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> >> + if (terminate_assoc)
> >> + nvme_fc_start_ioerr_recovery(ctrl, "io error");
> >
> > this is ok. the ioerr_recovery will bounce the RESETTING state if it's
> > already in the state. So this is a little cleaner.
>
> What is problematic here is that if the start_ioerr path includes the
> CONNECTING logic that terminates I/Os, it runs in the LLDD's context
> that called this iodone routine. Not good. In the existing code, the
> LLDD context was swapped to the work queue where error_recovery was called.
nvme_fc_start_ioerr_recovery() does not do the work in LLDD context. It
queues ctrl->ioerr_work. This is similar to the existing code. I responded to
the issue with the CONNECTING state in another email.
>
> >
> >> }
> >> static int
> >> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct
> >> nvme_fc_ctrl *ctrl, bool start_queues)
> >> nvme_unquiesce_admin_queue(&ctrl->ctrl);
> >> }
> >> -static void
> >> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> >> -{
> >> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> >> -
> >> - /*
> >> - * if an error (io timeout, etc) while (re)connecting, the remote
> >> - * port requested terminating of the association (disconnect_ls)
> >> - * or an error (timeout or abort) occurred on an io while creating
> >> - * the controller. Abort any ios on the association and let the
> >> - * create_association error path resolve things.
> >> - */
> >> - if (state == NVME_CTRL_CONNECTING) {
> >> - __nvme_fc_abort_outstanding_ios(ctrl, true);
> >> - dev_warn(ctrl->ctrl.device,
> >> - "NVME-FC{%d}: transport error during (re)connect\n",
> >> - ctrl->cnum);
> >> - return;
> >> - }
> >
> > This logic needs to be preserved. It's no longer part of
> > nvme_fc_start_ioerr_recovery(). Failures during CONNECTING should not be
> > "fenced". They should fail immediately.
>
> this logic, if left in start_ioerr_recovery
I think it should be okay to rely on error recovery to handle this
situation.
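
For reference, the flow being relied on here, as posted in this patch
(descriptive only; whether it adequately covers the CONNECTING case is
exactly the open question above):

/*
 * nvme_fc_timeout()
 *   -> nvme_fc_start_ioerr_recovery(ctrl, "io timeout")
 *        nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)
 *        queue_work(nvme_reset_wq, &ctrl->ioerr_work)  (only if the
 *        state change succeeds)
 *   -> nvme_fc_ctrl_ioerr_work()              runs on nvme_reset_wq
 *   -> nvme_fc_error_recovery()
 *        nvme_stop_keep_alive(), nvme_stop_ctrl()
 *        nvme_fc_delete_association()         blocks until ios terminate
 *        nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)
 *        queue connect_work or nvme_fc_reconnect_or_delete()
 */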
>
>
> -- james
On 1/30/26 23:34, Mohamed Khalfella wrote:
> nvme_fc_error_recovery() called from nvme_fc_timeout() while controller
> in CONNECTING state results in deadlock reported in link below. Update
> nvme_fc_timeout() to schedule error recovery to avoid the deadlock.
>
> Previous to this change if controller was LIVE error recovery resets
> the controller and this does not match nvme-tcp and nvme-rdma. Decouple
> error recovery from controller reset to match other fabric transports.
>
> Link: https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/fc.c | 94 ++++++++++++++++++------------------------
> 1 file changed, 41 insertions(+), 53 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 6948de3f438a..f8f6071b78ed 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -227,6 +227,8 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
> static struct device *fc_udev_device;
>
> static void nvme_fc_complete_rq(struct request *rq);
> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> + char *errmsg);
>
> /* *********************** FC-NVME Port Management ************************ */
>
> @@ -788,7 +790,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> "Reconnect", ctrl->cnum);
>
> set_bit(ASSOC_FAILED, &ctrl->flags);
> - nvme_reset_ctrl(&ctrl->ctrl);
> + nvme_fc_start_ioerr_recovery(ctrl, "Connectivity Loss");
> }
>
> /**
> @@ -985,7 +987,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
> static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
>
> -static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
> +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl);
>
> static void
> __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
> @@ -1567,9 +1569,8 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
> * for the association have been ABTS'd by
> * nvme_fc_delete_association().
> */
> -
> - /* fail the association */
> - nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
> + nvme_fc_start_ioerr_recovery(ctrl,
> + "Disconnect Association LS received");
>
> /* release the reference taken by nvme_fc_match_disconn_ls() */
> nvme_fc_ctrl_put(ctrl);
> @@ -1871,7 +1872,7 @@ nvme_fc_ctrl_ioerr_work(struct work_struct *work)
> struct nvme_fc_ctrl *ctrl =
> container_of(work, struct nvme_fc_ctrl, ioerr_work);
>
> - nvme_fc_error_recovery(ctrl, "transport detected io error");
> + nvme_fc_error_recovery(ctrl);
> }
>
> /*
> @@ -1892,6 +1893,17 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
> }
> EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
>
> +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> + char *errmsg)
> +{
> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> + return;
> +
> + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
> + ctrl->cnum, errmsg);
> + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> +}
> +
> static void
> nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> {
> @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> nvme_fc_complete_rq(rq);
>
> check_error:
> - if (terminate_assoc &&
> - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
> - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> + if (terminate_assoc)
> + nvme_fc_start_ioerr_recovery(ctrl, "io error");
> }
>
> static int
> @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
> nvme_unquiesce_admin_queue(&ctrl->ctrl);
> }
>
> -static void
> -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> -{
> - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> -
> - /*
> - * if an error (io timeout, etc) while (re)connecting, the remote
> - * port requested terminating of the association (disconnect_ls)
> - * or an error (timeout or abort) occurred on an io while creating
> - * the controller. Abort any ios on the association and let the
> - * create_association error path resolve things.
> - */
> - if (state == NVME_CTRL_CONNECTING) {
> - __nvme_fc_abort_outstanding_ios(ctrl, true);
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: transport error during (re)connect\n",
> - ctrl->cnum);
> - return;
> - }
> -
> - /* Otherwise, only proceed if in LIVE state - e.g. on first error */
> - if (state != NVME_CTRL_LIVE)
> - return;
> -
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: transport association event: %s\n",
> - ctrl->cnum, errmsg);
> - dev_warn(ctrl->ctrl.device,
> - "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
> -
> - nvme_reset_ctrl(&ctrl->ctrl);
> -}
> -
> static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> {
> struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
> struct nvme_command *sqe = &cmdiu->sqe;
>
> - /*
> - * Attempt to abort the offending command. Command completion
> - * will detect the aborted io and will fail the connection.
> - */
> dev_info(ctrl->ctrl.device,
> "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
> "x%08x/x%08x\n",
> ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
> nvme_fabrics_opcode_str(qnum, sqe),
> sqe->common.cdw10, sqe->common.cdw11);
> - if (__nvme_fc_abort_op(ctrl, op))
> - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
>
> - /*
> - * the io abort has been initiated. Have the reset timer
> - * restarted and the abort completion will complete the io
> - * shortly. Avoids a synchronous wait while the abort finishes.
> - */
> + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> }
> }
>
> +static void
> +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
> +{
> + nvme_stop_keep_alive(&ctrl->ctrl);
> + nvme_stop_ctrl(&ctrl->ctrl);
> +
> + /* will block while waiting for io to terminate */
> + nvme_fc_delete_association(ctrl);
> +
> + /* Do not reconnect if controller is being deleted */
> + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> + return;
> +
> + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
> + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
> + return;
> + }
> +
> + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
> +}
>
> static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
> .name = "fc",
I really don't get it. Why do you need to do additional steps here, when
all you do is split an existing function in half?
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Tue 2026-02-03 06:40:28 +0100, Hannes Reinecke wrote:
> On 1/30/26 23:34, Mohamed Khalfella wrote:
> > nvme_fc_error_recovery() called from nvme_fc_timeout() while controller
> > in CONNECTING state results in deadlock reported in link below. Update
> > nvme_fc_timeout() to schedule error recovery to avoid the deadlock.
> >
> > Previous to this change if controller was LIVE error recovery resets
> > the controller and this does not match nvme-tcp and nvme-rdma. Decouple
> > error recovery from controller reset to match other fabric transports.
> >
> > Link: https://lore.kernel.org/all/20250529214928.2112990-1-mkhalfella@purestorage.com/
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> > drivers/nvme/host/fc.c | 94 ++++++++++++++++++------------------------
> > 1 file changed, 41 insertions(+), 53 deletions(-)
> >
> > diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> > index 6948de3f438a..f8f6071b78ed 100644
> > --- a/drivers/nvme/host/fc.c
> > +++ b/drivers/nvme/host/fc.c
> > @@ -227,6 +227,8 @@ static DEFINE_IDA(nvme_fc_ctrl_cnt);
> > static struct device *fc_udev_device;
> >
> > static void nvme_fc_complete_rq(struct request *rq);
> > +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> > + char *errmsg);
> >
> > /* *********************** FC-NVME Port Management ************************ */
> >
> > @@ -788,7 +790,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl)
> > "Reconnect", ctrl->cnum);
> >
> > set_bit(ASSOC_FAILED, &ctrl->flags);
> > - nvme_reset_ctrl(&ctrl->ctrl);
> > + nvme_fc_start_ioerr_recovery(ctrl, "Connectivity Loss");
> > }
> >
> > /**
> > @@ -985,7 +987,7 @@ fc_dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
> > static void nvme_fc_ctrl_put(struct nvme_fc_ctrl *);
> > static int nvme_fc_ctrl_get(struct nvme_fc_ctrl *);
> >
> > -static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg);
> > +static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl);
> >
> > static void
> > __nvme_fc_finish_ls_req(struct nvmefc_ls_req_op *lsop)
> > @@ -1567,9 +1569,8 @@ nvme_fc_ls_disconnect_assoc(struct nvmefc_ls_rcv_op *lsop)
> > * for the association have been ABTS'd by
> > * nvme_fc_delete_association().
> > */
> > -
> > - /* fail the association */
> > - nvme_fc_error_recovery(ctrl, "Disconnect Association LS received");
> > + nvme_fc_start_ioerr_recovery(ctrl,
> > + "Disconnect Association LS received");
> >
> > /* release the reference taken by nvme_fc_match_disconn_ls() */
> > nvme_fc_ctrl_put(ctrl);
> > @@ -1871,7 +1872,7 @@ nvme_fc_ctrl_ioerr_work(struct work_struct *work)
> > struct nvme_fc_ctrl *ctrl =
> > container_of(work, struct nvme_fc_ctrl, ioerr_work);
> >
> > - nvme_fc_error_recovery(ctrl, "transport detected io error");
> > + nvme_fc_error_recovery(ctrl);
> > }
> >
> > /*
> > @@ -1892,6 +1893,17 @@ char *nvme_fc_io_getuuid(struct nvmefc_fcp_req *req)
> > }
> > EXPORT_SYMBOL_GPL(nvme_fc_io_getuuid);
> >
> > +static void nvme_fc_start_ioerr_recovery(struct nvme_fc_ctrl *ctrl,
> > + char *errmsg)
> > +{
> > + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> > + return;
> > +
> > + dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: starting error recovery %s\n",
> > + ctrl->cnum, errmsg);
> > + queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> > +}
> > +
> > static void
> > nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> > {
> > @@ -2049,9 +2061,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> > nvme_fc_complete_rq(rq);
> >
> > check_error:
> > - if (terminate_assoc &&
> > - nvme_ctrl_state(&ctrl->ctrl) != NVME_CTRL_RESETTING)
> > - queue_work(nvme_reset_wq, &ctrl->ioerr_work);
> > + if (terminate_assoc)
> > + nvme_fc_start_ioerr_recovery(ctrl, "io error");
> > }
> >
> > static int
> > @@ -2495,39 +2506,6 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
> > nvme_unquiesce_admin_queue(&ctrl->ctrl);
> > }
> >
> > -static void
> > -nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg)
> > -{
> > - enum nvme_ctrl_state state = nvme_ctrl_state(&ctrl->ctrl);
> > -
> > - /*
> > - * if an error (io timeout, etc) while (re)connecting, the remote
> > - * port requested terminating of the association (disconnect_ls)
> > - * or an error (timeout or abort) occurred on an io while creating
> > - * the controller. Abort any ios on the association and let the
> > - * create_association error path resolve things.
> > - */
> > - if (state == NVME_CTRL_CONNECTING) {
> > - __nvme_fc_abort_outstanding_ios(ctrl, true);
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: transport error during (re)connect\n",
> > - ctrl->cnum);
> > - return;
> > - }
> > -
> > - /* Otherwise, only proceed if in LIVE state - e.g. on first error */
> > - if (state != NVME_CTRL_LIVE)
> > - return;
> > -
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: transport association event: %s\n",
> > - ctrl->cnum, errmsg);
> > - dev_warn(ctrl->ctrl.device,
> > - "NVME-FC{%d}: resetting controller\n", ctrl->cnum);
> > -
> > - nvme_reset_ctrl(&ctrl->ctrl);
> > -}
> > -
> > static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> > {
> > struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq);
> > @@ -2536,24 +2514,14 @@ static enum blk_eh_timer_return nvme_fc_timeout(struct request *rq)
> > struct nvme_fc_cmd_iu *cmdiu = &op->cmd_iu;
> > struct nvme_command *sqe = &cmdiu->sqe;
> >
> > - /*
> > - * Attempt to abort the offending command. Command completion
> > - * will detect the aborted io and will fail the connection.
> > - */
> > dev_info(ctrl->ctrl.device,
> > "NVME-FC{%d.%d}: io timeout: opcode %d fctype %d (%s) w10/11: "
> > "x%08x/x%08x\n",
> > ctrl->cnum, qnum, sqe->common.opcode, sqe->fabrics.fctype,
> > nvme_fabrics_opcode_str(qnum, sqe),
> > sqe->common.cdw10, sqe->common.cdw11);
> > - if (__nvme_fc_abort_op(ctrl, op))
> > - nvme_fc_error_recovery(ctrl, "io timeout abort failed");
> >
> > - /*
> > - * the io abort has been initiated. Have the reset timer
> > - * restarted and the abort completion will complete the io
> > - * shortly. Avoids a synchronous wait while the abort finishes.
> > - */
> > + nvme_fc_start_ioerr_recovery(ctrl, "io timeout");
> > return BLK_EH_RESET_TIMER;
> > }
> >
> > @@ -3352,6 +3320,26 @@ nvme_fc_reset_ctrl_work(struct work_struct *work)
> > }
> > }
> >
> > +static void
> > +nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl)
> > +{
> > + nvme_stop_keep_alive(&ctrl->ctrl);
> > + nvme_stop_ctrl(&ctrl->ctrl);
> > +
> > + /* will block while waiting for io to terminate */
> > + nvme_fc_delete_association(ctrl);
> > +
> > + /* Do not reconnect if controller is being deleted */
> > + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
> > + return;
> > +
> > + if (ctrl->rport->remoteport.port_state == FC_OBJSTATE_ONLINE) {
> > + queue_delayed_work(nvme_wq, &ctrl->connect_work, 0);
> > + return;
> > + }
> > +
> > + nvme_fc_reconnect_or_delete(ctrl, -ENOTCONN);
> > +}
> >
> > static const struct nvme_ctrl_ops nvme_fc_ctrl_ops = {
> > .name = "fc",
>
> I really don't get it. Why do you need to do additional steps here, when
> all you do is split an existing function in half?
>
Can you help me and point out the part that is additional?
Is it nvme_stop_keep_alive()? If so, it matches the other transports.
I am okay with removing it and assuming that the keep-alive work will stop
when it hits an error, like it does today.
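
For comparison, and paraphrased from memory rather than quoted from the
current tree (field and helper names may differ slightly), nvme-tcp's error
recovery work also starts by stopping keep-alive before tearing things down,
which is where the nvme_stop_keep_alive() call here comes from:

static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl =
		container_of(work, struct nvme_tcp_ctrl, err_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	nvme_stop_keep_alive(ctrl);
	/*
	 * ... io/admin queue teardown, the transition to CONNECTING, and
	 * the reconnect-or-remove decision elided ...
	 */
}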
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke Kernel Storage Architect
> hare@suse.de +49 911 74053 688
> SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
> HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich