While in FENCING state, aborted inflight IOs should be held until fencing
is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
requests with transport errors. These held requests will be canceled in
nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
avoids racing with canceling aborted requests by making sure we complete
successful requests before waking up the waiting thread.
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
drivers/nvme/host/fc.c | 61 +++++++++++++++++++++++++++++++++++-------
1 file changed, 51 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6ebabfb7e76d..e605dd3f4a40 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -172,7 +172,7 @@ struct nvme_fc_ctrl {
struct kref ref;
unsigned long flags;
- u32 iocnt;
+ atomic_t iocnt;
wait_queue_head_t ioabort_wait;
struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
@@ -1823,7 +1823,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
atomic_set(&op->state, opstate);
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
op->flags |= FCOP_FLAGS_TERMIO;
- ctrl->iocnt++;
+ atomic_inc(&ctrl->iocnt);
}
spin_unlock_irqrestore(&ctrl->lock, flags);
@@ -1853,20 +1853,29 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
}
static inline void
+__nvme_fc_fcpop_count_one_down(struct nvme_fc_ctrl *ctrl)
+{
+ if (atomic_dec_return(&ctrl->iocnt) == 0)
+ wake_up(&ctrl->ioabort_wait);
+}
+
+static inline bool
__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
struct nvme_fc_fcp_op *op, int opstate)
{
unsigned long flags;
+ bool ret = false;
if (opstate == FCPOP_STATE_ABORTED) {
spin_lock_irqsave(&ctrl->lock, flags);
if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
op->flags & FCOP_FLAGS_TERMIO) {
- if (!--ctrl->iocnt)
- wake_up(&ctrl->ioabort_wait);
+ ret = true;
}
spin_unlock_irqrestore(&ctrl->lock, flags);
}
+
+ return ret;
}
static void nvme_fc_fencing_work(struct work_struct *work)
@@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_command *sqe = &op->cmd_iu.sqe;
__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
union nvme_result result;
- bool terminate_assoc = true;
+ bool op_term, terminate_assoc = true;
+ enum nvme_ctrl_state state;
int opstate;
/*
@@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
done:
if (op->flags & FCOP_FLAGS_AEN) {
nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
+ __nvme_fc_fcpop_count_one_down(ctrl);
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */
nvme_fc_ctrl_put(ctrl);
goto check_error;
}
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ /*
+ * We can not access op after the request is completed because it can
+ * be reused immediately. At the same time we want to wakeup the thread
+ * waiting for ongoing IOs _after_ requests are completed. This is
+ * necessary because that thread will start canceling inflight IOs
+ * and we want to avoid request completion racing with cancellation.
+ */
+ op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+
+ /*
+ * If we are going to terminate associations and the controller is
+ * LIVE or FENCING, then do not complete this request now. Let error
+ * recovery cancel this request when it is safe to do so.
+ */
+ state = nvme_ctrl_state(&ctrl->ctrl);
+ if (terminate_assoc &&
+ (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
+ goto check_op_term;
+
if (!nvme_try_complete_req(rq, status, result))
nvme_fc_complete_rq(rq);
+check_op_term:
+ if (op_term)
+ __nvme_fc_fcpop_count_one_down(ctrl);
check_error:
if (terminate_assoc)
@@ -2750,7 +2782,8 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
* cmd with the csn was supposed to arrive.
*/
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
+ __nvme_fc_fcpop_count_one_down(ctrl);
if (!(op->flags & FCOP_FLAGS_AEN)) {
nvme_fc_unmap_data(ctrl, op->rq, op);
@@ -3219,7 +3252,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
spin_lock_irqsave(&ctrl->lock, flags);
set_bit(FCCTRL_TERMIO, &ctrl->flags);
- ctrl->iocnt = 0;
+ atomic_set(&ctrl->iocnt, 0);
spin_unlock_irqrestore(&ctrl->lock, flags);
__nvme_fc_abort_outstanding_ios(ctrl, false);
@@ -3228,11 +3261,19 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_abort_aen_ops(ctrl);
/* wait for all io that had to be aborted */
+ wait_event(ctrl->ioabort_wait, atomic_read(&ctrl->iocnt) == 0);
spin_lock_irq(&ctrl->lock);
- wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
clear_bit(FCCTRL_TERMIO, &ctrl->flags);
spin_unlock_irq(&ctrl->lock);
+ /*
+ * At this point all inflight requests have been successfully
+ * aborted. Now it is safe to cancel all requests we decided
+ * not to complete in nvme_fc_fcpio_done().
+ */
+ nvme_cancel_tagset(&ctrl->ctrl);
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
+
nvme_fc_term_aen_ops(ctrl);
/*
--
2.52.0
On 2/13/2026 8:25 PM, Mohamed Khalfella wrote:
> While in FENCING state, aborted inflight IOs should be held until fencing
> is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
> requests with transport errors. These held requests will be canceled in
> nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
> avoids racing with canceling aborted requests by making sure we complete
> successful requests before waking up the waiting thread.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/fc.c | 61 +++++++++++++++++++++++++++++++++++-------
> 1 file changed, 51 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 6ebabfb7e76d..e605dd3f4a40 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -172,7 +172,7 @@ struct nvme_fc_ctrl {
>
> struct kref ref;
> unsigned long flags;
> - u32 iocnt;
> + atomic_t iocnt;
> wait_queue_head_t ioabort_wait;
>
> struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
> @@ -1823,7 +1823,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
> atomic_set(&op->state, opstate);
> else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
> op->flags |= FCOP_FLAGS_TERMIO;
> - ctrl->iocnt++;
> + atomic_inc(&ctrl->iocnt);
The atomic change is probably what corrects the deadlocks you saw.
> }
> spin_unlock_irqrestore(&ctrl->lock, flags);
>
> @@ -1853,20 +1853,29 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
> }
>
> static inline void
> +__nvme_fc_fcpop_count_one_down(struct nvme_fc_ctrl *ctrl)
> +{
> + if (atomic_dec_return(&ctrl->iocnt) == 0)
> + wake_up(&ctrl->ioabort_wait);
> +}
> +
> +static inline bool
> __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
> struct nvme_fc_fcp_op *op, int opstate)
> {
> unsigned long flags;
> + bool ret = false;
>
> if (opstate == FCPOP_STATE_ABORTED) {
> spin_lock_irqsave(&ctrl->lock, flags);
> if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
> op->flags & FCOP_FLAGS_TERMIO) {
> - if (!--ctrl->iocnt)
> - wake_up(&ctrl->ioabort_wait);
> + ret = true;
> }
> spin_unlock_irqrestore(&ctrl->lock, flags);
> }
> +
> + return ret;
> }
>
> static void nvme_fc_fencing_work(struct work_struct *work)
> @@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> struct nvme_command *sqe = &op->cmd_iu.sqe;
> __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
> union nvme_result result;
> - bool terminate_assoc = true;
> + bool op_term, terminate_assoc = true;
> + enum nvme_ctrl_state state;
> int opstate;
>
> /*
> @@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> done:
> if (op->flags & FCOP_FLAGS_AEN) {
> nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
> atomic_set(&op->state, FCPOP_STATE_IDLE);
> op->flags = FCOP_FLAGS_AEN; /* clear other flags */
> nvme_fc_ctrl_put(ctrl);
> goto check_error;
> }
>
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + /*
> + * We can not access op after the request is completed because it can
> + * be reused immediately. At the same time we want to wakeup the thread
> + * waiting for ongoing IOs _after_ requests are completed. This is
> + * necessary because that thread will start canceling inflight IOs
> + * and we want to avoid request completion racing with cancellation.
> + */
> + op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> +
> + /*
> + * If we are going to terminate associations and the controller is
> + * LIVE or FENCING, then do not complete this request now. Let error
> + * recovery cancel this request when it is safe to do so.
> + */
> + state = nvme_ctrl_state(&ctrl->ctrl);
> + if (terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
> + goto check_op_term;
> +
> if (!nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
> +check_op_term:
> + if (op_term)
> + __nvme_fc_fcpop_count_one_down(ctrl);
>
> check_error:
> if (terminate_assoc)
> @@ -2750,7 +2782,8 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
> * cmd with the csn was supposed to arrive.
> */
> opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
>
> if (!(op->flags & FCOP_FLAGS_AEN)) {
> nvme_fc_unmap_data(ctrl, op->rq, op);
> @@ -3219,7 +3252,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
>
> spin_lock_irqsave(&ctrl->lock, flags);
> set_bit(FCCTRL_TERMIO, &ctrl->flags);
> - ctrl->iocnt = 0;
> + atomic_set(&ctrl->iocnt, 0);
> spin_unlock_irqrestore(&ctrl->lock, flags);
>
> __nvme_fc_abort_outstanding_ios(ctrl, false);
> @@ -3228,11 +3261,19 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
> nvme_fc_abort_aen_ops(ctrl);
>
> /* wait for all io that had to be aborted */
> + wait_event(ctrl->ioabort_wait, atomic_read(&ctrl->iocnt) == 0);
> spin_lock_irq(&ctrl->lock);
> - wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
> clear_bit(FCCTRL_TERMIO, &ctrl->flags);
> spin_unlock_irq(&ctrl->lock);
>
> + /*
> + * At this point all inflight requests have been successfully
> + * aborted. Now it is safe to cancel all requests we decided
> + * not to complete in nvme_fc_fcpio_done().
> + */
> + nvme_cancel_tagset(&ctrl->ctrl);
> + nvme_cancel_admin_tagset(&ctrl->ctrl);
> +
> nvme_fc_term_aen_ops(ctrl);
>
> /*
This looks good
Reviewed-by: James Smart <jsmart833426@gmail.com>
-- james
On Fri, Feb 13, 2026 at 8:28 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> While in FENCING state, aborted inflight IOs should be held until fencing
> is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
> requests with transport errors. These held requests will be canceled in
> nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
> avoids racing with canceling aborted requests by making sure we complete
> successful requests before waking up the waiting thread.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> static void nvme_fc_fencing_work(struct work_struct *work)
> @@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> struct nvme_command *sqe = &op->cmd_iu.sqe;
> __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
> union nvme_result result;
> - bool terminate_assoc = true;
> + bool op_term, terminate_assoc = true;
> + enum nvme_ctrl_state state;
> int opstate;
>
> /*
> @@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> done:
> if (op->flags & FCOP_FLAGS_AEN) {
> nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
> atomic_set(&op->state, FCPOP_STATE_IDLE);
> op->flags = FCOP_FLAGS_AEN; /* clear other flags */
> nvme_fc_ctrl_put(ctrl);
> goto check_error;
> }
>
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + /*
> + * We can not access op after the request is completed because it can
> + * be reused immediately. At the same time we want to wakeup the thread
> + * waiting for ongoing IOs _after_ requests are completed. This is
> + * necessary because that thread will start canceling inflight IOs
> + * and we want to avoid request completion racing with cancellation.
> + */
> + op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> +
> + /*
> + * If we are going to terminate associations and the controller is
> + * LIVE or FENCING, then do not complete this request now. Let error
> + * recovery cancel this request when it is safe to do so.
> + */
> + state = nvme_ctrl_state(&ctrl->ctrl);
> + if (terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
> + goto check_op_term;
> +
> if (!nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
> +check_op_term:
> + if (op_term)
> + __nvme_fc_fcpop_count_one_down(ctrl);
Although it is a more complicated boolean expression, I think it is
easier to grok:
> + if (!(terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING)) &&
> + !nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
resulting in one less goto.
Sincerely,
Randy Jennings
© 2016 - 2026 Red Hat, Inc.