While in FENCING state, aborted inflight IOs should be held until fencing
is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
requests with transport errors. These held requests will be canceled in
nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
avoids racing with canceling aborted requests by making sure we complete
successful requests before waking up the waiting thread.
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
drivers/nvme/host/fc.c | 61 +++++++++++++++++++++++++++++++++++-------
1 file changed, 51 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 6ebabfb7e76d..e605dd3f4a40 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -172,7 +172,7 @@ struct nvme_fc_ctrl {
struct kref ref;
unsigned long flags;
- u32 iocnt;
+ atomic_t iocnt;
wait_queue_head_t ioabort_wait;
struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
@@ -1823,7 +1823,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
atomic_set(&op->state, opstate);
else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
op->flags |= FCOP_FLAGS_TERMIO;
- ctrl->iocnt++;
+ atomic_inc(&ctrl->iocnt);
}
spin_unlock_irqrestore(&ctrl->lock, flags);
@@ -1853,20 +1853,29 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
}
static inline void
+__nvme_fc_fcpop_count_one_down(struct nvme_fc_ctrl *ctrl)
+{
+ if (atomic_dec_return(&ctrl->iocnt) == 0)
+ wake_up(&ctrl->ioabort_wait);
+}
+
+static inline bool
__nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
struct nvme_fc_fcp_op *op, int opstate)
{
unsigned long flags;
+ bool ret = false;
if (opstate == FCPOP_STATE_ABORTED) {
spin_lock_irqsave(&ctrl->lock, flags);
if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
op->flags & FCOP_FLAGS_TERMIO) {
- if (!--ctrl->iocnt)
- wake_up(&ctrl->ioabort_wait);
+ ret = true;
}
spin_unlock_irqrestore(&ctrl->lock, flags);
}
+
+ return ret;
}
static void nvme_fc_fencing_work(struct work_struct *work)
@@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
struct nvme_command *sqe = &op->cmd_iu.sqe;
__le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
union nvme_result result;
- bool terminate_assoc = true;
+ bool op_term, terminate_assoc = true;
+ enum nvme_ctrl_state state;
int opstate;
/*
@@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
done:
if (op->flags & FCOP_FLAGS_AEN) {
nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
+ __nvme_fc_fcpop_count_one_down(ctrl);
atomic_set(&op->state, FCPOP_STATE_IDLE);
op->flags = FCOP_FLAGS_AEN; /* clear other flags */
nvme_fc_ctrl_put(ctrl);
goto check_error;
}
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ /*
+ * We can not access op after the request is completed because it can
+ * be reused immediately. At the same time we want to wakeup the thread
+ * waiting for ongoing IOs _after_ requests are completed. This is
+ * necessary because that thread will start canceling inflight IOs
+ * and we want to avoid request completion racing with cancellation.
+ */
+ op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+
+ /*
+ * If we are going to terminate associations and the controller is
+ * LIVE or FENCING, then do not complete this request now. Let error
+ * recovery cancel this request when it is safe to do so.
+ */
+ state = nvme_ctrl_state(&ctrl->ctrl);
+ if (terminate_assoc &&
+ (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
+ goto check_op_term;
+
if (!nvme_try_complete_req(rq, status, result))
nvme_fc_complete_rq(rq);
+check_op_term:
+ if (op_term)
+ __nvme_fc_fcpop_count_one_down(ctrl);
check_error:
if (terminate_assoc)
@@ -2750,7 +2782,8 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
* cmd with the csn was supposed to arrive.
*/
opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
- __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
+ if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
+ __nvme_fc_fcpop_count_one_down(ctrl);
if (!(op->flags & FCOP_FLAGS_AEN)) {
nvme_fc_unmap_data(ctrl, op->rq, op);
@@ -3219,7 +3252,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
spin_lock_irqsave(&ctrl->lock, flags);
set_bit(FCCTRL_TERMIO, &ctrl->flags);
- ctrl->iocnt = 0;
+ atomic_set(&ctrl->iocnt, 0);
spin_unlock_irqrestore(&ctrl->lock, flags);
__nvme_fc_abort_outstanding_ios(ctrl, false);
@@ -3228,11 +3261,19 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_abort_aen_ops(ctrl);
/* wait for all io that had to be aborted */
+ wait_event(ctrl->ioabort_wait, atomic_read(&ctrl->iocnt) == 0);
spin_lock_irq(&ctrl->lock);
- wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
clear_bit(FCCTRL_TERMIO, &ctrl->flags);
spin_unlock_irq(&ctrl->lock);
+ /*
+ * At this point all inflight requests have been successfully
+ * aborted. Now it is safe to cancel all requests we decided
+ * not to complete in nvme_fc_fcpio_done().
+ */
+ nvme_cancel_tagset(&ctrl->ctrl);
+ nvme_cancel_admin_tagset(&ctrl->ctrl);
+
nvme_fc_term_aen_ops(ctrl);
/*
--
2.52.0
On 2/13/2026 8:25 PM, Mohamed Khalfella wrote:
> While in FENCING state, aborted inflight IOs should be held until fencing
> is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
> requests with transport errors. These held requests will be canceled in
> nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
> avoids racing with canceling aborted requests by making sure we complete
> successful requests before waking up the waiting thread.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/fc.c | 61 +++++++++++++++++++++++++++++++++++-------
> 1 file changed, 51 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
> index 6ebabfb7e76d..e605dd3f4a40 100644
> --- a/drivers/nvme/host/fc.c
> +++ b/drivers/nvme/host/fc.c
> @@ -172,7 +172,7 @@ struct nvme_fc_ctrl {
>
> struct kref ref;
> unsigned long flags;
> - u32 iocnt;
> + atomic_t iocnt;
> wait_queue_head_t ioabort_wait;
>
> struct nvme_fc_fcp_op aen_ops[NVME_NR_AEN_COMMANDS];
> @@ -1823,7 +1823,7 @@ __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op)
> atomic_set(&op->state, opstate);
> else if (test_bit(FCCTRL_TERMIO, &ctrl->flags)) {
> op->flags |= FCOP_FLAGS_TERMIO;
> - ctrl->iocnt++;
> + atomic_inc(&ctrl->iocnt);
The atomic change is probably what corrects the deadlocks you saw.
> }
> spin_unlock_irqrestore(&ctrl->lock, flags);
>
> @@ -1853,20 +1853,29 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl)
> }
>
> static inline void
> +__nvme_fc_fcpop_count_one_down(struct nvme_fc_ctrl *ctrl)
> +{
> + if (atomic_dec_return(&ctrl->iocnt) == 0)
> + wake_up(&ctrl->ioabort_wait);
> +}
> +
> +static inline bool
> __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl,
> struct nvme_fc_fcp_op *op, int opstate)
> {
> unsigned long flags;
> + bool ret = false;
>
> if (opstate == FCPOP_STATE_ABORTED) {
> spin_lock_irqsave(&ctrl->lock, flags);
> if (test_bit(FCCTRL_TERMIO, &ctrl->flags) &&
> op->flags & FCOP_FLAGS_TERMIO) {
> - if (!--ctrl->iocnt)
> - wake_up(&ctrl->ioabort_wait);
> + ret = true;
> }
> spin_unlock_irqrestore(&ctrl->lock, flags);
> }
> +
> + return ret;
> }
>
> static void nvme_fc_fencing_work(struct work_struct *work)
> @@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> struct nvme_command *sqe = &op->cmd_iu.sqe;
> __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
> union nvme_result result;
> - bool terminate_assoc = true;
> + bool op_term, terminate_assoc = true;
> + enum nvme_ctrl_state state;
> int opstate;
>
> /*
> @@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> done:
> if (op->flags & FCOP_FLAGS_AEN) {
> nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
> atomic_set(&op->state, FCPOP_STATE_IDLE);
> op->flags = FCOP_FLAGS_AEN; /* clear other flags */
> nvme_fc_ctrl_put(ctrl);
> goto check_error;
> }
>
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + /*
> + * We can not access op after the request is completed because it can
> + * be reused immediately. At the same time we want to wakeup the thread
> + * waiting for ongoing IOs _after_ requests are completed. This is
> + * necessary because that thread will start canceling inflight IOs
> + * and we want to avoid request completion racing with cancellation.
> + */
> + op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> +
> + /*
> + * If we are going to terminate associations and the controller is
> + * LIVE or FENCING, then do not complete this request now. Let error
> + * recovery cancel this request when it is safe to do so.
> + */
> + state = nvme_ctrl_state(&ctrl->ctrl);
> + if (terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
> + goto check_op_term;
> +
> if (!nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
> +check_op_term:
> + if (op_term)
> + __nvme_fc_fcpop_count_one_down(ctrl);
>
> check_error:
> if (terminate_assoc)
> @@ -2750,7 +2782,8 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
> * cmd with the csn was supposed to arrive.
> */
> opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
>
> if (!(op->flags & FCOP_FLAGS_AEN)) {
> nvme_fc_unmap_data(ctrl, op->rq, op);
> @@ -3219,7 +3252,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
>
> spin_lock_irqsave(&ctrl->lock, flags);
> set_bit(FCCTRL_TERMIO, &ctrl->flags);
> - ctrl->iocnt = 0;
> + atomic_set(&ctrl->iocnt, 0);
> spin_unlock_irqrestore(&ctrl->lock, flags);
>
> __nvme_fc_abort_outstanding_ios(ctrl, false);
> @@ -3228,11 +3261,19 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
> nvme_fc_abort_aen_ops(ctrl);
>
> /* wait for all io that had to be aborted */
> + wait_event(ctrl->ioabort_wait, atomic_read(&ctrl->iocnt) == 0);
> spin_lock_irq(&ctrl->lock);
> - wait_event_lock_irq(ctrl->ioabort_wait, ctrl->iocnt == 0, ctrl->lock);
> clear_bit(FCCTRL_TERMIO, &ctrl->flags);
> spin_unlock_irq(&ctrl->lock);
>
> + /*
> + * At this point all inflight requests have been successfully
> + * aborted. Now it is safe to cancel all requests we decided
> + * not to complete in nvme_fc_fcpio_done().
> + */
> + nvme_cancel_tagset(&ctrl->ctrl);
> + nvme_cancel_admin_tagset(&ctrl->ctrl);
> +
> nvme_fc_term_aen_ops(ctrl);
>
> /*
This looks good
Reviewed-by: James Smart <jsmart833426@gmail.com>
-- james
On Fri, Feb 13, 2026 at 8:28 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> While in FENCING state, aborted inflight IOs should be held until fencing
> is done. Update nvme_fc_fcpio_done() to not complete aborted requests or
> requests with transport errors. These held requests will be canceled in
> nvme_fc_delete_association() after fencing is done. nvme_fc_fcpio_done()
> avoids racing with canceling aborted requests by making sure we complete
> successful requests before waking up the waiting thread.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> static void nvme_fc_fencing_work(struct work_struct *work)
> @@ -1969,7 +1978,8 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> struct nvme_command *sqe = &op->cmd_iu.sqe;
> __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1);
> union nvme_result result;
> - bool terminate_assoc = true;
> + bool op_term, terminate_assoc = true;
> + enum nvme_ctrl_state state;
> int opstate;
>
> /*
> @@ -2102,16 +2112,38 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req)
> done:
> if (op->flags & FCOP_FLAGS_AEN) {
> nvme_complete_async_event(&queue->ctrl->ctrl, status, &result);
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate))
> + __nvme_fc_fcpop_count_one_down(ctrl);
> atomic_set(&op->state, FCPOP_STATE_IDLE);
> op->flags = FCOP_FLAGS_AEN; /* clear other flags */
> nvme_fc_ctrl_put(ctrl);
> goto check_error;
> }
>
> - __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> + /*
> + * We can not access op after the request is completed because it can
> + * be reused immediately. At the same time we want to wakeup the thread
> + * waiting for ongoing IOs _after_ requests are completed. This is
> + * necessary because that thread will start canceling inflight IOs
> + * and we want to avoid request completion racing with cancellation.
> + */
> + op_term = __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate);
> +
> + /*
> + * If we are going to terminate associations and the controller is
> + * LIVE or FENCING, then do not complete this request now. Let error
> + * recovery cancel this request when it is safe to do so.
> + */
> + state = nvme_ctrl_state(&ctrl->ctrl);
> + if (terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING))
> + goto check_op_term;
> +
> if (!nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
> +check_op_term:
> + if (op_term)
> + __nvme_fc_fcpop_count_one_down(ctrl);
Although it is a more complicated boolean expression, I think it is
easier to grok:
> + if (!(terminate_assoc &&
> + (state == NVME_CTRL_LIVE || state == NVME_CTRL_FENCING)) &&
> + !nvme_try_complete_req(rq, status, result))
> nvme_fc_complete_rq(rq);
resulting in one less goto.
Sincerely,
Randy Jennings
© 2016 - 2026 Red Hat, Inc.