[RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error

Posted by Mohamed Khalfella 2 months, 2 weeks ago
An alive nvme controller that hits an error now will move to RECOVERING
state instead of RESETTING state. In RECOVERING state ctrl->err_work
will attempt to use cross-controller recovery to terminate inflight IOs
on the controller. If CCR succeeds, then switch to RESETTING state and
continue error recovery as usuall by tearing down controller and attempt
reconnecting to target. If CCR fails, then the behavior of recovery
depends on whether CQT is supported or not. If CQT is supported, switch
to time-based recovery by holding inflight IOs until it is safe for them
to be retried. If CQT is not supported proceed to retry requests
immediately, as the code currently does.

To support implementing time-based recovery turn ctrl->err_work into
delayed work. Update nvme_tcp_timeout() to not complete inflight IOs
while controller in RECOVERING state.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/host/tcp.c | 52 +++++++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9a96df1a511c..ec9a713490a9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -193,7 +193,7 @@ struct nvme_tcp_ctrl {
 	struct sockaddr_storage src_addr;
 	struct nvme_ctrl	ctrl;
 
-	struct work_struct	err_work;
+	struct delayed_work	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
 	u32			io_queues[HCTX_MAX_TYPES];
@@ -611,11 +611,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
 
 static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
 {
-	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RECOVERING) &&
+	    !nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 		return;
 
 	dev_warn(ctrl->device, "starting error recovery\n");
-	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
+	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, 0);
 }
 
 static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
@@ -2470,12 +2471,48 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
 	nvme_tcp_reconnect_or_remove(ctrl, ret);
 }
 
+static int nvme_tcp_recover_ctrl(struct nvme_ctrl *ctrl)
+{
+	unsigned long rem;
+
+	if (test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags)) {
+		dev_info(ctrl->device, "completed time-based recovery\n");
+		goto done;
+	}
+
+	rem = nvme_recover_ctrl(ctrl);
+	if (!rem)
+		goto done;
+
+	if (!ctrl->cqt) {
+		dev_info(ctrl->device,
+			 "CCR failed, CQT not supported, skip time-based recovery\n");
+		goto done;
+	}
+
+	dev_info(ctrl->device,
+		 "CCR failed, switch to time-based recovery, timeout = %ums\n",
+		 jiffies_to_msecs(rem));
+	set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
+	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
+	return -EAGAIN;
+
+done:
+	nvme_end_ctrl_recovery(ctrl);
+	return 0;
+}
+
 static void nvme_tcp_error_recovery_work(struct work_struct *work)
 {
-	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
+	if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
+		if (nvme_tcp_recover_ctrl(ctrl))
+			return;
+	}
+
 	if (nvme_tcp_key_revoke_needed(ctrl))
 		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_keep_alive(ctrl);
@@ -2545,7 +2582,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 
 static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
 {
-	flush_work(&to_tcp_ctrl(ctrl)->err_work);
+	flush_delayed_work(&to_tcp_ctrl(ctrl)->err_work);
 	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
 }
 
@@ -2640,6 +2677,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
 {
 	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
 	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
+	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
 	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
 	struct nvme_command *cmd = &pdu->cmd;
 	int qid = nvme_tcp_queue_id(req->queue);
@@ -2649,7 +2687,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
 		 rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
 		 nvme_fabrics_opcode_str(qid, cmd), qid);
 
-	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
+	if (state != NVME_CTRL_LIVE && state != NVME_CTRL_RECOVERING) {
 		/*
 		 * If we are resetting, connecting or deleting we should
 		 * complete immediately because we may block controller
@@ -2903,7 +2941,7 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 
 	INIT_DELAYED_WORK(&ctrl->connect_work,
 			nvme_tcp_reconnect_ctrl_work);
-	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
+	INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
 
 	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
-- 
2.51.2
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Sagi Grimberg 1 month, 1 week ago

On 26/11/2025 4:11, Mohamed Khalfella wrote:
> An alive nvme controller that hits an error now will move to RECOVERING
> state instead of RESETTING state. In RECOVERING state ctrl->err_work
> will attempt to use cross-controller recovery to terminate inflight IOs
> on the controller. If CCR succeeds, then switch to RESETTING state and
> continue error recovery as usuall by tearing down controller and attempt
> reconnecting to target. If CCR fails, then the behavior of recovery
> depends on whether CQT is supported or not. If CQT is supported, switch
> to time-based recovery by holding inflight IOs until it is safe for them
> to be retried. If CQT is not supported proceed to retry requests
> immediately, as the code currently does.
>
> To support implementing time-based recovery turn ctrl->err_work into
> delayed work. Update nvme_tcp_timeout() to not complete inflight IOs
> while controller in RECOVERING state.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
>   drivers/nvme/host/tcp.c | 52 +++++++++++++++++++++++++++++++++++------
>   1 file changed, 45 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 9a96df1a511c..ec9a713490a9 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -193,7 +193,7 @@ struct nvme_tcp_ctrl {
>   	struct sockaddr_storage src_addr;
>   	struct nvme_ctrl	ctrl;
>   
> -	struct work_struct	err_work;
> +	struct delayed_work	err_work;
>   	struct delayed_work	connect_work;
>   	struct nvme_tcp_request async_req;
>   	u32			io_queues[HCTX_MAX_TYPES];
> @@ -611,11 +611,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
>   
>   static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>   {
> -	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> +	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RECOVERING) &&
> +	    !nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))

This warrants an explanation. It is not clear at all why we should allow
two different transitions to allow error recovery to start...

>   		return;
>   
>   	dev_warn(ctrl->device, "starting error recovery\n");
> -	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> +	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, 0);
>   }
>   
>   static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> @@ -2470,12 +2471,48 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
>   	nvme_tcp_reconnect_or_remove(ctrl, ret);
>   }
>   
> +static int nvme_tcp_recover_ctrl(struct nvme_ctrl *ctrl)
> +{
> +	unsigned long rem;
> +
> +	if (test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags)) {
> +		dev_info(ctrl->device, "completed time-based recovery\n");
> +		goto done;
> +	}

This is also not clear, why should we get here when NVME_CTRL_RECOVERED 
is set?
> +
> +	rem = nvme_recover_ctrl(ctrl);
> +	if (!rem)
> +		goto done;
> +
> +	if (!ctrl->cqt) {
> +		dev_info(ctrl->device,
> +			 "CCR failed, CQT not supported, skip time-based recovery\n");
> +		goto done;
> +	}
> +
> +	dev_info(ctrl->device,
> +		 "CCR failed, switch to time-based recovery, timeout = %ums\n",
> +		 jiffies_to_msecs(rem));
> +	set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
> +	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
> +	return -EAGAIN;

I don't think that reusing the same work to handle two completely
different things is the right approach here.

How about splitting to fence_work and err_work? That should eliminate
some of the ctrl state inspections and simplify error recovery.

> +
> +done:
> +	nvme_end_ctrl_recovery(ctrl);
> +	return 0;
> +}
> +
>   static void nvme_tcp_error_recovery_work(struct work_struct *work)
>   {
> -	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> +	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
>   				struct nvme_tcp_ctrl, err_work);
>   	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>   
> +	if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
> +		if (nvme_tcp_recover_ctrl(ctrl))
> +			return;
> +	}
> +

Yea, I think we want to rework the current design.
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Mohamed Khalfella 1 month, 1 week ago
On Sat 2025-12-27 12:35:23 +0200, Sagi Grimberg wrote:
> 
> 
> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> > An alive nvme controller that hits an error now will move to RECOVERING
> > state instead of RESETTING state. In RECOVERING state ctrl->err_work
> > will attempt to use cross-controller recovery to terminate inflight IOs
> > on the controller. If CCR succeeds, then switch to RESETTING state and
> > continue error recovery as usuall by tearing down controller and attempt
> > reconnecting to target. If CCR fails, then the behavior of recovery
> > depends on whether CQT is supported or not. If CQT is supported, switch
> > to time-based recovery by holding inflight IOs until it is safe for them
> > to be retried. If CQT is not supported proceed to retry requests
> > immediately, as the code currently does.
> >
> > To support implementing time-based recovery turn ctrl->err_work into
> > delayed work. Update nvme_tcp_timeout() to not complete inflight IOs
> > while controller in RECOVERING state.
> >
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> >   drivers/nvme/host/tcp.c | 52 +++++++++++++++++++++++++++++++++++------
> >   1 file changed, 45 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 9a96df1a511c..ec9a713490a9 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -193,7 +193,7 @@ struct nvme_tcp_ctrl {
> >   	struct sockaddr_storage src_addr;
> >   	struct nvme_ctrl	ctrl;
> >   
> > -	struct work_struct	err_work;
> > +	struct delayed_work	err_work;
> >   	struct delayed_work	connect_work;
> >   	struct nvme_tcp_request async_req;
> >   	u32			io_queues[HCTX_MAX_TYPES];
> > @@ -611,11 +611,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> >   
> >   static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> >   {
> > -	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> > +	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RECOVERING) &&
> > +	    !nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> 
> This warrants an explanation. It is not clear at all why we should allow
> two different transitions to allow error recovery to start...

The behavior of ctrl->err_work depends on the controller state. We
go to RECOVERING only if the controller is LIVE. Otherwise, we attempt
to go to RESETTING.
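
For illustration, the gating comes from nvme_change_ctrl_state() as
extended earlier in this series; the new arm would look roughly like
this (sketch only, the earlier patch is authoritative):

	case NVME_CTRL_RECOVERING:
		switch (old_state) {
		case NVME_CTRL_LIVE:
			changed = true;
			break;
		default:
			break;
		}
		break;

So the RECOVERING transition can only succeed from LIVE, and falling
back to RESETTING keeps today's behavior for controllers that are
already connecting, resetting, or being deleted.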

> 
> >   		return;
> >   
> >   	dev_warn(ctrl->device, "starting error recovery\n");
> > -	queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> > +	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, 0);
> >   }
> >   
> >   static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> > @@ -2470,12 +2471,48 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> >   	nvme_tcp_reconnect_or_remove(ctrl, ret);
> >   }
> >   
> > +static int nvme_tcp_recover_ctrl(struct nvme_ctrl *ctrl)
> > +{
> > +	unsigned long rem;
> > +
> > +	if (test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags)) {
> > +		dev_info(ctrl->device, "completed time-based recovery\n");
> > +		goto done;
> > +	}
> 
> This is also not clear, why should we get here when NVME_CTRL_RECOVERED 
> is set?

The NVME_CTRL_RECOVERED flag is set before scheduling ctrl->err_work as
delayed work. This is how time-based recovery is implemented.
We get here when ctrl->err_work runs for the second time, and at this
point we know that it is safe to just reset the controller and cancel
inflight requests.
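
To spell out the two passes through nvme_tcp_recover_ctrl() in this
patch (this just restates the flow of the code above):

	/* pass 1: err_work runs while the controller is RECOVERING */
	rem = nvme_recover_ctrl(ctrl);		/* CCR failed, rem != 0 */
	set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);

	/* pass 2: err_work runs again rem jiffies later */
	test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags);	/* true */
	nvme_end_ctrl_recovery(ctrl);
	/* err_work falls through to the usual teardown/reconnect path */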

> > +
> > +	rem = nvme_recover_ctrl(ctrl);
> > +	if (!rem)
> > +		goto done;
> > +
> > +	if (!ctrl->cqt) {
> > +		dev_info(ctrl->device,
> > +			 "CCR failed, CQT not supported, skip time-based recovery\n");
> > +		goto done;
> > +	}
> > +
> > +	dev_info(ctrl->device,
> > +		 "CCR failed, switch to time-based recovery, timeout = %ums\n",
> > +		 jiffies_to_msecs(rem));
> > +	set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
> > +	queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
> > +	return -EAGAIN;
> 
> I don't think that reusing the same work to handle two completely
> different things is the right approach here.
> 
> How about splitting to fence_work and err_work? That should eliminate
> some of the ctrl state inspections and simplify error recovery.
> 
> > +
> > +done:
> > +	nvme_end_ctrl_recovery(ctrl);
> > +	return 0;
> > +}
> > +
> >   static void nvme_tcp_error_recovery_work(struct work_struct *work)
> >   {
> > -	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> > +	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
> >   				struct nvme_tcp_ctrl, err_work);
> >   	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> >   
> > +	if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
> > +		if (nvme_tcp_recover_ctrl(ctrl))
> > +			return;
> > +	}
> > +
> 
> Yea, I think we want to rework the current design.

Good point. Splitting out ctrl->fence_work simplifies things. The if
condition above will be moved to fence_work. However, we will still need
to reschedule ctrl->fence_work from within itself to implement
time-based recovery. Is this a good option?

If not, and we prefer to drop the NVME_CTRL_RECOVERED flag above and not
reschedule ctrl->fence_work from within itself, then we can add
another ctrl->fenced_work. How about that?
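
For the first option, the self-rescheduling handler would be roughly as
below (untested sketch; it assumes err_work goes back to being a plain
work_struct and keeps the flag from this patch to tell the two passes
apart):

	static void nvme_tcp_fence_ctrl_work(struct work_struct *work)
	{
		struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
					struct nvme_tcp_ctrl, fence_work);
		struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
		unsigned long rem;

		if (!test_and_clear_bit(NVME_CTRL_RECOVERED, &ctrl->flags)) {
			rem = nvme_recover_ctrl(ctrl);
			if (rem && ctrl->cqt) {
				/* CCR failed, wait out CQT before tearing down */
				set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
				queue_delayed_work(nvme_reset_wq,
						   &tcp_ctrl->fence_work, rem);
				return;
			}
		}

		nvme_end_ctrl_recovery(ctrl);
		/* fencing is done, hand over to the regular error recovery */
		queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
	}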
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Randy Jennings 1 month, 1 week ago
On Sat, Dec 27, 2025 at 2:35 AM Sagi Grimberg <sagi@grimberg.me> wrote:
> On 26/11/2025 4:11, Mohamed Khalfella wrote:
...
> > +     dev_info(ctrl->device,
> > +              "CCR failed, switch to time-based recovery, timeout = %ums\n",
> > +              jiffies_to_msecs(rem));
> > +     set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
> > +     queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
> > +     return -EAGAIN;
>
> I don't think that reusing the same work to handle two completely
> different things is the right approach here.
>
> How about splitting to fence_work and err_work? That should eliminate
> some of the ctrl state inspections and simplify error recovery.
If the work was independent and could happen separately (probably
in parallel), I could understand having separate work structures.  But they
are not independent, and they have a definite relationship.  Like Mohamed,
I thought of them as different stages of the same work.  Having an extra
work item takes up more space (I would be concerned about scalability to
thousands or 10s of thousands of associations and then go one order of
magnitude higher for margin), and it also causes a connection object
(referenced during IO) to take up more cache lines.  Is it worth taking up
that space, when the separate work items would be different, dependent
stages in the same process?

Sincerely,
Randy Jennings
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Sagi Grimberg 1 month ago

On 31/12/2025 2:13, Randy Jennings wrote:
> On Sat, Dec 27, 2025 at 2:35 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> ...
>>> +     dev_info(ctrl->device,
>>> +              "CCR failed, switch to time-based recovery, timeout = %ums\n",
>>> +              jiffies_to_msecs(rem));
>>> +     set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
>>> +     queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
>>> +     return -EAGAIN;
>> I don't think that reusing the same work to handle two completely
>> different things is the right approach here.
>>
>> How about splitting to fence_work and err_work? That should eliminate
>> some of the ctrl state inspections and simplify error recovery.
> If the work was independent and could happen separately (probably
> in parallel), I could understand having separate work structures.  But they
> are not independent, and they have a definite relationship.

The relationship that is defined here is that error recovery does not start
before fencing is completed.

>    Like Mohamed,
> I thought of them as different stages of the same work.  Having an extra
> work item takes up more space (I would be concerned about scalability to
> thousands or 10s of thousands of associations and then go one order of
> magnitude higher for margin), and it also causes a connection object
> (referenced during IO) to take up more cache lines.  Is it worth taking up
> that space, when the separate work items would be different, dependent
> stages in the same process?

Yes, IMO the added space of an additional work_struct is much better than
adding more state around a single work handler that is queued up multiple
times doing effectively different things.
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Randy Jennings 1 month, 3 weeks ago
On Tue, Nov 25, 2025 at 6:13 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> An alive nvme controller that hits an error now will move to RECOVERING
> state instead of RESETTING state. In RECOVERING state ctrl->err_work
> will attempt to use cross-controller recovery to terminate inflight IOs
> on the controller. If CCR succeeds, then switch to RESETTING state and
> continue error recovery as usuall by tearing down controller and attempt
> reconnecting to target. If CCR fails, then the behavior of recovery
"usuall" -> "usual"
"attempt reconnecting" -> "attempting to reconnect"

it would read better with "the" added:
"tearing down the controller"
"reconnect to the target"

> depends on whether CQT is supported or not. If CQT is supported, switch
> to time-based recovery by holding inflight IOs until it is safe for them
> to be retried. If CQT is not supported proceed to retry requests
> immediately, as the code currently does.

> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c

> +static int nvme_tcp_recover_ctrl(struct nvme_ctrl *ctrl)

> +       dev_info(ctrl->device,
> +                "CCR failed, switch to time-based recovery, timeout = %ums\n",
> +                jiffies_to_msecs(rem));
> +       set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
> +       queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
> +       return -EAGAIN;
I see how setting this bit before the delayed work executes works
to complete recovery, but it is kind of weird that the bit is called
RECOVERED.  I do not have a better name.  TIME_BASED_RECOVERY?
RECOVERY_WAIT?

>  static void nvme_tcp_error_recovery_work(struct work_struct *work)
>  {
> -       struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> +       struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
>                                 struct nvme_tcp_ctrl, err_work);
>         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> +       if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
> +               if (nvme_tcp_recover_ctrl(ctrl))
> +                       return;
> +       }
> +
>         if (nvme_tcp_key_revoke_needed(ctrl))
>                 nvme_auth_revoke_tls_key(ctrl);
>         nvme_stop_keep_alive(ctrl);
The state of the controller should not be LIVE while waiting for
recovery, so I do not think we will succeed in sending keep alives,
but I think this should move to before (or inside of)
nvme_tcp_recover_ctrl().
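
I.e. something like this at the top of the handler (sketch only):

	if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
		nvme_stop_keep_alive(ctrl);
		if (nvme_tcp_recover_ctrl(ctrl))
			return;
	}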

Sincerely,
Randy Jennings
Re: [RFC PATCH 10/14] nvme-tcp: Use CCR to recover controller that hits an error
Posted by Mohamed Khalfella 1 month, 1 week ago
On Thu 2025-12-18 18:06:02 -0800, Randy Jennings wrote:
> On Tue, Nov 25, 2025 at 6:13 PM Mohamed Khalfella
> <mkhalfella@purestorage.com> wrote:
> >
> > An alive nvme controller that hits an error now will move to RECOVERING
> > state instead of RESETTING state. In RECOVERING state ctrl->err_work
> > will attempt to use cross-controller recovery to terminate inflight IOs
> > on the controller. If CCR succeeds, then switch to RESETTING state and
> > continue error recovery as usuall by tearing down controller and attempt
> > reconnecting to target. If CCR fails, then the behavior of recovery
> "usuall" -> "usual"
> "attempt reconnecting" -> "attempting to reconnect"
> 
> it would read better with "the" added:
> "tearing down the controller"
> "reconnect to the target"

Updated as suggested.

> 
> > depends on whether CQT is supported or not. If CQT is supported, switch
> > to time-based recovery by holding inflight IOs until it is safe for them
> > to be retried. If CQT is not supported proceed to retry requests
> > immediately, as the code currently does.
> 
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> 
> > +static int nvme_tcp_recover_ctrl(struct nvme_ctrl *ctrl)
> 
> > +       dev_info(ctrl->device,
> > +                "CCR failed, switch to time-based recovery, timeout = %ums\n",
> > +                jiffies_to_msecs(rem));
> > +       set_bit(NVME_CTRL_RECOVERED, &ctrl->flags);
> > +       queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work, rem);
> > +       return -EAGAIN;
> I see how setting this bit before the delayed work executes works
> to complete recovery, but it is kind of weird that the bit is called
> RECOVERED.  I do not have a better name.  TIME_BASED_RECOVERY?
> RECOVERY_WAIT?

Agree. It does look weird. If we agree to add two states FENCING and
FENCED then the flag might not be needed.

> 
> >  static void nvme_tcp_error_recovery_work(struct work_struct *work)
> >  {
> > -       struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> > +       struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
> >                                 struct nvme_tcp_ctrl, err_work);
> >         struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> >
> > +       if (nvme_ctrl_state(ctrl) == NVME_CTRL_RECOVERING) {
> > +               if (nvme_tcp_recover_ctrl(ctrl))
> > +                       return;
> > +       }
> > +
> >         if (nvme_tcp_key_revoke_needed(ctrl))
> >                 nvme_auth_revoke_tls_key(ctrl);
> >         nvme_stop_keep_alive(ctrl);
> The state of the controller should not be LIVE while waiting for
> recovery, so I do not think we will succeed in sending keep alives,
> but I think this should move to before (or inside of)
> nvme_tcp_recover_ctrl().

This is correct; no keepalive traffic will be sent in RECOVERING state.
If we split the fencing work from the existing error recovery work then
this should be removed. I think we are going in that direction.

> 
> Sincerely,
> Randy Jennings