[PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure

Mohamed Khalfella posted 21 patches 1 month, 2 weeks ago
There is a newer version of this series
[PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure
Posted by Mohamed Khalfella 1 month, 2 weeks ago
If CCR operations fail and CQT is supported, we must defer the retry of
inflight requests per TP4129. Update ctrl->fencing_work to schedule
ctrl->fenced_work, effectively extending the FENCING state. This delay
ensures that inflight requests are held until it is safe for them to be
retried.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 229cfdffd848..054e8a350d75 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -194,6 +194,7 @@ struct nvme_tcp_ctrl {
 	struct nvme_ctrl	ctrl;
 
 	struct work_struct	fencing_work;
+	struct delayed_work	fenced_work;
 	struct work_struct	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
@@ -2477,6 +2478,18 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
 	nvme_tcp_reconnect_or_remove(ctrl, ret);
 }
 
+static void nvme_tcp_fenced_work(struct work_struct *work)
+{
+	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+					 struct nvme_tcp_ctrl, fenced_work);
+	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+	dev_info(ctrl->device, "Time-based recovery finished\n");
+	nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+		queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
 static void nvme_tcp_fencing_work(struct work_struct *work)
 {
 	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
@@ -2485,23 +2498,40 @@ static void nvme_tcp_fencing_work(struct work_struct *work)
 	unsigned long rem;
 
 	rem = nvme_fence_ctrl(ctrl);
-	if (rem) {
+	if (!rem)
+		goto done;
+
+	if (!ctrl->cqt) {
 		dev_info(ctrl->device,
-			 "CCR failed, skipping time-based recovery\n");
+			 "CCR failed, CQT not supported, skip time-based recovery\n");
+		goto done;
 	}
 
+	dev_info(ctrl->device,
+		 "CCR failed, switch to time-based recovery, timeout = %ums\n",
+		 jiffies_to_msecs(rem));
+	queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
+	return;
+
+done:
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
 	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
 		queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
 }
 
+static void nvme_tcp_flush_fencing_works(struct nvme_ctrl *ctrl)
+{
+	flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+	flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
+}
+
 static void nvme_tcp_error_recovery_work(struct work_struct *work)
 {
 	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
 				struct nvme_tcp_ctrl, err_work);
 	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
 
-	flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+	nvme_tcp_flush_fencing_works(ctrl);
 	if (nvme_tcp_key_revoke_needed(ctrl))
 		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_keep_alive(ctrl);
@@ -2544,7 +2574,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
 		container_of(work, struct nvme_ctrl, reset_work);
 	int ret;
 
-	flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+	nvme_tcp_flush_fencing_works(ctrl);
 	if (nvme_tcp_key_revoke_needed(ctrl))
 		nvme_auth_revoke_tls_key(ctrl);
 	nvme_stop_ctrl(ctrl);
@@ -2934,6 +2964,7 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
 	INIT_DELAYED_WORK(&ctrl->connect_work,
 			nvme_tcp_reconnect_ctrl_work);
 	INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
+	INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
 	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
 	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
 
-- 
2.52.0
Re: [PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure
Posted by Hannes Reinecke 1 month, 2 weeks ago
On 2/14/26 05:25, Mohamed Khalfella wrote:
> If CCR operations fail and CQT is supported, we must defer the retry of
> inflight requests per TP4129. Update ctrl->fencing_work to schedule
> ctrl->fenced_work, effectively extending the FENCING state. This delay
> ensures that inflight requests are held until it is safe for them to be
> retried.
> 
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
>   drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++----
>   1 file changed, 35 insertions(+), 4 deletions(-)
> 
Can't you merge / integrate this into the nvme_fence_ctrl() routine?
The previous patch already extended the timeout to cover for CQT, so
we can just wait for the timeout if CCR failed, no?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure
Posted by Mohamed Khalfella 1 month, 2 weeks ago
On Mon 2026-02-16 13:56:10 +0100, Hannes Reinecke wrote:
> On 2/14/26 05:25, Mohamed Khalfella wrote:
> > If CCR operations fail and CQT is supported, we must defer the retry of
> > inflight requests per TP4129. Update ctrl->fencing_work to schedule
> > ctrl->fenced_work, effectively extending the FENCING state. This delay
> > ensures that inflight requests are held until it is safe for them to be
> > retried.
> > 
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> >   drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++----
> >   1 file changed, 35 insertions(+), 4 deletions(-)
> > 
> Can't you merge / integrate this into the nvme_fence_ctrl() routine?

ctrl->fencing_work and ctrl->fenced_work are in transport specific
controller, struct nvme_tcp_ctrl in this case. There is no easy way to
access these members from nvme_fence_ctrl(). One option to go around
that is to move them into struct nvme_ctrl. But we call error recovery
after a controller is fenced, and error recovery is implemented in
transport specific way. That is why the delay is implemented/repeated
for every transport.

> The previous patch already extended the timeout to cover for CQT, so
> we can just wait for the timeout if CCR failed, no?

Following on the point above. One change can be done is to reset the
controller after fencing finishes instead of using error recovery.
This way everything lives in core.c. But I have not tested that.

Do you think this is better than what has been implemented now?
Re: [PATCH v3 19/21] nvme-tcp: Extend FENCING state per TP4129 on CCR failure
Posted by Hannes Reinecke 1 month, 1 week ago
On 2/17/26 18:58, Mohamed Khalfella wrote:
> On Mon 2026-02-16 13:56:10 +0100, Hannes Reinecke wrote:
>> On 2/14/26 05:25, Mohamed Khalfella wrote:
>>> If CCR operations fail and CQT is supported, we must defer the retry of
>>> inflight requests per TP4129. Update ctrl->fencing_work to schedule
>>> ctrl->fenced_work, effectively extending the FENCING state. This delay
>>> ensures that inflight requests are held until it is safe for them to be
>>> retried.
>>>
>>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
>>> ---
>>>    drivers/nvme/host/tcp.c | 39 +++++++++++++++++++++++++++++++++++----
>>>    1 file changed, 35 insertions(+), 4 deletions(-)
>>>
>> Can't you merge / integrate this into the nvme_fence_ctrl() routine?
> 
> ctrl->fencing_work and ctrl->fenced_work are in transport specific
> controller, struct nvme_tcp_ctrl in this case. There is no easy way to
> access these members from nvme_fence_ctrl(). One option to go around
> that is to move them into struct nvme_ctrl. But we call error recovery
> after a controller is fenced, and error recovery is implemented in
> transport specific way. That is why the delay is implemented/repeated
> for every transport.
> 
>> The previous patch already extended the timeout to cover for CQT, so
>> we can just wait for the timeout if CCR failed, no?
> 
> Following on the point above. One change can be done is to reset the
> controller after fencing finishes instead of using error recovery.
> This way everything lives in core.c. But I have not tested that.
> 
> Do you think this is better than what has been implemented now?
> 
Yeah, the eternal problem.
At one point someone will have to explain to me why 'reset' and
'error handling' are two _distinct_ code paths in nvme-tcp.
I really don't get that. I _guess_ it's trying to hold requests
when doing a reset, and aborting requests if it's an error.
But why one needs to make that distinction is a mystery to
me; FC combines both paths and seems to work quite happily.

Thing is, that will get in the way when trying to move fencing
into the generic layer; you only can call 'nvme_reset_ctrl()',
and hope that this one will abort commands.

I'll check.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich