A live nvme controller that hits an error now moves to the FENCING
state instead of the RESETTING state. ctrl->fencing_work attempts CCR
to terminate inflight IOs. If CCR succeeds, switch to FENCED ->
RESETTING and continue error recovery as usual. If CCR fails, the
behavior depends on whether the subsystem supports CQT. If CQT is not
supported, reset the controller immediately as if CCR had succeeded,
in order to maintain the current behavior. If CQT is supported, switch
to time-based recovery and schedule ctrl->fenced_work to reset the
controller when time-based recovery finishes.
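
In short, the intended state flow is:

  LIVE --error--> FENCING --CCR ok, or CCR fails w/o CQT--> FENCED -> RESETTING
  LIVE --error--> FENCING --CCR fails, CQT supported--> (wait) -> FENCED -> RESETTING
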
Either ctrl->err_work or ctrl->reset_work can run after a controller
is fenced. Flush the fencing work when either of them runs.
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 61 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 69cb04406b47..af8d3b36a4bb 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -193,6 +193,8 @@ struct nvme_tcp_ctrl {
struct sockaddr_storage src_addr;
struct nvme_ctrl ctrl;
+ struct work_struct fencing_work;
+ struct delayed_work fenced_work;
struct work_struct err_work;
struct delayed_work connect_work;
struct nvme_tcp_request async_req;
@@ -611,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
{
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
+ dev_warn(ctrl->device, "starting controller fencing\n");
+ queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
+ return;
+ }
+
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
@@ -2470,12 +2478,59 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
nvme_tcp_reconnect_or_remove(ctrl, ret);
}
+static void nvme_tcp_fenced_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
+ struct nvme_tcp_ctrl, fenced_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
+static void nvme_tcp_fencing_work(struct work_struct *work)
+{
+ struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
+ struct nvme_tcp_ctrl, fencing_work);
+ struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ unsigned long rem;
+
+ rem = nvme_fence_ctrl(ctrl);
+ if (!rem)
+ goto done;
+
+ if (!ctrl->cqt) {
+ dev_info(ctrl->device,
+ "CCR failed, CQT not supported, skip time-based recovery\n");
+ goto done;
+ }
+
+ dev_info(ctrl->device,
+ "CCR failed, switch to time-based recovery, timeout = %ums\n",
+ jiffies_to_msecs(rem));
+ queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
+ return;
+
+done:
+ nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
+ if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
+ queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
+}
+
+static void nvme_tcp_flush_fencing_work(struct nvme_ctrl *ctrl)
+{
+ flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
+ flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
+}
+
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ nvme_tcp_flush_fencing_work(ctrl);
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_keep_alive(ctrl);
@@ -2518,6 +2573,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
container_of(work, struct nvme_ctrl, reset_work);
int ret;
+ nvme_tcp_flush_fencing_work(ctrl);
if (nvme_tcp_key_revoke_needed(ctrl))
nvme_auth_revoke_tls_key(ctrl);
nvme_stop_ctrl(ctrl);
@@ -2643,13 +2699,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
struct nvme_command *cmd = &pdu->cmd;
int qid = nvme_tcp_queue_id(req->queue);
+ enum nvme_ctrl_state state;
dev_warn(ctrl->device,
"I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
nvme_fabrics_opcode_str(qid, cmd), qid);
- if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
+ state = nvme_ctrl_state(ctrl);
+ if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
/*
* If we are resetting, connecting or deleting we should
* complete immediately because we may block controller
@@ -2904,6 +2962,8 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
INIT_DELAYED_WORK(&ctrl->connect_work,
nvme_tcp_reconnect_ctrl_work);
+ INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
+ INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
--
2.52.0
On 1/30/26 23:34, Mohamed Khalfella wrote:
> A live nvme controller that hits an error now moves to the FENCING
> state instead of the RESETTING state. ctrl->fencing_work attempts CCR
> to terminate inflight IOs. If CCR succeeds, switch to FENCED ->
> RESETTING and continue error recovery as usual. If CCR fails, the
> behavior depends on whether the subsystem supports CQT. If CQT is not
> supported, reset the controller immediately as if CCR had succeeded,
> in order to maintain the current behavior. If CQT is supported, switch
> to time-based recovery and schedule ctrl->fenced_work to reset the
> controller when time-based recovery finishes.
>
> Either ctrl->err_work or ctrl->reset_work can run after a controller
> is fenced. Flush the fencing work when either of them runs.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 61 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 69cb04406b47..af8d3b36a4bb 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -193,6 +193,8 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> + struct work_struct fencing_work;
> + struct delayed_work fenced_work;
> struct work_struct err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> @@ -611,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
>
> static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> {
> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
> + dev_warn(ctrl->device, "starting controller fencing\n");
> + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
> + return;
> + }
> +
Don't you need to flush any outstanding 'fenced_work' queue items here
before calling 'queue_work()'?
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> return;
>
> @@ -2470,12 +2478,59 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> nvme_tcp_reconnect_or_remove(ctrl, ret);
> }
>
> +static void nvme_tcp_fenced_work(struct work_struct *work)
> +{
> + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
> + struct nvme_tcp_ctrl, fenced_work);
> + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> +
> + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
> +}
> +
> +static void nvme_tcp_fencing_work(struct work_struct *work)
> +{
> + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> + struct nvme_tcp_ctrl, fencing_work);
> + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> + unsigned long rem;
> +
> + rem = nvme_fence_ctrl(ctrl);
> + if (!rem)
> + goto done;
> +
> + if (!ctrl->cqt) {
> + dev_info(ctrl->device,
> + "CCR failed, CQT not supported, skip time-based recovery\n");
> + goto done;
> + }
> +
As mentioned, cqt handling should be part of another patchset.
> + dev_info(ctrl->device,
> + "CCR failed, switch to time-based recovery, timeout = %ums\n",
> + jiffies_to_msecs(rem));
> + queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
> + return;
> +
Why do you need the 'fenced' work at all? All it does is queue yet
another work item, which certainly can be done from the 'fencing'
work directly, no?
> +done:
> + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
> +}
> +
> +static void nvme_tcp_flush_fencing_work(struct nvme_ctrl *ctrl)
> +{
> + flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
> + flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
> +}
> +
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> {
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> struct nvme_tcp_ctrl, err_work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> + nvme_tcp_flush_fencing_work(ctrl);
Why not 'fenced_work'?
> if (nvme_tcp_key_revoke_needed(ctrl))
> nvme_auth_revoke_tls_key(ctrl);
> nvme_stop_keep_alive(ctrl);
> @@ -2518,6 +2573,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
> container_of(work, struct nvme_ctrl, reset_work);
> int ret;
>
> + nvme_tcp_flush_fencing_work(ctrl);
Same.
> if (nvme_tcp_key_revoke_needed(ctrl))
> nvme_auth_revoke_tls_key(ctrl);
> nvme_stop_ctrl(ctrl);
> @@ -2643,13 +2699,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
> struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> struct nvme_command *cmd = &pdu->cmd;
> int qid = nvme_tcp_queue_id(req->queue);
> + enum nvme_ctrl_state state;
>
> dev_warn(ctrl->device,
> "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
> rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
> nvme_fabrics_opcode_str(qid, cmd), qid);
>
> - if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
> + state = nvme_ctrl_state(ctrl);
> + if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
'FENCED' too, presumably?
> /*
> * If we are resetting, connecting or deleting we should
> * complete immediately because we may block controller
> @@ -2904,6 +2962,8 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->connect_work,
> nvme_tcp_reconnect_ctrl_work);
> + INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
> + INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
> INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>
Here you are calling CCR whenever error recovery is triggered.
This will cause CCR to be sent from a command timeout, which is
technically wrong (CCR should be sent when the KATO timeout expires,
not when a command timeout expires). Both could be vastly different.
So I'd prefer to have CCR sent whenever the KATO timeout triggers, and
leave the current command timeout mechanism in place.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Tue 2026-02-03 06:34:51 +0100, Hannes Reinecke wrote:
> On 1/30/26 23:34, Mohamed Khalfella wrote:
> > A live nvme controller that hits an error now moves to the FENCING
> > state instead of the RESETTING state. ctrl->fencing_work attempts CCR
> > to terminate inflight IOs. If CCR succeeds, switch to FENCED ->
> > RESETTING and continue error recovery as usual. If CCR fails, the
> > behavior depends on whether the subsystem supports CQT. If CQT is not
> > supported, reset the controller immediately as if CCR had succeeded,
> > in order to maintain the current behavior. If CQT is supported, switch
> > to time-based recovery and schedule ctrl->fenced_work to reset the
> > controller when time-based recovery finishes.
> >
> > Either ctrl->err_work or ctrl->reset_work can run after a controller
> > is fenced. Flush the fencing work when either of them runs.
> >
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> > drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++-
> > 1 file changed, 61 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > index 69cb04406b47..af8d3b36a4bb 100644
> > --- a/drivers/nvme/host/tcp.c
> > +++ b/drivers/nvme/host/tcp.c
> > @@ -193,6 +193,8 @@ struct nvme_tcp_ctrl {
> > struct sockaddr_storage src_addr;
> > struct nvme_ctrl ctrl;
> >
> > + struct work_struct fencing_work;
> > + struct delayed_work fenced_work;
> > struct work_struct err_work;
> > struct delayed_work connect_work;
> > struct nvme_tcp_request async_req;
> > @@ -611,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> >
> > static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> > {
> > + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
> > + dev_warn(ctrl->device, "starting controller fencing\n");
> > + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
> > + return;
> > + }
> > +
>
> Don't you need to flush any outstanding 'fenced_work' queue items here
> before calling 'queue_work()'?
I do not think we need to flush ctrl->fencing_work. It cannot be running
at this time.
>
> > if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> > return;
> >
> > @@ -2470,12 +2478,59 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> > nvme_tcp_reconnect_or_remove(ctrl, ret);
> > }
> >
> > +static void nvme_tcp_fenced_work(struct work_struct *work)
> > +{
> > + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
> > + struct nvme_tcp_ctrl, fenced_work);
> > + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> > +
> > + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
> > + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> > + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
> > +}
> > +
> > +static void nvme_tcp_fencing_work(struct work_struct *work)
> > +{
> > + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> > + struct nvme_tcp_ctrl, fencing_work);
> > + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> > + unsigned long rem;
> > +
> > + rem = nvme_fence_ctrl(ctrl);
> > + if (!rem)
> > + goto done;
> > +
> > + if (!ctrl->cqt) {
> > + dev_info(ctrl->device,
> > + "CCR failed, CQT not supported, skip time-based recovery\n");
> > + goto done;
> > + }
> > +
>
> As mentioned, cqt handling should be part of another patchset.
Let us suppose we drop cqt from this patchset:
- How will we be able to calculate the CCR time budget?
  Currently it is calculated by nvme_fence_timeout_ms().
- What should we do if CCR fails? Retry requests immediately?
> > + dev_info(ctrl->device,
> > + "CCR failed, switch to time-based recovery, timeout = %ums\n",
> > + jiffies_to_msecs(rem));
> > + queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
> > + return;
> > +
>
> Why do you need the 'fenced' work at all? All it does is queue yet
> another work item, which certainly can be done from the 'fencing'
> work directly, no?
It is possible to drop ctrl->fenced_work and requeue ctrl->fencing_work
as delayed work to implement the request hold time. If we do that then
we need to modify nvme_tcp_fencing_work() to tell whether it is being
called for 'fencing' or 'fenced'. The first version of this patch used
a controller flag, RECOVERED, for that, and it was suggested to use a
separate work to simplify the logic and drop the controller flag.
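
(For illustration only, a rough sketch of that single-work variant. The
RECOVERED flag name and its home in tcp_ctrl->flags are hypothetical,
and fencing_work would have to become a delayed_work:)

static void nvme_tcp_fencing_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
			struct nvme_tcp_ctrl, fencing_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
	unsigned long rem;

	/* First pass does CCR; a second pass means the hold time elapsed. */
	if (!test_and_clear_bit(NVME_TCP_RECOVERED, &tcp_ctrl->flags)) {
		rem = nvme_fence_ctrl(ctrl);
		if (rem && ctrl->cqt) {
			/* CCR failed; requeue ourselves for the 'fenced' step. */
			set_bit(NVME_TCP_RECOVERED, &tcp_ctrl->flags);
			queue_delayed_work(nvme_wq, &tcp_ctrl->fencing_work,
					rem);
			return;
		}
	}

	nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
	if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
		queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
}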
>
> > +done:
> > + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
> > + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> > + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
> > +}
> > +
> > +static void nvme_tcp_flush_fencing_work(struct nvme_ctrl *ctrl)
> > +{
> > + flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
> > + flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
> > +}
> > +
> > static void nvme_tcp_error_recovery_work(struct work_struct *work)
> > {
> > struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> > struct nvme_tcp_ctrl, err_work);
> > struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> >
> > + nvme_tcp_flush_fencing_work(ctrl);
>
> Why not 'fenced_work'?
You mean rename nvme_tcp_flush_fencing_work() to
nvme_tcp_flush_fenced_work()?
If yes, then I can do that if you think it makes more sense.
>
> > if (nvme_tcp_key_revoke_needed(ctrl))
> > nvme_auth_revoke_tls_key(ctrl);
> > nvme_stop_keep_alive(ctrl);
> > @@ -2518,6 +2573,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
> > container_of(work, struct nvme_ctrl, reset_work);
> > int ret;
> >
> > + nvme_tcp_flush_fencing_work(ctrl);
>
> Same.
>
> > if (nvme_tcp_key_revoke_needed(ctrl))
> > nvme_auth_revoke_tls_key(ctrl);
> > nvme_stop_ctrl(ctrl);
> > @@ -2643,13 +2699,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
> > struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
> > struct nvme_command *cmd = &pdu->cmd;
> > int qid = nvme_tcp_queue_id(req->queue);
> > + enum nvme_ctrl_state state;
> >
> > dev_warn(ctrl->device,
> > "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
> > rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
> > nvme_fabrics_opcode_str(qid, cmd), qid);
> >
> > - if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
> > + state = nvme_ctrl_state(ctrl);
> > + if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
>
> 'FENCED' too, presumably?
I do not think it makes a difference here. FENCED and RESETTING are
almost the same states.
>
> > /*
> > * If we are resetting, connecting or deleting we should
> > * complete immediately because we may block controller
> > @@ -2904,6 +2962,8 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
> >
> > INIT_DELAYED_WORK(&ctrl->connect_work,
> > nvme_tcp_reconnect_ctrl_work);
> > + INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
> > + INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
> > INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> > INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
> >
>
> Here you are calling CCR whenever error recovery is triggered.
> This will cause CCR to be sent from a command timeout, which is
> technically wrong (CCR should be sent when the KATO timeout expires,
> not when a command timeout expires). Both could be vastly different.
KATO is driven by the host. What does KATO expiry mean?
I think KATO expiry is more applicable to the target, no?
A KATO timeout is a signal that the target is not reachable or that
something is wrong with the target?
>
> So I'd prefer to have CCR sent whenever the KATO timeout triggers, and
> leave the current command timeout mechanism in place.
Assuming we use CCR only when a KATO request times out, what should we
do when we hit other errors?
nvme_tcp_error_recovery() is called from many places to handle errors,
and it effectively resets the controller. What should this function do
if not trigger CCR?
>
> Cheers,
>
> Hannes
> --
> Dr. Hannes Reinecke Kernel Storage Architect
> hare@suse.de +49 911 74053 688
> SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
> HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On 2/3/26 22:24, Mohamed Khalfella wrote:
> On Tue 2026-02-03 06:34:51 +0100, Hannes Reinecke wrote:
>> On 1/30/26 23:34, Mohamed Khalfella wrote:
>>> A live nvme controller that hits an error now moves to the FENCING
>>> state instead of the RESETTING state. ctrl->fencing_work attempts CCR
>>> to terminate inflight IOs. If CCR succeeds, switch to FENCED ->
>>> RESETTING and continue error recovery as usual. If CCR fails, the
>>> behavior depends on whether the subsystem supports CQT. If CQT is not
>>> supported, reset the controller immediately as if CCR had succeeded,
>>> in order to maintain the current behavior. If CQT is supported, switch
>>> to time-based recovery and schedule ctrl->fenced_work to reset the
>>> controller when time-based recovery finishes.
>>>
>>> Either ctrl->err_work or ctrl->reset_work can run after a controller
>>> is fenced. Flush the fencing work when either of them runs.
>>>
>>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
>>> ---
>>> drivers/nvme/host/tcp.c | 62 ++++++++++++++++++++++++++++++++++++++++-
>>> 1 file changed, 61 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>>> index 69cb04406b47..af8d3b36a4bb 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -193,6 +193,8 @@ struct nvme_tcp_ctrl {
>>> struct sockaddr_storage src_addr;
>>> struct nvme_ctrl ctrl;
>>>
>>> + struct work_struct fencing_work;
>>> + struct delayed_work fenced_work;
>>> struct work_struct err_work;
>>> struct delayed_work connect_work;
>>> struct nvme_tcp_request async_req;
>>> @@ -611,6 +613,12 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
>>>
>>> static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
>>> {
>>> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCING)) {
>>> + dev_warn(ctrl->device, "starting controller fencing\n");
>>> + queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->fencing_work);
>>> + return;
>>> + }
>>> +
>>
>> Don't you need to flush any outstanding 'fenced_work' queue items here
>> before calling 'queue_work()'?
>
> I do not think we need to flush ctrl->fencing_work. It cannot be running
> at this time.
>
Hmm. If you say so ... I'd rather make sure here.
These things have a habit of popping up unexpectedly.
>>
>>> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
>>> return;
>>>
>>> @@ -2470,12 +2478,59 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
>>> nvme_tcp_reconnect_or_remove(ctrl, ret);
>>> }
>>>
>>> +static void nvme_tcp_fenced_work(struct work_struct *work)
>>> +{
>>> + struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
>>> + struct nvme_tcp_ctrl, fenced_work);
>>> + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>>> +
>>> + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
>>> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
>>> + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
>>> +}
>>> +
>>> +static void nvme_tcp_fencing_work(struct work_struct *work)
>>> +{
>>> + struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
>>> + struct nvme_tcp_ctrl, fencing_work);
>>> + struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>>> + unsigned long rem;
>>> +
>>> + rem = nvme_fence_ctrl(ctrl);
>>> + if (!rem)
>>> + goto done;
>>> +
>>> + if (!ctrl->cqt) {
>>> + dev_info(ctrl->device,
>>> + "CCR failed, CQT not supported, skip time-based recovery\n");
>>> + goto done;
>>> + }
>>> +
>>
>> As mentioned, cqt handling should be part of another patchset.
>
> Let us suppose we drop cqt from this patchset:
>
> - How will we be able to calculate the CCR time budget?
>   Currently it is calculated by nvme_fence_timeout_ms().
>
The CCR time budget is calculated from the current KATO value.
CQT is the time a controller requires _after_ KATO expires
to clear out commands.
> - What should we do if CCR fails? Retry requests immediately?
>
No. If CCR fails we should fall back to error handling.
In our case that would mean we let the command timeout handler
initiate a controller reset.
_Eventually_ (i.e. after we have implemented CCR _and_ CQT) we would
wait for KATO (+ CQT) to expire and _then_ reset the controller.
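
(So, roughly, the total fence budget is KATO + CQT. For example, with
the default KATO of 5 seconds and a hypothetical CQT of 2 seconds, the
host would wait about 7 seconds before resetting the controller.)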
>>> + dev_info(ctrl->device,
>>> + "CCR failed, switch to time-based recovery, timeout = %ums\n",
>>> + jiffies_to_msecs(rem));
>>> + queue_delayed_work(nvme_wq, &tcp_ctrl->fenced_work, rem);
>>> + return;
>>> +
>>
>> Why do you need the 'fenced' work at all? All it does is queue yet
>> another work item, which certainly can be done from the 'fencing'
>> work directly, no?
>
> It is possible to drop ctrl->fenced_work and requeue ctrl->fencing_work
> as delayed work to implement the request hold time. If we do that then
> we need to modify nvme_tcp_fencing_work() to tell whether it is being
> called for 'fencing' or 'fenced'. The first version of this patch used
> a controller flag, RECOVERED, for that, and it was suggested to use a
> separate work to simplify the logic and drop the controller flag.
>
But that's just because you integrated CCR into the current error
recovery.
If you were to implement CCR so that it is invoked from
nvme_keep_alive_end_io() we would not need to touch that,
and we would need just one work item.
Let me see if I can draft up a patch.
>>
>>> +done:
>>> + nvme_change_ctrl_state(ctrl, NVME_CTRL_FENCED);
>>> + if (nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
>>> + queue_work(nvme_reset_wq, &tcp_ctrl->err_work);
>>> +}
>>> +
>>> +static void nvme_tcp_flush_fencing_work(struct nvme_ctrl *ctrl)
>>> +{
>>> + flush_work(&to_tcp_ctrl(ctrl)->fencing_work);
>>> + flush_delayed_work(&to_tcp_ctrl(ctrl)->fenced_work);
>>> +}
>>> +
>>> static void nvme_tcp_error_recovery_work(struct work_struct *work)
>>> {
>>> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
>>> struct nvme_tcp_ctrl, err_work);
>>> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>>>
>>> + nvme_tcp_flush_fencing_work(ctrl);
>>
>> Why not 'fenced_work'?
>
> You mean rename nvme_tcp_flush_fencing_work() to
> nvme_tcp_flush_fenced_work()?
>
> If yes, then I can do that if you think it makes more sense.
>
Thing is, you have two work items dealing with CCR.
You really need to make sure that _both_ have been flushed when this
function is called.
>>
>>> if (nvme_tcp_key_revoke_needed(ctrl))
>>> nvme_auth_revoke_tls_key(ctrl);
>>> nvme_stop_keep_alive(ctrl);
>>> @@ -2518,6 +2573,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
>>> container_of(work, struct nvme_ctrl, reset_work);
>>> int ret;
>>>
>>> + nvme_tcp_flush_fencing_work(ctrl);
>>
>> Same.
>>
>>> if (nvme_tcp_key_revoke_needed(ctrl))
>>> nvme_auth_revoke_tls_key(ctrl);
>>> nvme_stop_ctrl(ctrl);
>>> @@ -2643,13 +2699,15 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
>>> struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
>>> struct nvme_command *cmd = &pdu->cmd;
>>> int qid = nvme_tcp_queue_id(req->queue);
>>> + enum nvme_ctrl_state state;
>>>
>>> dev_warn(ctrl->device,
>>> "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
>>> rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
>>> nvme_fabrics_opcode_str(qid, cmd), qid);
>>>
>>> - if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
>>> + state = nvme_ctrl_state(ctrl);
>>> + if (state != NVME_CTRL_LIVE && state != NVME_CTRL_FENCING) {
>>
>> 'FENCED' too, presumably?
>
> I do not think it makes a difference here. FENCED and RESETTING are
> almost the same states.
>
Yeah, but in FENCED all commands will be aborted, and the same action
will be invoked when this if() clause is entered. So you need to avoid
entering the if() clause to avoid races.
>>
>>> /*
>>> * If we are resetting, connecting or deleting we should
>>> * complete immediately because we may block controller
>>> @@ -2904,6 +2962,8 @@ static struct nvme_tcp_ctrl *nvme_tcp_alloc_ctrl(struct device *dev,
>>>
>>> INIT_DELAYED_WORK(&ctrl->connect_work,
>>> nvme_tcp_reconnect_ctrl_work);
>>> + INIT_DELAYED_WORK(&ctrl->fenced_work, nvme_tcp_fenced_work);
>>> + INIT_WORK(&ctrl->fencing_work, nvme_tcp_fencing_work);
>>> INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
>>> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>>>
>>
>> Here you are calling CCR whenever error recovery is triggered.
>> This will cause CCR to be sent from a command timeout, which is
>> technically wrong (CCR should be sent when the KATO timeout expires,
>> not when a command timeout expires). Both could be vastly different.
>
> KATO is driven by the host. What does KATO expiry mean?
> I think KATO expiry is more applicable to the target, no?
>
KATO expiry (for this implementation) means that nvme_keep_alive_end_io()
is called with rtt > deadline.
> A KATO timeout is a signal that the target is not reachable or that
> something is wrong with the target?
>
Yes. It explicitly means that the Keep-Alive command was not completed
within the time period specified by the Keep-Alive timeout value.
>>
>> So I'd prefer to have CCR sent whenever the KATO timeout triggers, and
>> leave the current command timeout mechanism in place.
>
> Assuming we use CCR only when a KATO request times out, what should we
> do when we hit other errors?
>
Leave it. This patchset is _NOT_ about fixing the error handler.
This patchset is about implementing CCR.
And CCR is just a step toward fixing the error handler.
> nvme_tcp_error_recovery() is called from many places to handle errors,
> and it effectively resets the controller. What should this function do
> if not trigger CCR?
>
The same things as before. CCR needs to be invoked when the Keep-Alive
timeout period expires. And that is what we should be implementing here.
It _might_ (and arguably should) be invoked when error handling needs
to be done. But with a later patchset.
Yes, this will result in a patchset where the error handler still has
gaps. But it will result in a patchset which focuses on a single thing,
with the added benefit that the workings are clearly outlined in the
spec. So the implementation is pretty easy to review.
Hooking CCR into the error handler itself is _not_ mandated by the spec,
and inevitably leads to larger discussions (as this thread nicely
demonstrates). So I would prefer to keep it simple first, and focus
on the technical implementation of CCR.
And concentrate on fixing up the error handler once CCR is in.
Cheers,
Hannes
>>
>> Cheers,
>>
>> Hannes
>> --
>> Dr. Hannes Reinecke Kernel Storage Architect
>> hare@suse.de +49 911 74053 688
>> SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
>> HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
On Tue, Feb 3, 2026 at 1:24 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> On Tue 2026-02-03 06:34:51 +0100, Hannes Reinecke wrote:
> > On 1/30/26 23:34, Mohamed Khalfella wrote:
> > > A live nvme controller that hits an error now moves to the FENCING
> > > state instead of the RESETTING state. ctrl->fencing_work attempts
> > > CCR to terminate inflight IOs. If CCR succeeds, switch to FENCED ->
> > > RESETTING and continue error recovery as usual. If CCR fails, the
> > > behavior depends on whether the subsystem supports CQT. If CQT is
> > > not supported, reset the controller immediately as if CCR had
> > > succeeded, in order to maintain the current behavior. If CQT is
> > > supported, switch to time-based recovery and schedule
> > > ctrl->fenced_work to reset the controller when time-based recovery
> > > finishes.
> > >
> > > Either ctrl->err_work or ctrl->reset_work can run after a
> > > controller is fenced. Flush the fencing work when either of them
> > > runs.
> > >
> > > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
...
> > Here you are calling CCR whenever error recovery is triggered.
> > This will cause CCR to be sent from a command timeout, which is
> > technically wrong (CCR should be sent when the KATO timeout expires,
> > not when a command timeout expires). Both could be vastly different.
> > So I'd prefer to have CCR sent whenever the KATO timeout triggers,
> > and leave the current command timeout mechanism in place.

Hannes,

It is incorrect that CCR should be sent when the KATO timeout expires,
not when a command timeout expires. KATO timeout expiry is what happens
on the controller, not the host. The controller behavior is specified
because the host has to know what the controller will do. The host is
free to decide that a connection/controller association should be
abandoned whenever the host wants to. It can be either a timeout on a
keep-alive (which is not KATO expiring) or any other command.

But once the host has decided to abandon and tear down the
connection/controller association, it has to make sure no pending
requests are still outstanding on the controller side. And that is
either through CCR or through time-based recovery.

So, if, after a command timeout, the host decides to cancel/abort the
command, but not tear down the association, we should not trigger a
CCR. But, if we are tearing down the connection (and there are pending
commands), we should trigger CCR (and start time-based recovery).

Sincerely,
Randy Jennings