A host that has more than one path connecting to an NVMe subsystem
typically has an NVMe controller associated with every path. This is
mostly applicable to NVMe over Fabrics. If one path goes down, inflight
IOs on that path should not be retried immediately on another path
because this could lead to data corruption, as described in TP4129.
TP8028 defines a cross-controller reset mechanism that the host can use
to terminate IOs on the failed path via one of the remaining healthy
paths. Only after the IOs are terminated, or after enough time has
passed as defined by TP4129, should inflight IOs be retried on another
path. Implement the core cross-controller reset logic to be used by the
transports.
Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
drivers/nvme/host/constants.c | 1 +
drivers/nvme/host/core.c | 133 ++++++++++++++++++++++++++++++++++
drivers/nvme/host/nvme.h | 10 +++
3 files changed, 144 insertions(+)
diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
index dc90df9e13a2..f679efd5110e 100644
--- a/drivers/nvme/host/constants.c
+++ b/drivers/nvme/host/constants.c
@@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
[nvme_admin_virtual_mgmt] = "Virtual Management",
[nvme_admin_nvme_mi_send] = "NVMe Send MI",
[nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
+ [nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
[nvme_admin_dbbuf] = "Doorbell Buffer Config",
[nvme_admin_format_nvm] = "Format NVM",
[nvme_admin_security_send] = "Security Send",
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f5b84bc327d3..f38b70ca9cee 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
+static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
+ u32 min_cntlid)
+{
+ struct nvme_subsystem *subsys = ictrl->subsys;
+ struct nvme_ctrl *sctrl;
+ unsigned long flags;
+
+ mutex_lock(&nvme_subsystems_lock);
+ list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
+ if (sctrl->cntlid < min_cntlid)
+ continue;
+
+ if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
+ continue;
+
+ spin_lock_irqsave(&sctrl->lock, flags);
+ if (sctrl->state != NVME_CTRL_LIVE) {
+ spin_unlock_irqrestore(&sctrl->lock, flags);
+ atomic_inc(&sctrl->ccr_limit);
+ continue;
+ }
+
+ /*
+ * We got a good candidate source controller that is locked and
+ * LIVE. However, no guarantee sctrl will not be deleted after
+ * sctrl->lock is released. Get a ref of both sctrl and admin_q
+ * so they do not disappear until we are done with them.
+ */
+ WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
+ nvme_get_ctrl(sctrl);
+ spin_unlock_irqrestore(&sctrl->lock, flags);
+ goto found;
+ }
+ sctrl = NULL;
+found:
+ mutex_unlock(&nvme_subsystems_lock);
+ return sctrl;
+}
+
+static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
+{
+ unsigned long flags, tmo, remain;
+ struct nvme_ccr_entry ccr = { };
+ union nvme_result res = { 0 };
+ struct nvme_command c = { };
+ u32 result;
+ int ret = 0;
+
+ init_completion(&ccr.complete);
+ ccr.ictrl = ictrl;
+
+ spin_lock_irqsave(&sctrl->lock, flags);
+ list_add_tail(&ccr.list, &sctrl->ccrs);
+ spin_unlock_irqrestore(&sctrl->lock, flags);
+
+ c.ccr.opcode = nvme_admin_cross_ctrl_reset;
+ c.ccr.ciu = ictrl->ciu;
+ c.ccr.icid = cpu_to_le16(ictrl->cntlid);
+ c.ccr.cirn = cpu_to_le64(ictrl->cirn);
+ ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
+ NULL, 0, NVME_QID_ANY, 0);
+ if (ret)
+ goto out;
+
+ result = le32_to_cpu(res.u32);
+ if (result & 0x01) /* Immediate Reset */
+ goto out;
+
+ tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
+ remain = wait_for_completion_timeout(&ccr.complete, tmo);
+ if (!remain)
+ ret = -EAGAIN;
+out:
+ spin_lock_irqsave(&sctrl->lock, flags);
+ list_del(&ccr.list);
+ spin_unlock_irqrestore(&sctrl->lock, flags);
+ return ccr.ccrs == 1 ? 0 : ret;
+}
+
+unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
+{
+ unsigned long deadline, now, timeout;
+ struct nvme_ctrl *sctrl;
+ u32 min_cntlid = 0;
+ int ret;
+
+ timeout = nvme_recovery_timeout_ms(ictrl);
+ dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
+
+ now = jiffies;
+ deadline = now + msecs_to_jiffies(timeout);
+ while (time_before(now, deadline)) {
+ sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
+ if (!sctrl) {
+ /* CCR failed, switch to time-based recovery */
+ return deadline - now;
+ }
+
+ ret = nvme_issue_wait_ccr(sctrl, ictrl);
+ atomic_inc(&sctrl->ccr_limit);
+
+ if (!ret) {
+ dev_info(ictrl->device, "CCR succeeded using %s\n",
+ dev_name(sctrl->device));
+ blk_put_queue(sctrl->admin_q);
+ nvme_put_ctrl(sctrl);
+ return 0;
+ }
+
+ /* Try another controller */
+ min_cntlid = sctrl->cntlid + 1;
+ blk_put_queue(sctrl->admin_q);
+ nvme_put_ctrl(sctrl);
+ now = jiffies;
+ }
+
+ dev_info(ictrl->device, "CCR reached timeout, call it done\n");
+ return 0;
+}
+EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
+
+void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctrl->lock, flags);
+ WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);
+ wake_up_all(&ctrl->state_wq);
+ spin_unlock_irqrestore(&ctrl->lock, flags);
+}
+EXPORT_SYMBOL_GPL(nvme_end_ctrl_recovery);
+
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state)
{
@@ -5108,6 +5240,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
mutex_init(&ctrl->scan_lock);
INIT_LIST_HEAD(&ctrl->namespaces);
+ INIT_LIST_HEAD(&ctrl->ccrs);
xa_init(&ctrl->cels);
ctrl->dev = dev;
ctrl->ops = ops;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cde427353e0a..1f8937fce9a7 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -279,6 +279,13 @@ enum nvme_ctrl_flags {
NVME_CTRL_RECOVERED = 7,
};
+struct nvme_ccr_entry {
+ struct list_head list;
+ struct completion complete;
+ struct nvme_ctrl *ictrl;
+ u8 ccrs;
+};
+
struct nvme_ctrl {
bool comp_seen;
bool identified;
@@ -296,6 +303,7 @@ struct nvme_ctrl {
struct blk_mq_tag_set *tagset;
struct blk_mq_tag_set *admin_tagset;
struct list_head namespaces;
+ struct list_head ccrs;
struct mutex namespaces_lock;
struct srcu_struct srcu;
struct device ctrl_device;
@@ -805,6 +813,8 @@ blk_status_t nvme_host_path_error(struct request *req);
bool nvme_cancel_request(struct request *req, void *data);
void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
+unsigned long nvme_recover_ctrl(struct nvme_ctrl *ctrl);
+void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
--
2.51.2
On 26/11/2025 4:11, Mohamed Khalfella wrote:
> A host that has more than one path connecting to an nvme subsystem
> typically has an nvme controller associated with every path. This is
> mostly applicable to nvmeof. If one path goes down, inflight IOs on that
> path should not be retried immediately on another path because this
> could lead to data corruption as described in TP4129. TP8028 defines
> cross-controller reset mechanism that can be used by host to terminate
> IOs on the failed path using one of the remaining healthy paths. Only
> after IOs are terminated, or long enough time passes as defined by
> TP4129, inflight IOs should be retried on another path. Implement core
> cross-controller reset shared logic to be used by the transports.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
> drivers/nvme/host/constants.c | 1 +
> drivers/nvme/host/core.c | 133 ++++++++++++++++++++++++++++++++++
> drivers/nvme/host/nvme.h | 10 +++
> 3 files changed, 144 insertions(+)
>
> diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
> index dc90df9e13a2..f679efd5110e 100644
> --- a/drivers/nvme/host/constants.c
> +++ b/drivers/nvme/host/constants.c
> @@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
> [nvme_admin_virtual_mgmt] = "Virtual Management",
> [nvme_admin_nvme_mi_send] = "NVMe Send MI",
> [nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
> + [nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
> [nvme_admin_dbbuf] = "Doorbell Buffer Config",
> [nvme_admin_format_nvm] = "Format NVM",
> [nvme_admin_security_send] = "Security Send",
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index f5b84bc327d3..f38b70ca9cee 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
> }
> EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
>
> +static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
> + u32 min_cntlid)
> +{
> + struct nvme_subsystem *subsys = ictrl->subsys;
> + struct nvme_ctrl *sctrl;
> + unsigned long flags;
> +
> + mutex_lock(&nvme_subsystems_lock);
This looks like the wrong lock to take here?
> + list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> + if (sctrl->cntlid < min_cntlid)
> + continue;
The use of min_cntlid is not clear to me.
> +
> + if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
> + continue;
> +
> + spin_lock_irqsave(&sctrl->lock, flags);
> + if (sctrl->state != NVME_CTRL_LIVE) {
> + spin_unlock_irqrestore(&sctrl->lock, flags);
> + atomic_inc(&sctrl->ccr_limit);
> + continue;
> + }
> +
> + /*
> + * We got a good candidate source controller that is locked and
> + * LIVE. However, no guarantee sctrl will not be deleted after
> + * sctrl->lock is released. Get a ref of both sctrl and admin_q
> + * so they do not disappear until we are done with them.
> + */
> + WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
> + nvme_get_ctrl(sctrl);
> + spin_unlock_irqrestore(&sctrl->lock, flags);
> + goto found;
> + }
> + sctrl = NULL;
> +found:
> + mutex_unlock(&nvme_subsystems_lock);
> + return sctrl;
> +}
> +
> +static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
> +{
> + unsigned long flags, tmo, remain;
> + struct nvme_ccr_entry ccr = { };
> + union nvme_result res = { 0 };
> + struct nvme_command c = { };
> + u32 result;
> + int ret = 0;
> +
> + init_completion(&ccr.complete);
> + ccr.ictrl = ictrl;
> +
> + spin_lock_irqsave(&sctrl->lock, flags);
> + list_add_tail(&ccr.list, &sctrl->ccrs);
> + spin_unlock_irqrestore(&sctrl->lock, flags);
> +
> + c.ccr.opcode = nvme_admin_cross_ctrl_reset;
> + c.ccr.ciu = ictrl->ciu;
> + c.ccr.icid = cpu_to_le16(ictrl->cntlid);
> + c.ccr.cirn = cpu_to_le64(ictrl->cirn);
> + ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
> + NULL, 0, NVME_QID_ANY, 0);
> + if (ret)
> + goto out;
> +
> + result = le32_to_cpu(res.u32);
> + if (result & 0x01) /* Immediate Reset */
> + goto out;
> +
> + tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
> + remain = wait_for_completion_timeout(&ccr.complete, tmo);
> + if (!remain)
I think remain is redundant here.
> + ret = -EAGAIN;
> +out:
> + spin_lock_irqsave(&sctrl->lock, flags);
> + list_del(&ccr.list);
> + spin_unlock_irqrestore(&sctrl->lock, flags);
> + return ccr.ccrs == 1 ? 0 : ret;
Why would you still return 0 and not -EAGAIN? You expired on the timeout but
still return success if ccrs == 1? BTW, you have ccrs in the ccr struct and as
a list in the controller. Let's rename one of them to distinguish the two.
> +}
> +
> +unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
> +{
I'd call it nvme_fence_controller()
> + unsigned long deadline, now, timeout;
> + struct nvme_ctrl *sctrl;
> + u32 min_cntlid = 0;
> + int ret;
> +
> + timeout = nvme_recovery_timeout_ms(ictrl);
> + dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
> +
> + now = jiffies;
> + deadline = now + msecs_to_jiffies(timeout);
> + while (time_before(now, deadline)) {
> + sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
> + if (!sctrl) {
> + /* CCR failed, switch to time-based recovery */
> + return deadline - now;
It is not clear what the return code semantics of this function are.
How about making it success/failure and having the caller choose what to do?
> + }
> +
> + ret = nvme_issue_wait_ccr(sctrl, ictrl);
> + atomic_inc(&sctrl->ccr_limit);
inc after you wait for the ccr? shouldn't this be before?
> +
> + if (!ret) {
> + dev_info(ictrl->device, "CCR succeeded using %s\n",
> + dev_name(sctrl->device));
> + blk_put_queue(sctrl->admin_q);
> + nvme_put_ctrl(sctrl);
> + return 0;
> + }
> +
> + /* Try another controller */
> + min_cntlid = sctrl->cntlid + 1;
OK, I see why min_cntlid is used. That is very non-intuitive.
I'm wondering if it would be simpler to take one shot at CCR and, if it
fails, fall back to CRT. I mean, if the sctrl is alive and it was unable
to reset the ictrl in time, how would another ctrl do a better job here?
> + blk_put_queue(sctrl->admin_q);
> + nvme_put_ctrl(sctrl);
> + now = jiffies;
> + }
> +
> + dev_info(ictrl->device, "CCR reached timeout, call it done\n");
> + return 0;
> +}
> +EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
> +
> +void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&ctrl->lock, flags);
> + WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);
This needs to be a proper state transition.
On Sat 2025-12-27 12:14:11 +0200, Sagi Grimberg wrote:
>
>
> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> > A host that has more than one path connecting to an nvme subsystem
> > typically has an nvme controller associated with every path. This is
> > mostly applicable to nvmeof. If one path goes down, inflight IOs on that
> > path should not be retried immediately on another path because this
> > could lead to data corruption as described in TP4129. TP8028 defines
> > cross-controller reset mechanism that can be used by host to terminate
> > IOs on the failed path using one of the remaining healthy paths. Only
> > after IOs are terminated, or long enough time passes as defined by
> > TP4129, inflight IOs should be retried on another path. Implement core
> > cross-controller reset shared logic to be used by the transports.
> >
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> > drivers/nvme/host/constants.c | 1 +
> > drivers/nvme/host/core.c | 133 ++++++++++++++++++++++++++++++++++
> > drivers/nvme/host/nvme.h | 10 +++
> > 3 files changed, 144 insertions(+)
> >
> > diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
> > index dc90df9e13a2..f679efd5110e 100644
> > --- a/drivers/nvme/host/constants.c
> > +++ b/drivers/nvme/host/constants.c
> > @@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
> > [nvme_admin_virtual_mgmt] = "Virtual Management",
> > [nvme_admin_nvme_mi_send] = "NVMe Send MI",
> > [nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
> > + [nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
> > [nvme_admin_dbbuf] = "Doorbell Buffer Config",
> > [nvme_admin_format_nvm] = "Format NVM",
> > [nvme_admin_security_send] = "Security Send",
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index f5b84bc327d3..f38b70ca9cee 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
> > }
> > EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
> >
> > +static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
> > + u32 min_cntlid)
> > +{
> > + struct nvme_subsystem *subsys = ictrl->subsys;
> > + struct nvme_ctrl *sctrl;
> > + unsigned long flags;
> > +
> > + mutex_lock(&nvme_subsystems_lock);
>
> This looks like the wrong lock to take here?
This is similar to nvme_validate_cntlid()?
What is the correct lock to use?
>
> > + list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> > + if (sctrl->cntlid < min_cntlid)
> > + continue;
>
> The use of min_cntlid is not clear to me.
>
> > +
> > + if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
> > + continue;
> > +
> > + spin_lock_irqsave(&sctrl->lock, flags);
> > + if (sctrl->state != NVME_CTRL_LIVE) {
> > + spin_unlock_irqrestore(&sctrl->lock, flags);
> > + atomic_inc(&sctrl->ccr_limit);
> > + continue;
> > + }
> > +
> > + /*
> > + * We got a good candidate source controller that is locked and
> > + * LIVE. However, no guarantee sctrl will not be deleted after
> > + * sctrl->lock is released. Get a ref of both sctrl and admin_q
> > + * so they do not disappear until we are done with them.
> > + */
> > + WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
> > + nvme_get_ctrl(sctrl);
> > + spin_unlock_irqrestore(&sctrl->lock, flags);
> > + goto found;
> > + }
> > + sctrl = NULL;
> > +found:
> > + mutex_unlock(&nvme_subsystems_lock);
> > + return sctrl;
> > +}
> > +
> > +static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
> > +{
> > + unsigned long flags, tmo, remain;
> > + struct nvme_ccr_entry ccr = { };
> > + union nvme_result res = { 0 };
> > + struct nvme_command c = { };
> > + u32 result;
> > + int ret = 0;
> > +
> > + init_completion(&ccr.complete);
> > + ccr.ictrl = ictrl;
> > +
> > + spin_lock_irqsave(&sctrl->lock, flags);
> > + list_add_tail(&ccr.list, &sctrl->ccrs);
> > + spin_unlock_irqrestore(&sctrl->lock, flags);
> > +
> > + c.ccr.opcode = nvme_admin_cross_ctrl_reset;
> > + c.ccr.ciu = ictrl->ciu;
> > + c.ccr.icid = cpu_to_le16(ictrl->cntlid);
> > + c.ccr.cirn = cpu_to_le64(ictrl->cirn);
> > + ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
> > + NULL, 0, NVME_QID_ANY, 0);
> > + if (ret)
> > + goto out;
> > +
> > + result = le32_to_cpu(res.u32);
> > + if (result & 0x01) /* Immediate Reset */
> > + goto out;
> > +
> > + tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
> > + remain = wait_for_completion_timeout(&ccr.complete, tmo);
> > + if (!remain)
>
> I think remain is redundant here.
Deleted 'remain'.
>
> > + ret = -EAGAIN;
> > +out:
> > + spin_lock_irqsave(&sctrl->lock, flags);
> > + list_del(&ccr.list);
> > + spin_unlock_irqrestore(&sctrl->lock, flags);
> > + return ccr.ccrs == 1 ? 0 : ret;
>
> Why would you still return 0 and not EAGAIN? you expired on timeout but
> still
> return success if you have ccrs=1? btw you have ccrs in the ccr struct
> and in the controller
> as a list. Lets rename to distinguish the two.
True, we did expire the timeout here. However, after removing the ccr
entry we found that it was marked as completed. We return success in
this case even though we hit the timeout.
Renamed ctrl->ccrs to ctrl->ccr_list.
>
> > +}
> > +
> > +unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
> > +{
>
> I'd call it nvme_fence_controller()
Okay. I will do that. I will also rename the controller state to FENCING.
>
> > + unsigned long deadline, now, timeout;
> > + struct nvme_ctrl *sctrl;
> > + u32 min_cntlid = 0;
> > + int ret;
> > +
> > + timeout = nvme_recovery_timeout_ms(ictrl);
> > + dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
> > +
> > + now = jiffies;
> > + deadline = now + msecs_to_jiffies(timeout);
> > + while (time_before(now, deadline)) {
> > + sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
> > + if (!sctrl) {
> > + /* CCR failed, switch to time-based recovery */
> > + return deadline - now;
>
> It is not clear what is the return code semantics of this function.
> How about making it success/failure and have the caller choose what to do?
The function returns 0 on success. On failure it returns the time in
jiffies to hold requests before they are canceled; that time is
essentially the hold time defined in TP4129 minus the time it took to
attempt CCR.
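For illustration, a transport-side caller could consume the return value
roughly like this (a sketch only; the helper below is made up and is not
part of this series):

	static void nvme_demo_recover_path(struct nvme_ctrl *ctrl)
	{
		unsigned long hold = nvme_recover_ctrl(ctrl);

		if (hold)	/* CCR did not fence the path: time-based recovery */
			schedule_timeout_uninterruptible(hold);

		/* IOs were terminated by CCR or the hold time elapsed,
		 * so it is now safe to cancel them and retry elsewhere. */
		nvme_cancel_tagset(ctrl);
		nvme_cancel_admin_tagset(ctrl);
	}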
>
> > + }
> > +
> > + ret = nvme_issue_wait_ccr(sctrl, ictrl);
> > + atomic_inc(&sctrl->ccr_limit);
>
> inc after you wait for the ccr? shouldn't this be before?
I think it should be after we wait for the CCR. sctrl->ccr_limit is the
number of concurrent CCRs the controller supports. Only after we are
done with the CCR on this controller do we increment it.
>
> > +
> > + if (!ret) {
> > + dev_info(ictrl->device, "CCR succeeded using %s\n",
> > + dev_name(sctrl->device));
> > + blk_put_queue(sctrl->admin_q);
> > + nvme_put_ctrl(sctrl);
> > + return 0;
> > + }
> > +
> > + /* Try another controller */
> > + min_cntlid = sctrl->cntlid + 1;
>
> OK, I see why min_cntlid is used. That is very non-intuitive.
>
> I'm wandering if it will be simpler to take one-shot at ccr and
> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
> unable
> to reset the ictrl in time, how would another ctrl do a better job here?
We need to attempt CCR from multiple controllers for the reason explained
in another response. As you figured out, min_cntlid is needed in order to
not loop over the controller list forever. Do you have a better idea?
>
> > + blk_put_queue(sctrl->admin_q);
> > + nvme_put_ctrl(sctrl);
> > + now = jiffies;
> > + }
> > +
> > + dev_info(ictrl->device, "CCR reached timeout, call it done\n");
> > + return 0;
> > +}
> > +EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
> > +
> > +void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
> > +{
> > + unsigned long flags;
> > +
> > + spin_lock_irqsave(&ctrl->lock, flags);
> > + WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);
>
> This needs to be a proper state transition.
We do not want to have a proper transition from RECOVERING to RESETTING.
The reason is that we do not want the controller to be reset while it is
being recovered/fenced, because requests should not be canceled. One way
to keep the transitions in nvme_change_ctrl_state() is to use two states,
say FENCING and FENCED.
The allowed transitions are
- LIVE -> FENCING
- FENCING -> FENCED
- FENCED -> (RESETTING, DELETING)
This will also get rid of NVME_CTRL_RECOVERED.
Does this sound good?
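As a rough sketch of the proposed transition table (FENCING/FENCED do not
exist yet, and the helper name below is made up, purely to illustrate):

	static bool nvme_fence_transition_allowed(enum nvme_ctrl_state old_state,
						  enum nvme_ctrl_state new_state)
	{
		switch (new_state) {
		case NVME_CTRL_FENCING:
			return old_state == NVME_CTRL_LIVE;
		case NVME_CTRL_FENCED:
			return old_state == NVME_CTRL_FENCING;
		case NVME_CTRL_RESETTING:
		case NVME_CTRL_DELETING:
			/* in addition to the transitions allowed today */
			return old_state == NVME_CTRL_FENCED;
		default:
			return false;
		}
	}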
On 01/01/2026 1:43, Mohamed Khalfella wrote:
> On Sat 2025-12-27 12:14:11 +0200, Sagi Grimberg wrote:
>>
>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
>>> A host that has more than one path connecting to an nvme subsystem
>>> typically has an nvme controller associated with every path. This is
>>> mostly applicable to nvmeof. If one path goes down, inflight IOs on that
>>> path should not be retried immediately on another path because this
>>> could lead to data corruption as described in TP4129. TP8028 defines
>>> cross-controller reset mechanism that can be used by host to terminate
>>> IOs on the failed path using one of the remaining healthy paths. Only
>>> after IOs are terminated, or long enough time passes as defined by
>>> TP4129, inflight IOs should be retried on another path. Implement core
>>> cross-controller reset shared logic to be used by the transports.
>>>
>>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
>>> ---
>>> drivers/nvme/host/constants.c | 1 +
>>> drivers/nvme/host/core.c | 133 ++++++++++++++++++++++++++++++++++
>>> drivers/nvme/host/nvme.h | 10 +++
>>> 3 files changed, 144 insertions(+)
>>>
>>> diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
>>> index dc90df9e13a2..f679efd5110e 100644
>>> --- a/drivers/nvme/host/constants.c
>>> +++ b/drivers/nvme/host/constants.c
>>> @@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
>>> [nvme_admin_virtual_mgmt] = "Virtual Management",
>>> [nvme_admin_nvme_mi_send] = "NVMe Send MI",
>>> [nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
>>> + [nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
>>> [nvme_admin_dbbuf] = "Doorbell Buffer Config",
>>> [nvme_admin_format_nvm] = "Format NVM",
>>> [nvme_admin_security_send] = "Security Send",
>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>>> index f5b84bc327d3..f38b70ca9cee 100644
>>> --- a/drivers/nvme/host/core.c
>>> +++ b/drivers/nvme/host/core.c
>>> @@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
>>> }
>>> EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
>>>
>>> +static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
>>> + u32 min_cntlid)
>>> +{
>>> + struct nvme_subsystem *subsys = ictrl->subsys;
>>> + struct nvme_ctrl *sctrl;
>>> + unsigned long flags;
>>> +
>>> + mutex_lock(&nvme_subsystems_lock);
>> This looks like the wrong lock to take here?
> This is similar to nvme_validate_cntlid()?
> What is the correct lock to use?
Not really, it's only because it is called from nvme_init_subsystem(),
which spans subsystems.
>
>>> + list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
>>> + if (sctrl->cntlid < min_cntlid)
>>> + continue;
>> The use of min_cntlid is not clear to me.
>>
>>> +
>>> + if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
>>> + continue;
>>> +
>>> + spin_lock_irqsave(&sctrl->lock, flags);
>>> + if (sctrl->state != NVME_CTRL_LIVE) {
>>> + spin_unlock_irqrestore(&sctrl->lock, flags);
>>> + atomic_inc(&sctrl->ccr_limit);
>>> + continue;
>>> + }
>>> +
>>> + /*
>>> + * We got a good candidate source controller that is locked and
>>> + * LIVE. However, no guarantee sctrl will not be deleted after
>>> + * sctrl->lock is released. Get a ref of both sctrl and admin_q
>>> + * so they do not disappear until we are done with them.
>>> + */
>>> + WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
>>> + nvme_get_ctrl(sctrl);
>>> + spin_unlock_irqrestore(&sctrl->lock, flags);
>>> + goto found;
>>> + }
>>> + sctrl = NULL;
>>> +found:
>>> + mutex_unlock(&nvme_subsystems_lock);
>>> + return sctrl;
>>> +}
>>> +
>>> +static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
>>> +{
>>> + unsigned long flags, tmo, remain;
>>> + struct nvme_ccr_entry ccr = { };
>>> + union nvme_result res = { 0 };
>>> + struct nvme_command c = { };
>>> + u32 result;
>>> + int ret = 0;
>>> +
>>> + init_completion(&ccr.complete);
>>> + ccr.ictrl = ictrl;
>>> +
>>> + spin_lock_irqsave(&sctrl->lock, flags);
>>> + list_add_tail(&ccr.list, &sctrl->ccrs);
>>> + spin_unlock_irqrestore(&sctrl->lock, flags);
>>> +
>>> + c.ccr.opcode = nvme_admin_cross_ctrl_reset;
>>> + c.ccr.ciu = ictrl->ciu;
>>> + c.ccr.icid = cpu_to_le16(ictrl->cntlid);
>>> + c.ccr.cirn = cpu_to_le64(ictrl->cirn);
>>> + ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
>>> + NULL, 0, NVME_QID_ANY, 0);
>>> + if (ret)
>>> + goto out;
>>> +
>>> + result = le32_to_cpu(res.u32);
>>> + if (result & 0x01) /* Immediate Reset */
>>> + goto out;
>>> +
>>> + tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
>>> + remain = wait_for_completion_timeout(&ccr.complete, tmo);
>>> + if (!remain)
>> I think remain is redundant here.
> Deleted 'remain'.
>
>>> + ret = -EAGAIN;
>>> +out:
>>> + spin_lock_irqsave(&sctrl->lock, flags);
>>> + list_del(&ccr.list);
>>> + spin_unlock_irqrestore(&sctrl->lock, flags);
>>> + return ccr.ccrs == 1 ? 0 : ret;
>> Why would you still return 0 and not EAGAIN? you expired on timeout but
>> still
>> return success if you have ccrs=1? btw you have ccrs in the ccr struct
>> and in the controller
>> as a list. Lets rename to distinguish the two.
> True, we did expire timeout here. However, after we removed the ccr
> entry we found that it was marked as completed. We return success in
> this case even though we hit timeout.
When does this happen? Why is it worth making the code non-intuitive for
something that effectively never happens (unless I'm missing something)?
>
> Renamed ctrl->ccrs to ctrl->ccr_list.
>
>>> +}
>>> +
>>> +unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
>>> +{
>> I'd call it nvme_fence_controller()
> Okay. I will do that. I will also rename the controller state FENCING.
>
>>> + unsigned long deadline, now, timeout;
>>> + struct nvme_ctrl *sctrl;
>>> + u32 min_cntlid = 0;
>>> + int ret;
>>> +
>>> + timeout = nvme_recovery_timeout_ms(ictrl);
>>> + dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
>>> +
>>> + now = jiffies;
>>> + deadline = now + msecs_to_jiffies(timeout);
>>> + while (time_before(now, deadline)) {
>>> + sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
>>> + if (!sctrl) {
>>> + /* CCR failed, switch to time-based recovery */
>>> + return deadline - now;
>> It is not clear what is the return code semantics of this function.
>> How about making it success/failure and have the caller choose what to do?
> The function returns 0 on success. On failure it returns the time in
> jiffies to hold requests for before they are canceled. On failure the
> returned time is essentially the hold time defined in TP4129 minus the
> time it took to attempt CCR.
I think it would be cleaner to simply have this function return a status
code and have the caller worry about the time spent.
>
>>> + }
>>> +
>>> + ret = nvme_issue_wait_ccr(sctrl, ictrl);
>>> + atomic_inc(&sctrl->ccr_limit);
>> inc after you wait for the ccr? shouldn't this be before?
> I think it should be after we wait for CCR. sctrl->ccr_limit is the
> number of concurrent CCRs the controller supports. Only after we are
> done with CCR on this controller we increment it.
Maybe it should be folded into nvme_issue_wait_ccr for symmetry?
>
>>> +
>>> + if (!ret) {
>>> + dev_info(ictrl->device, "CCR succeeded using %s\n",
>>> + dev_name(sctrl->device));
>>> + blk_put_queue(sctrl->admin_q);
>>> + nvme_put_ctrl(sctrl);
>>> + return 0;
>>> + }
>>> +
>>> + /* Try another controller */
>>> + min_cntlid = sctrl->cntlid + 1;
>> OK, I see why min_cntlid is used. That is very non-intuitive.
>>
>> I'm wandering if it will be simpler to take one-shot at ccr and
>> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
>> unable
>> to reset the ictrl in time, how would another ctrl do a better job here?
> We need to attempt CCR from multiple controllers for reason explained in
> another response. As you figured out min_cntlid is needed in order to
> not loop controller list forever. Do you have a better idea?
No, just know that I don't like it very much :)
>
>>> + blk_put_queue(sctrl->admin_q);
>>> + nvme_put_ctrl(sctrl);
>>> + now = jiffies;
>>> + }
>>> +
>>> + dev_info(ictrl->device, "CCR reached timeout, call it done\n");
>>> + return 0;
>>> +}
>>> +EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
>>> +
>>> +void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
>>> +{
>>> + unsigned long flags;
>>> +
>>> + spin_lock_irqsave(&ctrl->lock, flags);
>>> + WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);
>> This needs to be a proper state transition.
> We do not want to have proper transition from RECOVERING to RESETTING.
> The reason is that we do not want the controller to be reset while it is
> being recovered/fenced because requests should not be canceled. One way
> to keep the transitions in nvme_change_ctrl_state() is to use two
> states. Say FENCING and FENCED.
>
> The allowed transitions are
>
> - LIVE -> FENCING
> - FENCING -> FENCED
> - FENCED -> (RESETTING, DELETING)
>
> This will also git rid of NVME_CTRL_RECOVERED
>
> Does this sound good?
We could do what failfast is doing: if we get a FENCING -> RESETTING/DELETING
transition, we flush the fence_work...
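Something along these lines, I guess (fence_work and the FENCING state are
assumptions from this thread; called after the state change, once ctrl->lock
has been dropped, like the failfast hooks):

	static void nvme_handle_fence_transition(struct nvme_ctrl *ctrl,
						 enum nvme_ctrl_state old_state,
						 enum nvme_ctrl_state new_state)
	{
		if (old_state != NVME_CTRL_FENCING)
			return;

		/* A reset/delete that races with fencing waits for the
		 * in-flight fence attempt to finish before tearing down. */
		if (new_state == NVME_CTRL_RESETTING ||
		    new_state == NVME_CTRL_DELETING)
			flush_work(&ctrl->fence_work);
	}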
On Sun 2026-01-04 23:39:35 +0200, Sagi Grimberg wrote:
>
>
> On 01/01/2026 1:43, Mohamed Khalfella wrote:
> > On Sat 2025-12-27 12:14:11 +0200, Sagi Grimberg wrote:
> >>
> >> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> >>> A host that has more than one path connecting to an nvme subsystem
> >>> typically has an nvme controller associated with every path. This is
> >>> mostly applicable to nvmeof. If one path goes down, inflight IOs on that
> >>> path should not be retried immediately on another path because this
> >>> could lead to data corruption as described in TP4129. TP8028 defines
> >>> cross-controller reset mechanism that can be used by host to terminate
> >>> IOs on the failed path using one of the remaining healthy paths. Only
> >>> after IOs are terminated, or long enough time passes as defined by
> >>> TP4129, inflight IOs should be retried on another path. Implement core
> >>> cross-controller reset shared logic to be used by the transports.
> >>>
> >>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> >>> ---
> >>> drivers/nvme/host/constants.c | 1 +
> >>> drivers/nvme/host/core.c | 133 ++++++++++++++++++++++++++++++++++
> >>> drivers/nvme/host/nvme.h | 10 +++
> >>> 3 files changed, 144 insertions(+)
> >>>
> >>> diff --git a/drivers/nvme/host/constants.c b/drivers/nvme/host/constants.c
> >>> index dc90df9e13a2..f679efd5110e 100644
> >>> --- a/drivers/nvme/host/constants.c
> >>> +++ b/drivers/nvme/host/constants.c
> >>> @@ -46,6 +46,7 @@ static const char * const nvme_admin_ops[] = {
> >>> [nvme_admin_virtual_mgmt] = "Virtual Management",
> >>> [nvme_admin_nvme_mi_send] = "NVMe Send MI",
> >>> [nvme_admin_nvme_mi_recv] = "NVMe Receive MI",
> >>> + [nvme_admin_cross_ctrl_reset] = "Cross Controller Reset",
> >>> [nvme_admin_dbbuf] = "Doorbell Buffer Config",
> >>> [nvme_admin_format_nvm] = "Format NVM",
> >>> [nvme_admin_security_send] = "Security Send",
> >>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >>> index f5b84bc327d3..f38b70ca9cee 100644
> >>> --- a/drivers/nvme/host/core.c
> >>> +++ b/drivers/nvme/host/core.c
> >>> @@ -554,6 +554,138 @@ void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl)
> >>> }
> >>> EXPORT_SYMBOL_GPL(nvme_cancel_admin_tagset);
> >>>
> >>> +static struct nvme_ctrl *nvme_find_ccr_ctrl(struct nvme_ctrl *ictrl,
> >>> + u32 min_cntlid)
> >>> +{
> >>> + struct nvme_subsystem *subsys = ictrl->subsys;
> >>> + struct nvme_ctrl *sctrl;
> >>> + unsigned long flags;
> >>> +
> >>> + mutex_lock(&nvme_subsystems_lock);
> >> This looks like the wrong lock to take here?
> > This is similar to nvme_validate_cntlid()?
> > What is the correct lock to use?
>
> Not really, its only because it is called from nvme_init_subsystem which
> spans
> subsystems.
Okay. I will use this lock for now. If this is not the right lock to use
please point me to the right one.
>
> >
> >>> + list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> >>> + if (sctrl->cntlid < min_cntlid)
> >>> + continue;
> >> The use of min_cntlid is not clear to me.
> >>
> >>> +
> >>> + if (atomic_dec_if_positive(&sctrl->ccr_limit) < 0)
> >>> + continue;
> >>> +
> >>> + spin_lock_irqsave(&sctrl->lock, flags);
> >>> + if (sctrl->state != NVME_CTRL_LIVE) {
> >>> + spin_unlock_irqrestore(&sctrl->lock, flags);
> >>> + atomic_inc(&sctrl->ccr_limit);
> >>> + continue;
> >>> + }
> >>> +
> >>> + /*
> >>> + * We got a good candidate source controller that is locked and
> >>> + * LIVE. However, no guarantee sctrl will not be deleted after
> >>> + * sctrl->lock is released. Get a ref of both sctrl and admin_q
> >>> + * so they do not disappear until we are done with them.
> >>> + */
> >>> + WARN_ON_ONCE(!blk_get_queue(sctrl->admin_q));
> >>> + nvme_get_ctrl(sctrl);
> >>> + spin_unlock_irqrestore(&sctrl->lock, flags);
> >>> + goto found;
> >>> + }
> >>> + sctrl = NULL;
> >>> +found:
> >>> + mutex_unlock(&nvme_subsystems_lock);
> >>> + return sctrl;
> >>> +}
> >>> +
> >>> +static int nvme_issue_wait_ccr(struct nvme_ctrl *sctrl, struct nvme_ctrl *ictrl)
> >>> +{
> >>> + unsigned long flags, tmo, remain;
> >>> + struct nvme_ccr_entry ccr = { };
> >>> + union nvme_result res = { 0 };
> >>> + struct nvme_command c = { };
> >>> + u32 result;
> >>> + int ret = 0;
> >>> +
> >>> + init_completion(&ccr.complete);
> >>> + ccr.ictrl = ictrl;
> >>> +
> >>> + spin_lock_irqsave(&sctrl->lock, flags);
> >>> + list_add_tail(&ccr.list, &sctrl->ccrs);
> >>> + spin_unlock_irqrestore(&sctrl->lock, flags);
> >>> +
> >>> + c.ccr.opcode = nvme_admin_cross_ctrl_reset;
> >>> + c.ccr.ciu = ictrl->ciu;
> >>> + c.ccr.icid = cpu_to_le16(ictrl->cntlid);
> >>> + c.ccr.cirn = cpu_to_le64(ictrl->cirn);
> >>> + ret = __nvme_submit_sync_cmd(sctrl->admin_q, &c, &res,
> >>> + NULL, 0, NVME_QID_ANY, 0);
> >>> + if (ret)
> >>> + goto out;
> >>> +
> >>> + result = le32_to_cpu(res.u32);
> >>> + if (result & 0x01) /* Immediate Reset */
> >>> + goto out;
> >>> +
> >>> + tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
> >>> + remain = wait_for_completion_timeout(&ccr.complete, tmo);
> >>> + if (!remain)
> >> I think remain is redundant here.
> > Deleted 'remain'.
> >
> >>> + ret = -EAGAIN;
> >>> +out:
> >>> + spin_lock_irqsave(&sctrl->lock, flags);
> >>> + list_del(&ccr.list);
> >>> + spin_unlock_irqrestore(&sctrl->lock, flags);
> >>> + return ccr.ccrs == 1 ? 0 : ret;
> >> Why would you still return 0 and not EAGAIN? you expired on timeout but
> >> still
> >> return success if you have ccrs=1? btw you have ccrs in the ccr struct
> >> and in the controller
> >> as a list. Lets rename to distinguish the two.
> > True, we did expire timeout here. However, after we removed the ccr
> > entry we found that it was marked as completed. We return success in
> > this case even though we hit timeout.
>
> When does this happen? Why is it worth having the code non-intuitive for
> something that effectively never happens (unless I'm missing something?)
Agree. It is a very low probability. I deleted the check for this
condition.
>
> >
> > Renamed ctrl->ccrs to ctrl->ccr_list.
> >
> >>> +}
> >>> +
> >>> +unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
> >>> +{
> >> I'd call it nvme_fence_controller()
> > Okay. I will do that. I will also rename the controller state FENCING.
> >
> >>> + unsigned long deadline, now, timeout;
> >>> + struct nvme_ctrl *sctrl;
> >>> + u32 min_cntlid = 0;
> >>> + int ret;
> >>> +
> >>> + timeout = nvme_recovery_timeout_ms(ictrl);
> >>> + dev_info(ictrl->device, "attempting CCR, timeout %lums\n", timeout);
> >>> +
> >>> + now = jiffies;
> >>> + deadline = now + msecs_to_jiffies(timeout);
> >>> + while (time_before(now, deadline)) {
> >>> + sctrl = nvme_find_ccr_ctrl(ictrl, min_cntlid);
> >>> + if (!sctrl) {
> >>> + /* CCR failed, switch to time-based recovery */
> >>> + return deadline - now;
> >> It is not clear what is the return code semantics of this function.
> >> How about making it success/failure and have the caller choose what to do?
> > The function returns 0 on success. On failure it returns the time in
> > jiffies to hold requests for before they are canceled. On failure the
> > returned time is essentially the hold time defined in TP4129 minus the
> > time it took to attempt CCR.
>
> I think it would be cleaner to simple have this function return status
> code and
> have the caller worry about time spent.
nvme_fence_ctrl() needs to track the time. It needs to be aware of how
much time was spent attempting CCR in order to decide whether to continue
trying CCR or give up.
>
> >
> >>> + }
> >>> +
> >>> + ret = nvme_issue_wait_ccr(sctrl, ictrl);
> >>> + atomic_inc(&sctrl->ccr_limit);
> >> inc after you wait for the ccr? shouldn't this be before?
> > I think it should be after we wait for CCR. sctrl->ccr_limit is the
> > number of concurrent CCRs the controller supports. Only after we are
> > done with CCR on this controller we increment it.
>
> Maybe it should be folded into nvme_issue_wait_ccr for symmetry?
Done.
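That is, the tail of nvme_issue_wait_ccr() now releases the slot itself,
roughly like this (sketch, not the exact v2 code):

	out:
		spin_lock_irqsave(&sctrl->lock, flags);
		list_del(&ccr.list);
		spin_unlock_irqrestore(&sctrl->lock, flags);
		atomic_inc(&sctrl->ccr_limit);	/* pairs with atomic_dec_if_positive() */
		return ret;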
>
> >
> >>> +
> >>> + if (!ret) {
> >>> + dev_info(ictrl->device, "CCR succeeded using %s\n",
> >>> + dev_name(sctrl->device));
> >>> + blk_put_queue(sctrl->admin_q);
> >>> + nvme_put_ctrl(sctrl);
> >>> + return 0;
> >>> + }
> >>> +
> >>> + /* Try another controller */
> >>> + min_cntlid = sctrl->cntlid + 1;
> >> OK, I see why min_cntlid is used. That is very non-intuitive.
> >>
> >> I'm wandering if it will be simpler to take one-shot at ccr and
> >> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
> >> unable
> >> to reset the ictrl in time, how would another ctrl do a better job here?
> > We need to attempt CCR from multiple controllers for reason explained in
> > another response. As you figured out min_cntlid is needed in order to
> > not loop controller list forever. Do you have a better idea?
>
> No, just know that I don't like it very much :)
>
> >
> >>> + blk_put_queue(sctrl->admin_q);
> >>> + nvme_put_ctrl(sctrl);
> >>> + now = jiffies;
> >>> + }
> >>> +
> >>> + dev_info(ictrl->device, "CCR reached timeout, call it done\n");
> >>> + return 0;
> >>> +}
> >>> +EXPORT_SYMBOL_GPL(nvme_recover_ctrl);
> >>> +
> >>> +void nvme_end_ctrl_recovery(struct nvme_ctrl *ctrl)
> >>> +{
> >>> + unsigned long flags;
> >>> +
> >>> + spin_lock_irqsave(&ctrl->lock, flags);
> >>> + WRITE_ONCE(ctrl->state, NVME_CTRL_RESETTING);
> >> This needs to be a proper state transition.
> > We do not want to have proper transition from RECOVERING to RESETTING.
> > The reason is that we do not want the controller to be reset while it is
> > being recovered/fenced because requests should not be canceled. One way
> > to keep the transitions in nvme_change_ctrl_state() is to use two
> > states. Say FENCING and FENCED.
> >
> > The allowed transitions are
> >
> > - LIVE -> FENCING
> > - FENCING -> FENCED
> > - FENCED -> (RESETTING, DELETING)
> >
> > This will also git rid of NVME_CTRL_RECOVERED
> >
> > Does this sound good?
>
> We could do what failfast is doing, in case we get transition FENCING ->
> RESETTING/DELETING we flush
> the fence_work...
Yes. This is what v2 does.
> > +
> > + if (!ret) {
> > + dev_info(ictrl->device, "CCR succeeded using %s\n",
> > + dev_name(sctrl->device));
> > + blk_put_queue(sctrl->admin_q);
> > + nvme_put_ctrl(sctrl);
> > + return 0;
> > + }
> > +
> > + /* Try another controller */
> > + min_cntlid = sctrl->cntlid + 1;
>
> OK, I see why min_cntlid is used. That is very non-intuitive.
>
> I'm wandering if it will be simpler to take one-shot at ccr and
> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
> unable
> to reset the ictrl in time, how would another ctrl do a better job here?
There are many different kinds of failures we are dealing with here
that result in a dropped connection (association). It could be a problem
with a specific link, or it could be that one node of an HA pair in the
storage array went down. In the case of a specific link problem, maybe
only one of the connections is down and any controller would work.
In the case of an HA-pair node going down, roughly half of the connections
go down, and there is a race over which controllers are detected as down
first. Some heuristics were put into the spec for deciding which controller
to use, but that is more code and a refinement that could come later (and
they are still heuristics; they may not be helpful).
Because CCR offers a significant win by shortening the recovery time
substantially, it is worth retrying on the other controllers. This time
affects when we can start retrying IO. KATO is in seconds, and
NVMe-oF should be capable of doing a significant number of
IOs in each of those seconds.
Besides, the alternative is just to wait; we might as well be actively trying
to shorten that wait time. Aside from a small increase in code complexity,
is there a downside to doing so?
Sincerely,
Randy Jennings
On 31/12/2025 2:04, Randy Jennings wrote:
>>> +
>>> + if (!ret) {
>>> + dev_info(ictrl->device, "CCR succeeded using %s\n",
>>> + dev_name(sctrl->device));
>>> + blk_put_queue(sctrl->admin_q);
>>> + nvme_put_ctrl(sctrl);
>>> + return 0;
>>> + }
>>> +
>>> + /* Try another controller */
>>> + min_cntlid = sctrl->cntlid + 1;
>> OK, I see why min_cntlid is used. That is very non-intuitive.
>>
>> I'm wandering if it will be simpler to take one-shot at ccr and
>> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
>> unable
>> to reset the ictrl in time, how would another ctrl do a better job here?
> There are many different kinds of failures we are dealing with here
> that result in a dropped connection (association). It could be a problem
> with the specific link, or it could be that the node of an HA pair in the
> storage array went down. In the case of a specific link problem, maybe
> only one of the connections is down and any controller would work.
> In the case of the node of an HA pair, roughly half of the connections
> are going down, and there is a race between the controllers which
> are detected down first. There were some heuristics put into the
> spec about deciding which controller to use, but that is more code
> and a refinement that could come later (and they are still heuristics;
> they may not be helpful).
>
> Because CCR offers a significant win of shortening the recovery time
> substantially, it is worth retrying on the other controllers. This time
> affects when we can start retrying IO. KATO is in seconds, and
> NVMEoF should have the capability of doing a significant amount of
> IOs in each of those seconds.
But it doesn't actually do I/O; it issues I/O and then waits for it to
time out.
>
> Besides, the alternative is just to wait. Might as well be actively trying
> to shorten that wait time. Besides a small increase in code complexity,
> is there a downside to doing so?
Simplicity is very important when it comes to non-trivial code paths
like error recovery.
On Sun, Jan 4, 2026 at 1:14 PM Sagi Grimberg <sagi@grimberg.me> wrote:
> On 31/12/2025 2:04, Randy Jennings wrote:
> >>> +
> >>> + if (!ret) {
> >>> + dev_info(ictrl->device, "CCR succeeded using %s\n",
> >>> + dev_name(sctrl->device));
> >>> + blk_put_queue(sctrl->admin_q);
> >>> + nvme_put_ctrl(sctrl);
> >>> + return 0;
> >>> + }
> >>> +
> >>> + /* Try another controller */
> >>> + min_cntlid = sctrl->cntlid + 1;
> >> OK, I see why min_cntlid is used. That is very non-intuitive.
> >>
> >> I'm wandering if it will be simpler to take one-shot at ccr and
> >> if it fails fallback to crt. I mean, if the sctrl is alive, and it was
> >> unable
> >> to reset the ictrl in time, how would another ctrl do a better job here?
> > There are many different kinds of failures we are dealing with here
> > that result in a dropped connection (association). It could be a problem
> > with the specific link, or it could be that the node of an HA pair in the
> > storage array went down. In the case of a specific link problem, maybe
> > only one of the connections is down and any controller would work.
> > In the case of the node of an HA pair, roughly half of the connections
> > are going down, and there is a race between the controllers which
> > are detected down first. There were some heuristics put into the
> > spec about deciding which controller to use, but that is more code
> > and a refinement that could come later (and they are still heuristics;
> > they may not be helpful).
> >
> > Because CCR offers a significant win of shortening the recovery time
> > substantially, it is worth retrying on the other controllers. This time
> > affects when we can start retrying IO. KATO is in seconds, and
> > NVMEoF should have the capability of doing a significant amount of
> > IOs in each of those seconds.
>
> But it doesn't actually do I/O, it issues I/O and then wait for it to
> time out.
Retrying CCR does not actually do I/O (trying to place your antecedent),
but a successful CCR allows the host to get back to doing I/O. Every
second saved can be a significant amount of I/O. If you were given a
choice between a 1-second failover and a 60-second failover, of course
you would go for the 1-second failover. However, if I were given the
option of a 10-second failover and a 60-second failover, I would still
go for the 10-second failover. 50 seconds is still extremely valuable.
>
> >
> > Besides, the alternative is just to wait. Might as well be actively trying
> > to shorten that wait time. Besides a small increase in code complexity,
> > is there a downside to doing so?
>
> Simplicity is very important when it comes to non-trivial code paths
> like error recovery.
Okay, yes, unwarranted complexity, even with some benefit, might not be
worth it. I can see that my comment could be taken as flippant, but the
extra complexity here yields an important and material benefit.
Sincerely,
Randy Jennings
On Tue, Nov 25, 2025 at 6:13 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> A host that has more than one path connecting to an nvme subsystem
> typically has an nvme controller associated with every path. This is
> mostly applicable to nvmeof. If one path goes down, inflight IOs on that
> path should not be retried immediately on another path because this
> could lead to data corruption as described in TP4129. TP8028 defines
> cross-controller reset mechanism that can be used by host to terminate
> IOs on the failed path using one of the remaining healthy paths. Only
> after IOs are terminated, or long enough time passes as defined by
> TP4129, inflight IOs should be retried on another path. Implement core
> cross-controller reset shared logic to be used by the transports.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> +unsigned long nvme_recover_ctrl(struct nvme_ctrl *ictrl)
> + now = jiffies;
> + deadline = now + msecs_to_jiffies(timeout);
> + while (time_before(now, deadline)) {
> + now = jiffies;
> + }
I would use a for-loop to keep the advancing statement close to the condition.
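For example, keeping the body as-is:

	for (now = jiffies; time_before(now, deadline); now = jiffies) {
		/* same body: pick a source controller and attempt CCR */
	}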
Reviewed-by: Randy Jennings <randyj@purestorage.com>