[v1] TP8028 Rapid Path Failure Recovery

[RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Mohamed Khalfella 2 months, 2 weeks ago

Send an AEN to initiator when impacted controller exists. The
notification points to CCR log page that initiator can read to check
which CCR operation completed.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
 drivers/nvme/target/nvmet.h |  3 ++-
 include/linux/nvme.h        |  3 +++
 3 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 7dbe9255ff42..60173833c3eb 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
 	nvmet_async_events_process(ctrl);
 }
 
-void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
 		u8 event_info, u8 log_page)
 {
 	struct nvmet_async_event *aen;
@@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
 	aen->event_info = event_info;
 	aen->log_page = log_page;
 
-	mutex_lock(&ctrl->lock);
 	list_add_tail(&aen->entry, &ctrl->async_events);
-	mutex_unlock(&ctrl->lock);
 
 	queue_work(nvmet_wq, &ctrl->async_event_work);
 }
+void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
+		u8 event_info, u8 log_page)
+{
+	mutex_lock(&ctrl->lock);
+	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
+	mutex_unlock(&ctrl->lock);
+}
 
 static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
 {
@@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
 }
 EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
 
+static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
+{
+	lockdep_assert_held(&ctrl->lock);
+
+	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
+		return;
+
+	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
+				     NVME_AER_NOTICE_CCR_COMPLETED,
+				     NVME_LOG_CCR);
+}
+
 static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
 {
 	struct nvmet_subsys *subsys = ctrl->subsys;
@@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
 	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
 		mutex_lock(&sctrl->lock);
 		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
-			if (ccr->ctrl == ctrl)
+			if (ccr->ctrl == ctrl) {
+				nvmet_ctrl_notify_ccr(sctrl);
 				ccr->ctrl = NULL;
+			}
 		}
 		mutex_unlock(&sctrl->lock);
 	}
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 6c0091b8af8b..7ebcef13be2b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -44,7 +44,8 @@
  * Supported optional AENs:
  */
 #define NVMET_AEN_CFG_OPTIONAL \
-	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE)
+	(NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_ANA_CHANGE | \
+	 NVME_AEN_CFG_CCR_COMPLETE)
 #define NVMET_DISC_AEN_CFG_OPTIONAL \
 	(NVME_AEN_CFG_DISC_CHANGE)
 
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d51883122d65..a145417dccd3 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -863,12 +863,14 @@ enum {
 	NVME_AER_NOTICE_FW_ACT_STARTING = 0x01,
 	NVME_AER_NOTICE_ANA		= 0x03,
 	NVME_AER_NOTICE_DISC_CHANGED	= 0xf0,
+	NVME_AER_NOTICE_CCR_COMPLETED	= 0xf4,
 };
 
 enum {
 	NVME_AEN_BIT_NS_ATTR		= 8,
 	NVME_AEN_BIT_FW_ACT		= 9,
 	NVME_AEN_BIT_ANA_CHANGE		= 11,
+	NVME_AEN_BIT_CCR_COMPLETE	= 20,
 	NVME_AEN_BIT_DISC_CHANGE	= 31,
 };
 
@@ -876,6 +878,7 @@ enum {
 	NVME_AEN_CFG_NS_ATTR		= 1 << NVME_AEN_BIT_NS_ATTR,
 	NVME_AEN_CFG_FW_ACT		= 1 << NVME_AEN_BIT_FW_ACT,
 	NVME_AEN_CFG_ANA_CHANGE		= 1 << NVME_AEN_BIT_ANA_CHANGE,
+	NVME_AEN_CFG_CCR_COMPLETE	= 1 << NVME_AEN_BIT_CCR_COMPLETE,
 	NVME_AEN_CFG_DISC_CHANGE	= 1 << NVME_AEN_BIT_DISC_CHANGE,
 };
 
-- 
2.51.2

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Sagi Grimberg 1 month, 2 weeks ago


On 26/11/2025 4:11, Mohamed Khalfella wrote:
> Send an AEN to initiator when impacted controller exists. The
> notification points to CCR log page that initiator can read to check
> which CCR operation completed.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
>   drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
>   drivers/nvme/target/nvmet.h |  3 ++-
>   include/linux/nvme.h        |  3 +++
>   3 files changed, 28 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index 7dbe9255ff42..60173833c3eb 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
>   	nvmet_async_events_process(ctrl);
>   }
>   
> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
>   		u8 event_info, u8 log_page)
>   {
>   	struct nvmet_async_event *aen;
> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>   	aen->event_info = event_info;
>   	aen->log_page = log_page;
>   
> -	mutex_lock(&ctrl->lock);
>   	list_add_tail(&aen->entry, &ctrl->async_events);
> -	mutex_unlock(&ctrl->lock);
>   
>   	queue_work(nvmet_wq, &ctrl->async_event_work);
>   }
> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> +		u8 event_info, u8 log_page)
> +{
> +	mutex_lock(&ctrl->lock);
> +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
> +	mutex_unlock(&ctrl->lock);
> +}
>   
>   static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
>   {
> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
>   }
>   EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
>   
> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
> +{
> +	lockdep_assert_held(&ctrl->lock);
> +
> +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
> +		return;
> +
> +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
> +				     NVME_AER_NOTICE_CCR_COMPLETED,
> +				     NVME_LOG_CCR);
> +}
> +
>   static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>   {
>   	struct nvmet_subsys *subsys = ctrl->subsys;
> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>   	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
>   		mutex_lock(&sctrl->lock);
>   		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
> -			if (ccr->ctrl == ctrl)
> +			if (ccr->ctrl == ctrl) {
> +				nvmet_ctrl_notify_ccr(sctrl);
>   				ccr->ctrl = NULL;
> +			}

Is this double loop necessary? Would you have more than one controller 
cross resetting the same
controller? Won't it be better to install a callback+opaque that the 
controller removal will call?

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Mohamed Khalfella 1 month, 2 weeks ago

On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
> 
> 
> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> > Send an AEN to initiator when impacted controller exists. The
> > notification points to CCR log page that initiator can read to check
> > which CCR operation completed.
> >
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> >   drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
> >   drivers/nvme/target/nvmet.h |  3 ++-
> >   include/linux/nvme.h        |  3 +++
> >   3 files changed, 28 insertions(+), 5 deletions(-)
> >
> > diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> > index 7dbe9255ff42..60173833c3eb 100644
> > --- a/drivers/nvme/target/core.c
> > +++ b/drivers/nvme/target/core.c
> > @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
> >   	nvmet_async_events_process(ctrl);
> >   }
> >   
> > -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> > +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
> >   		u8 event_info, u8 log_page)
> >   {
> >   	struct nvmet_async_event *aen;
> > @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >   	aen->event_info = event_info;
> >   	aen->log_page = log_page;
> >   
> > -	mutex_lock(&ctrl->lock);
> >   	list_add_tail(&aen->entry, &ctrl->async_events);
> > -	mutex_unlock(&ctrl->lock);
> >   
> >   	queue_work(nvmet_wq, &ctrl->async_event_work);
> >   }
> > +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> > +		u8 event_info, u8 log_page)
> > +{
> > +	mutex_lock(&ctrl->lock);
> > +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
> > +	mutex_unlock(&ctrl->lock);
> > +}
> >   
> >   static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
> >   {
> > @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
> >   }
> >   EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
> >   
> > +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
> > +{
> > +	lockdep_assert_held(&ctrl->lock);
> > +
> > +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
> > +		return;
> > +
> > +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
> > +				     NVME_AER_NOTICE_CCR_COMPLETED,
> > +				     NVME_LOG_CCR);
> > +}
> > +
> >   static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >   {
> >   	struct nvmet_subsys *subsys = ctrl->subsys;
> > @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >   	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> >   		mutex_lock(&sctrl->lock);
> >   		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
> > -			if (ccr->ctrl == ctrl)
> > +			if (ccr->ctrl == ctrl) {
> > +				nvmet_ctrl_notify_ccr(sctrl);
> >   				ccr->ctrl = NULL;
> > +			}
> 
> Is this double loop necessary? Would you have more than one controller 
> cross resetting the same

As it is implemented now CCRs are linked to sctrl. This decision can be
revisited if found suboptimal. At some point I had CCRs linked to
ctrl->subsys but that led to lock ordering issues. Double loop is
necessary to find all CCRs in all controllers and mark them done.
Yes, it is possible to have more than one sctrl resetting the same
ictrl.

> controller? Won't it be better to install a callback+opaque that the 
> controller removal will call?

Can you elaborate more on that? Better in what terms?

nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
we know that ctrl->ref is zero and no new CCRs will be added to this
controller because nvmet_ctrl_find_get_ccr() will not be able to get it.

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Sagi Grimberg 1 month, 1 week ago


On 25/12/2025 20:13, Mohamed Khalfella wrote:
> On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
>>
>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
>>> Send an AEN to initiator when impacted controller exists. The
>>> notification points to CCR log page that initiator can read to check
>>> which CCR operation completed.
>>>
>>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
>>> ---
>>>    drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
>>>    drivers/nvme/target/nvmet.h |  3 ++-
>>>    include/linux/nvme.h        |  3 +++
>>>    3 files changed, 28 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
>>> index 7dbe9255ff42..60173833c3eb 100644
>>> --- a/drivers/nvme/target/core.c
>>> +++ b/drivers/nvme/target/core.c
>>> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
>>>    	nvmet_async_events_process(ctrl);
>>>    }
>>>    
>>> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
>>>    		u8 event_info, u8 log_page)
>>>    {
>>>    	struct nvmet_async_event *aen;
>>> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>>    	aen->event_info = event_info;
>>>    	aen->log_page = log_page;
>>>    
>>> -	mutex_lock(&ctrl->lock);
>>>    	list_add_tail(&aen->entry, &ctrl->async_events);
>>> -	mutex_unlock(&ctrl->lock);
>>>    
>>>    	queue_work(nvmet_wq, &ctrl->async_event_work);
>>>    }
>>> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>> +		u8 event_info, u8 log_page)
>>> +{
>>> +	mutex_lock(&ctrl->lock);
>>> +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
>>> +	mutex_unlock(&ctrl->lock);
>>> +}
>>>    
>>>    static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
>>>    {
>>> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
>>>    }
>>>    EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
>>>    
>>> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
>>> +{
>>> +	lockdep_assert_held(&ctrl->lock);
>>> +
>>> +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
>>> +		return;
>>> +
>>> +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
>>> +				     NVME_AER_NOTICE_CCR_COMPLETED,
>>> +				     NVME_LOG_CCR);
>>> +}
>>> +
>>>    static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>>>    {
>>>    	struct nvmet_subsys *subsys = ctrl->subsys;
>>> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>>>    	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
>>>    		mutex_lock(&sctrl->lock);
>>>    		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
>>> -			if (ccr->ctrl == ctrl)
>>> +			if (ccr->ctrl == ctrl) {
>>> +				nvmet_ctrl_notify_ccr(sctrl);
>>>    				ccr->ctrl = NULL;
>>> +			}
>> Is this double loop necessary? Would you have more than one controller
>> cross resetting the same
> As it is implemented now CCRs are linked to sctrl. This decision can be
> revisited if found suboptimal. At some point I had CCRs linked to
> ctrl->subsys but that led to lock ordering issues. Double loop is
> necessary to find all CCRs in all controllers and mark them done.
> Yes, it is possible to have more than one sctrl resetting the same
> ictrl.

I'm more interested in simplifying.

>
>> controller? Won't it be better to install a callback+opaque that the
>> controller removal will call?
> Can you elaborate more on that? Better in what terms?
>
> nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
> we know that ctrl->ref is zero and no new CCRs will be added to this
> controller because nvmet_ctrl_find_get_ccr() will not be able to get it.

In nvmet, the controller is serving a single host. Hence I am not sure I
understand how multiple source controllers will try to reset the impacted
controller. So, if there is a 1-1 relationship between source and impacted
controller, I'd perhaps suggest to simplify and install on the impacted 
controller
callback+opaque (e.g. void *data) instead of having it iterate and then 
actually send
the AEN from the impacted controller.

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Mohamed Khalfella 1 month, 1 week ago

On Sat 2025-12-27 11:48:49 +0200, Sagi Grimberg wrote:
> 
> 
> On 25/12/2025 20:13, Mohamed Khalfella wrote:
> > On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
> >>
> >> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> >>> Send an AEN to initiator when impacted controller exists. The
> >>> notification points to CCR log page that initiator can read to check
> >>> which CCR operation completed.
> >>>
> >>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> >>> ---
> >>>    drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
> >>>    drivers/nvme/target/nvmet.h |  3 ++-
> >>>    include/linux/nvme.h        |  3 +++
> >>>    3 files changed, 28 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> >>> index 7dbe9255ff42..60173833c3eb 100644
> >>> --- a/drivers/nvme/target/core.c
> >>> +++ b/drivers/nvme/target/core.c
> >>> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
> >>>    	nvmet_async_events_process(ctrl);
> >>>    }
> >>>    
> >>> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>    		u8 event_info, u8 log_page)
> >>>    {
> >>>    	struct nvmet_async_event *aen;
> >>> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>    	aen->event_info = event_info;
> >>>    	aen->log_page = log_page;
> >>>    
> >>> -	mutex_lock(&ctrl->lock);
> >>>    	list_add_tail(&aen->entry, &ctrl->async_events);
> >>> -	mutex_unlock(&ctrl->lock);
> >>>    
> >>>    	queue_work(nvmet_wq, &ctrl->async_event_work);
> >>>    }
> >>> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>> +		u8 event_info, u8 log_page)
> >>> +{
> >>> +	mutex_lock(&ctrl->lock);
> >>> +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
> >>> +	mutex_unlock(&ctrl->lock);
> >>> +}
> >>>    
> >>>    static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
> >>>    {
> >>> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
> >>>    }
> >>>    EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
> >>>    
> >>> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
> >>> +{
> >>> +	lockdep_assert_held(&ctrl->lock);
> >>> +
> >>> +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
> >>> +		return;
> >>> +
> >>> +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
> >>> +				     NVME_AER_NOTICE_CCR_COMPLETED,
> >>> +				     NVME_LOG_CCR);
> >>> +}
> >>> +
> >>>    static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>    {
> >>>    	struct nvmet_subsys *subsys = ctrl->subsys;
> >>> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>    	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> >>>    		mutex_lock(&sctrl->lock);
> >>>    		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
> >>> -			if (ccr->ctrl == ctrl)
> >>> +			if (ccr->ctrl == ctrl) {
> >>> +				nvmet_ctrl_notify_ccr(sctrl);
> >>>    				ccr->ctrl = NULL;
> >>> +			}
> >> Is this double loop necessary? Would you have more than one controller
> >> cross resetting the same
> > As it is implemented now CCRs are linked to sctrl. This decision can be
> > revisited if found suboptimal. At some point I had CCRs linked to
> > ctrl->subsys but that led to lock ordering issues. Double loop is
> > necessary to find all CCRs in all controllers and mark them done.
> > Yes, it is possible to have more than one sctrl resetting the same
> > ictrl.
> 
> I'm more interested in simplifying.
> 
> >
> >> controller? Won't it be better to install a callback+opaque that the
> >> controller removal will call?
> > Can you elaborate more on that? Better in what terms?
> >
> > nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
> > we know that ctrl->ref is zero and no new CCRs will be added to this
> > controller because nvmet_ctrl_find_get_ccr() will not be able to get it.
> 
> In nvmet, the controller is serving a single host. Hence I am not sure I
> understand how multiple source controllers will try to reset the impacted
> controller. So, if there is a 1-1 relationship between source and impacted
> controller, I'd perhaps suggest to simplify and install on the impacted 
> controller
> callback+opaque (e.g. void *data) instead of having it iterate and then 
> actually send
> the AEN from the impacted controller.

A controller is serving a single path for a given host. A host that is
connected to nvme subsystem via multiple paths will have more than one
controller. I can think of two reasons why we need to support resetting
an impacted controller from multiple source controllers.

- It is possible for multiple paths to go down at the same time. The
  first source controller we use for CCR, even though we check to see if
  LIVE, might have lost connection to subsystem. It is a matter of time
  for it to see keepalive timeout and fail too. If CCR fails using this
  controller we should not give up. We need to try other paths.

- Some nvme subsystems might support resetting impacted controller from
  a subset of controllers connected to the host. An array that has
  multiple frontend engines might not support resetting controllers
  across engines. In fact, TP8028 allows for subsystem to suggest to
  host to use another source controller in Alternate Controller ID
  (ACID) fied on CCR logpage (not implemente in this patchset).

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Sagi Grimberg 1 month ago


On 01/01/2026 0:00, Mohamed Khalfella wrote:
> On Sat 2025-12-27 11:48:49 +0200, Sagi Grimberg wrote:
>> On 25/12/2025 20:13, Mohamed Khalfella wrote:
>>> On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
>>>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
>>>>> Send an AEN to initiator when impacted controller exists. The
>>>>> notification points to CCR log page that initiator can read to check
>>>>> which CCR operation completed.
>>>>>
>>>>> Signed-off-by: Mohamed Khalfella<mkhalfella@purestorage.com>
>>>>> ---
>>>>>     drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
>>>>>     drivers/nvme/target/nvmet.h |  3 ++-
>>>>>     include/linux/nvme.h        |  3 +++
>>>>>     3 files changed, 28 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
>>>>> index 7dbe9255ff42..60173833c3eb 100644
>>>>> --- a/drivers/nvme/target/core.c
>>>>> +++ b/drivers/nvme/target/core.c
>>>>> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
>>>>>     	nvmet_async_events_process(ctrl);
>>>>>     }
>>>>>     
>>>>> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>>>> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
>>>>>     		u8 event_info, u8 log_page)
>>>>>     {
>>>>>     	struct nvmet_async_event *aen;
>>>>> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>>>>     	aen->event_info = event_info;
>>>>>     	aen->log_page = log_page;
>>>>>     
>>>>> -	mutex_lock(&ctrl->lock);
>>>>>     	list_add_tail(&aen->entry, &ctrl->async_events);
>>>>> -	mutex_unlock(&ctrl->lock);
>>>>>     
>>>>>     	queue_work(nvmet_wq, &ctrl->async_event_work);
>>>>>     }
>>>>> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
>>>>> +		u8 event_info, u8 log_page)
>>>>> +{
>>>>> +	mutex_lock(&ctrl->lock);
>>>>> +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
>>>>> +	mutex_unlock(&ctrl->lock);
>>>>> +}
>>>>>     
>>>>>     static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
>>>>>     {
>>>>> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
>>>>>     }
>>>>>     EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
>>>>>     
>>>>> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
>>>>> +{
>>>>> +	lockdep_assert_held(&ctrl->lock);
>>>>> +
>>>>> +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
>>>>> +		return;
>>>>> +
>>>>> +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
>>>>> +				     NVME_AER_NOTICE_CCR_COMPLETED,
>>>>> +				     NVME_LOG_CCR);
>>>>> +}
>>>>> +
>>>>>     static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>>>>>     {
>>>>>     	struct nvmet_subsys *subsys = ctrl->subsys;
>>>>> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
>>>>>     	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
>>>>>     		mutex_lock(&sctrl->lock);
>>>>>     		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
>>>>> -			if (ccr->ctrl == ctrl)
>>>>> +			if (ccr->ctrl == ctrl) {
>>>>> +				nvmet_ctrl_notify_ccr(sctrl);
>>>>>     				ccr->ctrl = NULL;
>>>>> +			}
>>>> Is this double loop necessary? Would you have more than one controller
>>>> cross resetting the same
>>> As it is implemented now CCRs are linked to sctrl. This decision can be
>>> revisited if found suboptimal. At some point I had CCRs linked to
>>> ctrl->subsys but that led to lock ordering issues. Double loop is
>>> necessary to find all CCRs in all controllers and mark them done.
>>> Yes, it is possible to have more than one sctrl resetting the same
>>> ictrl.
>> I'm more interested in simplifying.
>>
>>>> controller? Won't it be better to install a callback+opaque that the
>>>> controller removal will call?
>>> Can you elaborate more on that? Better in what terms?
>>>
>>> nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
>>> we know that ctrl->ref is zero and no new CCRs will be added to this
>>> controller because nvmet_ctrl_find_get_ccr() will not be able to get it.
>> In nvmet, the controller is serving a single host. Hence I am not sure I
>> understand how multiple source controllers will try to reset the impacted
>> controller. So, if there is a 1-1 relationship between source and impacted
>> controller, I'd perhaps suggest to simplify and install on the impacted
>> controller
>> callback+opaque (e.g. void *data) instead of having it iterate and then
>> actually send
>> the AEN from the impacted controller.
> A controller is serving a single path for a given host. A host that is
> connected to nvme subsystem via multiple paths will have more than one
> controller. I can think of two reasons why we need to support resetting
> an impacted controller from multiple source controllers.
>
> - It is possible for multiple paths to go down at the same time. The
>    first source controller we use for CCR, even though we check to see if
>    LIVE, might have lost connection to subsystem. It is a matter of time
>    for it to see keepalive timeout and fail too. If CCR fails using this
>    controller we should not give up. We need to try other paths.

But the host is doing the cross-reset synchronously... it waits for
kato for a completion of the reset, and then gives up, its not like it
is sitting there waiting for the AEN...

Generally the fact that the spec states a capability/flexibility, it is 
still Linux's
choice to choose weather to implement it. I'm trying to understand if we can
simplify Linux host and controller in this non-trivial error recovery flow.

What is your expectation to happen in general? what are your expected 
kato/cqt
values? how many attempts do we want the host to do?

> - Some nvme subsystems might support resetting impacted controller from
>    a subset of controllers connected to the host. An array that has
>    multiple frontend engines might not support resetting controllers
>    across engines. In fact, TP8028 allows for subsystem to suggest to
>    host to use another source controller in Alternate Controller ID
>    (ACID) fied on CCR logpage (not implemente in this patchset).

It is not a case though where the impacted controller will be reset from 
multiple
source controller at the same time...

I'd also say that if indeed there are subsystems that require specific 
controllers to do
cross recovery, they won't be able to use this at all... Are there any 
such arrays?

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Mohamed Khalfella 1 week, 2 days ago

On Sun 2026-01-04 23:09:54 +0200, Sagi Grimberg wrote:
> 
> 
> On 01/01/2026 0:00, Mohamed Khalfella wrote:
> > On Sat 2025-12-27 11:48:49 +0200, Sagi Grimberg wrote:
> >> On 25/12/2025 20:13, Mohamed Khalfella wrote:
> >>> On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
> >>>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> >>>>> Send an AEN to initiator when impacted controller exists. The
> >>>>> notification points to CCR log page that initiator can read to check
> >>>>> which CCR operation completed.
> >>>>>
> >>>>> Signed-off-by: Mohamed Khalfella<mkhalfella@purestorage.com>
> >>>>> ---
> >>>>>     drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
> >>>>>     drivers/nvme/target/nvmet.h |  3 ++-
> >>>>>     include/linux/nvme.h        |  3 +++
> >>>>>     3 files changed, 28 insertions(+), 5 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> >>>>> index 7dbe9255ff42..60173833c3eb 100644
> >>>>> --- a/drivers/nvme/target/core.c
> >>>>> +++ b/drivers/nvme/target/core.c
> >>>>> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
> >>>>>     	nvmet_async_events_process(ctrl);
> >>>>>     }
> >>>>>     
> >>>>> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>>     		u8 event_info, u8 log_page)
> >>>>>     {
> >>>>>     	struct nvmet_async_event *aen;
> >>>>> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>>     	aen->event_info = event_info;
> >>>>>     	aen->log_page = log_page;
> >>>>>     
> >>>>> -	mutex_lock(&ctrl->lock);
> >>>>>     	list_add_tail(&aen->entry, &ctrl->async_events);
> >>>>> -	mutex_unlock(&ctrl->lock);
> >>>>>     
> >>>>>     	queue_work(nvmet_wq, &ctrl->async_event_work);
> >>>>>     }
> >>>>> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>> +		u8 event_info, u8 log_page)
> >>>>> +{
> >>>>> +	mutex_lock(&ctrl->lock);
> >>>>> +	nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
> >>>>> +	mutex_unlock(&ctrl->lock);
> >>>>> +}
> >>>>>     
> >>>>>     static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
> >>>>>     {
> >>>>> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
> >>>>>     }
> >>>>>     EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
> >>>>>     
> >>>>> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
> >>>>> +{
> >>>>> +	lockdep_assert_held(&ctrl->lock);
> >>>>> +
> >>>>> +	if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
> >>>>> +		return;
> >>>>> +
> >>>>> +	nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
> >>>>> +				     NVME_AER_NOTICE_CCR_COMPLETED,
> >>>>> +				     NVME_LOG_CCR);
> >>>>> +}
> >>>>> +
> >>>>>     static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>>>     {
> >>>>>     	struct nvmet_subsys *subsys = ctrl->subsys;
> >>>>> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>>>     	list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> >>>>>     		mutex_lock(&sctrl->lock);
> >>>>>     		list_for_each_entry(ccr, &sctrl->ccrs, entry) {
> >>>>> -			if (ccr->ctrl == ctrl)
> >>>>> +			if (ccr->ctrl == ctrl) {
> >>>>> +				nvmet_ctrl_notify_ccr(sctrl);
> >>>>>     				ccr->ctrl = NULL;
> >>>>> +			}
> >>>> Is this double loop necessary? Would you have more than one controller
> >>>> cross resetting the same
> >>> As it is implemented now CCRs are linked to sctrl. This decision can be
> >>> revisited if found suboptimal. At some point I had CCRs linked to
> >>> ctrl->subsys but that led to lock ordering issues. Double loop is
> >>> necessary to find all CCRs in all controllers and mark them done.
> >>> Yes, it is possible to have more than one sctrl resetting the same
> >>> ictrl.
> >> I'm more interested in simplifying.
> >>
> >>>> controller? Won't it be better to install a callback+opaque that the
> >>>> controller removal will call?
> >>> Can you elaborate more on that? Better in what terms?
> >>>
> >>> nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
> >>> we know that ctrl->ref is zero and no new CCRs will be added to this
> >>> controller because nvmet_ctrl_find_get_ccr() will not be able to get it.
> >> In nvmet, the controller is serving a single host. Hence I am not sure I
> >> understand how multiple source controllers will try to reset the impacted
> >> controller. So, if there is a 1-1 relationship between source and impacted
> >> controller, I'd perhaps suggest to simplify and install on the impacted
> >> controller
> >> callback+opaque (e.g. void *data) instead of having it iterate and then
> >> actually send
> >> the AEN from the impacted controller.
> > A controller is serving a single path for a given host. A host that is
> > connected to nvme subsystem via multiple paths will have more than one
> > controller. I can think of two reasons why we need to support resetting
> > an impacted controller from multiple source controllers.
> >
> > - It is possible for multiple paths to go down at the same time. The
> >    first source controller we use for CCR, even though we check to see if
> >    LIVE, might have lost connection to subsystem. It is a matter of time
> >    for it to see keepalive timeout and fail too. If CCR fails using this
> >    controller we should not give up. We need to try other paths.
> 
> But the host is doing the cross-reset synchronously... it waits for
> kato for a completion of the reset, and then gives up, its not like it
> is sitting there waiting for the AEN...
> 
> Generally the fact that the spec states a capability/flexibility, it is 
> still Linux's
> choice to choose weather to implement it. I'm trying to understand if we can
> simplify Linux host and controller in this non-trivial error recovery flow.
> 
> What is your expectation to happen in general? what are your expected 
> kato/cqt
> values? how many attempts do we want the host to do?

For a target that support CCR it is expected to support CQT as well. If
the initiator notices path failure then it either needs to get the
impacted controller reset via CCR or wait for time-based recovery. Time
based recovery adds long delay (n * kato + cqt), CCR is expected to take
way less time. This is why initiator should try CCR for as long as there
is time and paths to do so. The alternative is to sit and wait and do
nothing.

kato defaults to 5 seconds. cqt depends on the implementation on the
target. I tested these changes with 30s of cqt. With traffic based
keepalive time-based recovery takes 3 * 5 + 30 = 45s. In case of partial
path failure CCR finished in milliseconds. This is why it is better to
try every possible path before giving up. There is no fixed number of
attempts. As long as there is time to try CCR and there are paths to try
the initiator should continue trying CCR. Again, we do not want to
fallback to time-based recovery unless there are no options.


        tmo = msecs_to_jiffies(max(ictrl->cqt, ictrl->kato * 1000));
        if (!wait_for_completion_timeout(&ccr.complete, tmo)) {
                ret = -ETIMEDOUT;
                goto out;
        }

This is the code that waits for CCR to finish after CCR command was
accepted by the target. The specs does not say how much time host should
wait. max(cqt, kato) felt like reasonable value.

> 
> > - Some nvme subsystems might support resetting impacted controller from
> >    a subset of controllers connected to the host. An array that has
> >    multiple frontend engines might not support resetting controllers
> >    across engines. In fact, TP8028 allows for subsystem to suggest to
> >    host to use another source controller in Alternate Controller ID
> >    (ACID) fied on CCR logpage (not implemente in this patchset).
> 
> It is not a case though where the impacted controller will be reset from 
> multiple
> source controller at the same time...
> 
> I'd also say that if indeed there are subsystems that require specific 
> controllers to do
> cross recovery, they won't be able to use this at all... Are there any 
> such arrays?

Not at all. CCR is still useful in the case of partial connectivity
failure to the same engine. I can not name a product that does that
today.

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Randy Jennings 1 month ago

On Sun, Jan 4, 2026 at 1:09 PM Sagi Grimberg <sagi@grimberg.me> wrote:
> On 01/01/2026 0:00, Mohamed Khalfella wrote:
> > On Sat 2025-12-27 11:48:49 +0200, Sagi Grimberg wrote:
> >> On 25/12/2025 20:13, Mohamed Khalfella wrote:
> >>> On Thu 2025-12-25 15:23:51 +0200, Sagi Grimberg wrote:
> >>>> On 26/11/2025 4:11, Mohamed Khalfella wrote:
> >>>>> Send an AEN to initiator when impacted controller exists. The
> >>>>> notification points to CCR log page that initiator can read to check
> >>>>> which CCR operation completed.
> >>>>>
> >>>>> Signed-off-by: Mohamed Khalfella<mkhalfella@purestorage.com>
> >>>>> ---
> >>>>>     drivers/nvme/target/core.c  | 27 +++++++++++++++++++++++----
> >>>>>     drivers/nvme/target/nvmet.h |  3 ++-
> >>>>>     include/linux/nvme.h        |  3 +++
> >>>>>     3 files changed, 28 insertions(+), 5 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> >>>>> index 7dbe9255ff42..60173833c3eb 100644
> >>>>> --- a/drivers/nvme/target/core.c
> >>>>> +++ b/drivers/nvme/target/core.c
> >>>>> @@ -202,7 +202,7 @@ static void nvmet_async_event_work(struct work_struct *work)
> >>>>>           nvmet_async_events_process(ctrl);
> >>>>>     }
> >>>>>
> >>>>> -void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>> +static void nvmet_add_async_event_locked(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>>                   u8 event_info, u8 log_page)
> >>>>>     {
> >>>>>           struct nvmet_async_event *aen;
> >>>>> @@ -215,12 +215,17 @@ void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>>           aen->event_info = event_info;
> >>>>>           aen->log_page = log_page;
> >>>>>
> >>>>> - mutex_lock(&ctrl->lock);
> >>>>>           list_add_tail(&aen->entry, &ctrl->async_events);
> >>>>> - mutex_unlock(&ctrl->lock);
> >>>>>
> >>>>>           queue_work(nvmet_wq, &ctrl->async_event_work);
> >>>>>     }
> >>>>> +void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
> >>>>> +         u8 event_info, u8 log_page)
> >>>>> +{
> >>>>> + mutex_lock(&ctrl->lock);
> >>>>> + nvmet_add_async_event_locked(ctrl, event_type, event_info, log_page);
> >>>>> + mutex_unlock(&ctrl->lock);
> >>>>> +}
> >>>>>
> >>>>>     static void nvmet_add_to_changed_ns_log(struct nvmet_ctrl *ctrl, __le32 nsid)
> >>>>>     {
> >>>>> @@ -1788,6 +1793,18 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
> >>>>>     }
> >>>>>     EXPORT_SYMBOL_GPL(nvmet_alloc_ctrl);
> >>>>>
> >>>>> +static void nvmet_ctrl_notify_ccr(struct nvmet_ctrl *ctrl)
> >>>>> +{
> >>>>> + lockdep_assert_held(&ctrl->lock);
> >>>>> +
> >>>>> + if (nvmet_aen_bit_disabled(ctrl, NVME_AEN_BIT_CCR_COMPLETE))
> >>>>> +         return;
> >>>>> +
> >>>>> + nvmet_add_async_event_locked(ctrl, NVME_AER_NOTICE,
> >>>>> +                              NVME_AER_NOTICE_CCR_COMPLETED,
> >>>>> +                              NVME_LOG_CCR);
> >>>>> +}
> >>>>> +
> >>>>>     static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>>>     {
> >>>>>           struct nvmet_subsys *subsys = ctrl->subsys;
> >>>>> @@ -1801,8 +1818,10 @@ static void nvmet_ctrl_complete_pending_ccr(struct nvmet_ctrl *ctrl)
> >>>>>           list_for_each_entry(sctrl, &subsys->ctrls, subsys_entry) {
> >>>>>                   mutex_lock(&sctrl->lock);
> >>>>>                   list_for_each_entry(ccr, &sctrl->ccrs, entry) {
> >>>>> -                 if (ccr->ctrl == ctrl)
> >>>>> +                 if (ccr->ctrl == ctrl) {
> >>>>> +                         nvmet_ctrl_notify_ccr(sctrl);
> >>>>>                                   ccr->ctrl = NULL;
> >>>>> +                 }
> >>>> Is this double loop necessary? Would you have more than one controller
> >>>> cross resetting the same
> >>> As it is implemented now CCRs are linked to sctrl. This decision can be
> >>> revisited if found suboptimal. At some point I had CCRs linked to
> >>> ctrl->subsys but that led to lock ordering issues. Double loop is
> >>> necessary to find all CCRs in all controllers and mark them done.
> >>> Yes, it is possible to have more than one sctrl resetting the same
> >>> ictrl.
> >> I'm more interested in simplifying.
> >>
> >>>> controller? Won't it be better to install a callback+opaque that the
> >>>> controller removal will call?
> >>> Can you elaborate more on that? Better in what terms?
> >>>
> >>> nvmet_ctrl_complete_pending_ccr() is called from nvmet_ctrl_free() when
> >>> we know that ctrl->ref is zero and no new CCRs will be added to this
> >>> controller because nvmet_ctrl_find_get_ccr() will not be able to get it.
> >> In nvmet, the controller is serving a single host. Hence I am not sure I
> >> understand how multiple source controllers will try to reset the impacted
> >> controller. So, if there is a 1-1 relationship between source and impacted
> >> controller, I'd perhaps suggest to simplify and install on the impacted
> >> controller
> >> callback+opaque (e.g. void *data) instead of having it iterate and then
> >> actually send
> >> the AEN from the impacted controller.
> > A controller is serving a single path for a given host. A host that is
> > connected to nvme subsystem via multiple paths will have more than one
> > controller. I can think of two reasons why we need to support resetting
> > an impacted controller from multiple source controllers.
> >
> > - It is possible for multiple paths to go down at the same time. The
> >    first source controller we use for CCR, even though we check to see if
> >    LIVE, might have lost connection to subsystem. It is a matter of time
> >    for it to see keepalive timeout and fail too. If CCR fails using this
> >    controller we should not give up. We need to try other paths.
>
> But the host is doing the cross-reset synchronously... it waits for
> kato for a completion of the reset, and then gives up, its not like it
> is sitting there waiting for the AEN...
The Linux host as Mohamed has implemented it is doing the CCR
synchronously.  I expect most hosts will also, until they time out one
controller because the connection might be dying, and that timeout
might come sooner than the target.  Controllers publish how many
outstanding CCRs they may be the source of, not how many
controllers they may be impacted controllers for.  However, it is my
understanding (correct me if I am wrong) that the Linux nvmet
implementation is primarily for testing the Linux nvme host
implementation.  Whatever limitations keep the nvmet code
simple can make sense, even if they do not in a production
system.

>
> Generally the fact that the spec states a capability/flexibility, it is
> still Linux's
> choice to choose weather to implement it. I'm trying to understand if we can
> simplify Linux host and controller in this non-trivial error recovery flow.
>
> What is your expectation to happen in general? what are your expected
> kato/cqt
> values? how many attempts do we want the host to do?
For CQT, I see 30-60 seconds.  For KATO, I see 5-60 seconds.  I see
KATO skewing lower than it should because of the cost to reliably avoid
data corruption.

In general, I expect an HA-pair storage array to benefit most from
implementing this feature.  Often, a host would establish an association
over 4 paths, 2 to each HA-pair.  It is common for non-disruptive operation
for a member of the pair to go down for software upgrades or hardware
modifications while the other member stays up.  In this configuration,
2 paths go down when the member goes down.  The host has a 2 in 3
chance of picking a controller to which the host still has a connection.
Given the tremendous reduction in failover time (even with a timeout
on the CCR) and my perception of the complexity cost of a retry, I would
much rather make the host have a 100% chance of using CCR
successfully after a retry.

> > - Some nvme subsystems might support resetting impacted controller from
> >    a subset of controllers connected to the host. An array that has
> >    multiple frontend engines might not support resetting controllers
> >    across engines. In fact, TP8028 allows for subsystem to suggest to
> >    host to use another source controller in Alternate Controller ID
> >    (ACID) fied on CCR logpage (not implemente in this patchset).
>
> It is not a case though where the impacted controller will be reset from
> multiple
> source controller at the same time...
>
> I'd also say that if indeed there are subsystems that require specific
> controllers to do
> cross recovery, they won't be able to use this at all... Are there any
> such arrays?

There are certainly systems out there (striping, Ceph-like, or stretched to
support non-dispersed hosts) that have many more nodes than an HA-pair
and may have communication barriers that prevent nvme controllers from
being able to handle an CCR for another nvme controller. I still think they
would benefit from retrying a CCR to find a path that can succeed.

Sincerely,
Randy Jennings

Re: [RFC PATCH 05/14] nvmet: Send an AEN on CCR completion

Posted by Randy Jennings 1 month, 3 weeks ago

On Tue, Nov 25, 2025 at 6:13 PM Mohamed Khalfella
<mkhalfella@purestorage.com> wrote:
>
> Send an AEN to initiator when impacted controller exists. The
> notification points to CCR log page that initiator can read to check
> which CCR operation completed.
>
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>

Reviewed-by: Randy Jennings <randyj@purestorage.com>