[PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields

Mohamed Khalfella posted 14 patches 1 week, 2 days ago
[PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields
Posted by Mohamed Khalfella 1 week, 2 days ago
TP8028 Rapid Path Failure Recovery defined new fields in controller
identify response. The newly defined fields are:

- CIU (Controller Instance Uniquifier): is an 8-bit non-zero value that
is assigned a random value when the controller is first created. The value
is expected to be incremented when the RDY bit in the CSTS register is
asserted.
- CIRN (Controller Instance Random Number): is a 64-bit random value that
gets generated when the controller is created. CIRN is regenerated every
time the RDY bit in the CSTS register is asserted.
- CCRL (Cross-Controller Reset Limit) is an 8bit value that defines the
maximum number of in-progress controller reset operations. CCRL is
hardcoded to 4 as recommended by TP8028.

TP4129 KATO Corrections and Clarifications defined CQT (Command Quiesce
Time) which is used along with KATO (Keep Alive Timeout) to set an upper
time limit for attempting Cross-Controller Recovery. For an NVMe subsystem,
CQT is set to 0 by default to keep the current behavior. The value can
be set from configfs if needed.

Make the new fields available for IO controllers only since TP8028 is
not very useful for discovery controllers.

Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
---
 drivers/nvme/target/admin-cmd.c |  6 ++++++
 drivers/nvme/target/configfs.c  | 31 +++++++++++++++++++++++++++++++
 drivers/nvme/target/core.c      | 12 ++++++++++++
 drivers/nvme/target/nvmet.h     |  4 ++++
 include/linux/nvme.h            | 15 ++++++++++++---
 5 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 3da31bb1183e..ade1145df72d 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -696,6 +696,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
 
 	id->cntlid = cpu_to_le16(ctrl->cntlid);
 	id->ver = cpu_to_le32(ctrl->subsys->ver);
+	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
+		id->cqt = cpu_to_le16(ctrl->cqt);
+		id->ciu = ctrl->ciu;
+		id->cirn = cpu_to_le64(ctrl->cirn);
+		id->ccrl = NVMF_CCR_LIMIT;
+	}
 
 	/* XXX: figure out what to do about RTD3R/RTD3 */
 	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index e44ef69dffc2..035f6e75a818 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -1636,6 +1636,36 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
 CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable);
 #endif
 
+static ssize_t nvmet_subsys_attr_cqt_show(struct config_item *item,
+					  char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cqt);
+}
+
+static ssize_t nvmet_subsys_attr_cqt_store(struct config_item *item,
+					   const char *page, size_t cnt)
+{
+	struct nvmet_subsys *subsys = to_subsys(item);
+	struct nvmet_ctrl *ctrl;
+	u16 cqt;
+
+	if (sscanf(page, "%hu\n", &cqt) != 1)
+		return -EINVAL;
+
+	down_write(&nvmet_config_sem);
+	if (subsys->cqt == cqt)
+		goto out;
+
+	subsys->cqt = cqt;
+	/* Force reconnect */
+	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+		ctrl->ops->delete_ctrl(ctrl);
+out:
+	up_write(&nvmet_config_sem);
+	return cnt;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_cqt);
+
 static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
 					      char *page)
 {
@@ -1676,6 +1706,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
 	&nvmet_subsys_attr_attr_vendor_id,
 	&nvmet_subsys_attr_attr_subsys_vendor_id,
 	&nvmet_subsys_attr_attr_model,
+	&nvmet_subsys_attr_attr_cqt,
 	&nvmet_subsys_attr_attr_qid_max,
 	&nvmet_subsys_attr_attr_ieee_oui,
 	&nvmet_subsys_attr_attr_firmware,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index cc88e5a28c8a..0d2a1206e08f 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -1393,6 +1393,10 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
 		return;
 	}
 
+	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
+		ctrl->ciu = ((u8)(ctrl->ciu + 1)) ? : 1;
+		ctrl->cirn = get_random_u64();
+	}
 	ctrl->csts = NVME_CSTS_RDY;
 
 	/*
@@ -1661,6 +1665,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
 	}
 	ctrl->cntlid = ret;
 
+	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
+		ctrl->cqt = subsys->cqt;
+		ctrl->ciu = get_random_u8() ? : 1;
+		ctrl->cirn = get_random_u64();
+	}
+
 	/*
 	 * Discovery controllers may use some arbitrary high value
 	 * in order to cleanup stale discovery sessions
@@ -1853,10 +1863,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
 
 	switch (type) {
 	case NVME_NQN_NVME:
+		subsys->cqt = NVMF_CQT_MS;
 		subsys->max_qid = NVMET_NR_QUEUES;
 		break;
 	case NVME_NQN_DISC:
 	case NVME_NQN_CURR:
+		subsys->cqt = 0;
 		subsys->max_qid = 0;
 		break;
 	default:
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index b664b584fdc8..f5d9a01ec60c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -264,7 +264,10 @@ struct nvmet_ctrl {
 
 	uuid_t			hostid;
 	u16			cntlid;
+	u16			cqt;
+	u8			ciu;
 	u32			kato;
+	u64			cirn;
 
 	struct nvmet_port	*port;
 
@@ -331,6 +334,7 @@ struct nvmet_subsys {
 #ifdef CONFIG_NVME_TARGET_DEBUGFS
 	struct dentry		*debugfs_dir;
 #endif
+	u16			cqt;
 	u16			max_qid;
 
 	u64			ver;
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 655d194f8e72..5135cdc3c120 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -21,6 +21,9 @@
 #define NVMF_TRADDR_SIZE	256
 #define NVMF_TSAS_SIZE		256
 
+#define NVMF_CQT_MS		0
+#define NVMF_CCR_LIMIT		4
+
 #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
 
 #define NVME_NSID_ALL		0xffffffff
@@ -328,7 +331,10 @@ struct nvme_id_ctrl {
 	__le16			crdt1;
 	__le16			crdt2;
 	__le16			crdt3;
-	__u8			rsvd134[122];
+	__u8			rsvd134[1];
+	__u8			ciu;
+	__le64			cirn;
+	__u8			rsvd144[112];
 	__le16			oacs;
 	__u8			acl;
 	__u8			aerl;
@@ -362,7 +368,9 @@ struct nvme_id_ctrl {
 	__u8			anacap;
 	__le32			anagrpmax;
 	__le32			nanagrpid;
-	__u8			rsvd352[160];
+	__u8			rsvd352[34];
+	__le16			cqt;
+	__u8			rsvd388[124];
 	__u8			sqes;
 	__u8			cqes;
 	__le16			maxcmd;
@@ -389,7 +397,8 @@ struct nvme_id_ctrl {
 	__u8			msdbd;
 	__u8			rsvd1804[2];
 	__u8			dctype;
-	__u8			rsvd1807[241];
+	__u8			ccrl;
+	__u8			rsvd1808[240];
 	struct nvme_id_power_state	psd[32];
 	__u8			vs[1024];
 };
-- 
2.52.0
Re: [PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields
Posted by Hannes Reinecke 5 days, 22 hours ago
On 1/30/26 23:34, Mohamed Khalfella wrote:
> TP8028 Rapid Path Failure Recovery defined new fields in controller
> identify response. The newly defined fields are:
> 
> - CIU (Controller Instance UNIQUIFIER): is an 8bit non-zero value that
> is assigned a random value when controller first created. The value is
> expected to be incremented when RDY bit in CSTS register is asserted
> - CIRN (Controller Instance Random Number): is 64bit random value that
> gets generated when controller is crated. CIRN is regenerated everytime
> RDY bit is CSTS register is asserted.
> - CCRL (Cross-Controller Reset Limit) is an 8bit value that defines the
> maximum number of in-progress controller reset operations. CCRL is
> hardcoded to 4 as recommended by TP8028.
> 
> TP4129 KATO Corrections and Clarifications defined CQT (Command Quiesce
> Time) which is used along with KATO (Keep Alive Timeout) to set an upper
> time limit for attempting Cross-Controller Recovery. For NVME subsystem
> CQT is set to 0 by default to keep the current behavior. The value can
> be set from configfs if needed.
> 
> Make the new fields available for IO controllers only since TP8028 is
> not very useful for discovery controllers.
> 
> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> ---
>   drivers/nvme/target/admin-cmd.c |  6 ++++++
>   drivers/nvme/target/configfs.c  | 31 +++++++++++++++++++++++++++++++
>   drivers/nvme/target/core.c      | 12 ++++++++++++
>   drivers/nvme/target/nvmet.h     |  4 ++++
>   include/linux/nvme.h            | 15 ++++++++++++---
>   5 files changed, 65 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
> index 3da31bb1183e..ade1145df72d 100644
> --- a/drivers/nvme/target/admin-cmd.c
> +++ b/drivers/nvme/target/admin-cmd.c
> @@ -696,6 +696,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
>   
>   	id->cntlid = cpu_to_le16(ctrl->cntlid);
>   	id->ver = cpu_to_le32(ctrl->subsys->ver);
> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> +		id->cqt = cpu_to_le16(ctrl->cqt);
> +		id->ciu = ctrl->ciu;
> +		id->cirn = cpu_to_le64(ctrl->cirn);
> +		id->ccrl = NVMF_CCR_LIMIT;
> +	}
>   
>   	/* XXX: figure out what to do about RTD3R/RTD3 */
>   	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> index e44ef69dffc2..035f6e75a818 100644
> --- a/drivers/nvme/target/configfs.c
> +++ b/drivers/nvme/target/configfs.c
> @@ -1636,6 +1636,36 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
>   CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable);
>   #endif
>   
> +static ssize_t nvmet_subsys_attr_cqt_show(struct config_item *item,
> +					  char *page)
> +{
> +	return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cqt);
> +}
> +
> +static ssize_t nvmet_subsys_attr_cqt_store(struct config_item *item,
> +					   const char *page, size_t cnt)
> +{
> +	struct nvmet_subsys *subsys = to_subsys(item);
> +	struct nvmet_ctrl *ctrl;
> +	u16 cqt;
> +
> +	if (sscanf(page, "%hu\n", &cqt) != 1)
> +		return -EINVAL;
> +
> +	down_write(&nvmet_config_sem);
> +	if (subsys->cqt == cqt)
> +		goto out;
> +
> +	subsys->cqt = cqt;
> +	/* Force reconnect */
> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
> +		ctrl->ops->delete_ctrl(ctrl);
> +out:
> +	up_write(&nvmet_config_sem);
> +	return cnt;
> +}
> +CONFIGFS_ATTR(nvmet_subsys_, attr_cqt);
> +
>   static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
>   					      char *page)
>   {
> @@ -1676,6 +1706,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
>   	&nvmet_subsys_attr_attr_vendor_id,
>   	&nvmet_subsys_attr_attr_subsys_vendor_id,
>   	&nvmet_subsys_attr_attr_model,
> +	&nvmet_subsys_attr_attr_cqt,
>   	&nvmet_subsys_attr_attr_qid_max,
>   	&nvmet_subsys_attr_attr_ieee_oui,
>   	&nvmet_subsys_attr_attr_firmware,

I do think that TP8028 (ie the CQT definitions) is somewhat independent
of CCR. So I'm not sure if it should be integrated in this patchset;
personally I would prefer to have it moved to another patchset.

> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> index cc88e5a28c8a..0d2a1206e08f 100644
> --- a/drivers/nvme/target/core.c
> +++ b/drivers/nvme/target/core.c
> @@ -1393,6 +1393,10 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
>   		return;
>   	}
>   
> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> +		ctrl->ciu = ((u8)(ctrl->ciu + 1)) ? : 1;
> +		ctrl->cirn = get_random_u64();
> +	}
>   	ctrl->csts = NVME_CSTS_RDY;
>   
>   	/*
> @@ -1661,6 +1665,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
>   	}
>   	ctrl->cntlid = ret;
>   
> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> +		ctrl->cqt = subsys->cqt;
> +		ctrl->ciu = get_random_u8() ? : 1;
> +		ctrl->cirn = get_random_u64();
> +	}
> +
>   	/*
>   	 * Discovery controllers may use some arbitrary high value
>   	 * in order to cleanup stale discovery sessions
> @@ -1853,10 +1863,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
>   
>   	switch (type) {
>   	case NVME_NQN_NVME:
> +		subsys->cqt = NVMF_CQT_MS;
>   		subsys->max_qid = NVMET_NR_QUEUES;
>   		break;

And I would not set the CQT default here.
Thing is, implementing CQT to the letter would inflict a CQT delay
during failover for _every_ installation, thereby resulting in a
regression to previous implementations where we would fail over
with _no_ delay.
So again, we should make it a different patchset.

>   	case NVME_NQN_DISC:
>   	case NVME_NQN_CURR:
> +		subsys->cqt = 0;
>   		subsys->max_qid = 0;
>   		break;
>   	default:
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index b664b584fdc8..f5d9a01ec60c 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -264,7 +264,10 @@ struct nvmet_ctrl {
>   
>   	uuid_t			hostid;
>   	u16			cntlid;
> +	u16			cqt;
> +	u8			ciu;
>   	u32			kato;
> +	u64			cirn;
>   
>   	struct nvmet_port	*port;
>   
> @@ -331,6 +334,7 @@ struct nvmet_subsys {
>   #ifdef CONFIG_NVME_TARGET_DEBUGFS
>   	struct dentry		*debugfs_dir;
>   #endif
> +	u16			cqt;
>   	u16			max_qid;
>   
>   	u64			ver;
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index 655d194f8e72..5135cdc3c120 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -21,6 +21,9 @@
>   #define NVMF_TRADDR_SIZE	256
>   #define NVMF_TSAS_SIZE		256
>   
> +#define NVMF_CQT_MS		0
> +#define NVMF_CCR_LIMIT		4
> +
>   #define NVME_DISC_SUBSYS_NAME	"nqn.2014-08.org.nvmexpress.discovery"
>   
>   #define NVME_NSID_ALL		0xffffffff
> @@ -328,7 +331,10 @@ struct nvme_id_ctrl {
>   	__le16			crdt1;
>   	__le16			crdt2;
>   	__le16			crdt3;
> -	__u8			rsvd134[122];
> +	__u8			rsvd134[1];
> +	__u8			ciu;
> +	__le64			cirn;
> +	__u8			rsvd144[112];
>   	__le16			oacs;
>   	__u8			acl;
>   	__u8			aerl;
> @@ -362,7 +368,9 @@ struct nvme_id_ctrl {
>   	__u8			anacap;
>   	__le32			anagrpmax;
>   	__le32			nanagrpid;
> -	__u8			rsvd352[160];
> +	__u8			rsvd352[34];
> +	__le16			cqt;
> +	__u8			rsvd388[124];
>   	__u8			sqes;
>   	__u8			cqes;
>   	__le16			maxcmd;
> @@ -389,7 +397,8 @@ struct nvme_id_ctrl {
>   	__u8			msdbd;
>   	__u8			rsvd1804[2];
>   	__u8			dctype;
> -	__u8			rsvd1807[241];
> +	__u8			ccrl;
> +	__u8			rsvd1808[240];
>   	struct nvme_id_power_state	psd[32];
>   	__u8			vs[1024];
>   };

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields
Posted by Mohamed Khalfella 5 days, 7 hours ago
On Tue 2026-02-03 04:03:22 +0100, Hannes Reinecke wrote:
> On 1/30/26 23:34, Mohamed Khalfella wrote:
> > TP8028 Rapid Path Failure Recovery defined new fields in controller
> > identify response. The newly defined fields are:
> > 
> > - CIU (Controller Instance UNIQUIFIER): is an 8bit non-zero value that
> > is assigned a random value when controller first created. The value is
> > expected to be incremented when RDY bit in CSTS register is asserted
> > - CIRN (Controller Instance Random Number): is 64bit random value that
> > gets generated when controller is crated. CIRN is regenerated everytime
> > RDY bit is CSTS register is asserted.
> > - CCRL (Cross-Controller Reset Limit) is an 8bit value that defines the
> > maximum number of in-progress controller reset operations. CCRL is
> > hardcoded to 4 as recommended by TP8028.
> > 
> > TP4129 KATO Corrections and Clarifications defined CQT (Command Quiesce
> > Time) which is used along with KATO (Keep Alive Timeout) to set an upper
> > time limit for attempting Cross-Controller Recovery. For NVME subsystem
> > CQT is set to 0 by default to keep the current behavior. The value can
> > be set from configfs if needed.
> > 
> > Make the new fields available for IO controllers only since TP8028 is
> > not very useful for discovery controllers.
> > 
> > Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
> > ---
> >   drivers/nvme/target/admin-cmd.c |  6 ++++++
> >   drivers/nvme/target/configfs.c  | 31 +++++++++++++++++++++++++++++++
> >   drivers/nvme/target/core.c      | 12 ++++++++++++
> >   drivers/nvme/target/nvmet.h     |  4 ++++
> >   include/linux/nvme.h            | 15 ++++++++++++---
> >   5 files changed, 65 insertions(+), 3 deletions(-)
> > 
> > diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
> > index 3da31bb1183e..ade1145df72d 100644
> > --- a/drivers/nvme/target/admin-cmd.c
> > +++ b/drivers/nvme/target/admin-cmd.c
> > @@ -696,6 +696,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
> >   
> >   	id->cntlid = cpu_to_le16(ctrl->cntlid);
> >   	id->ver = cpu_to_le32(ctrl->subsys->ver);
> > +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> > +		id->cqt = cpu_to_le16(ctrl->cqt);
> > +		id->ciu = ctrl->ciu;
> > +		id->cirn = cpu_to_le64(ctrl->cirn);
> > +		id->ccrl = NVMF_CCR_LIMIT;
> > +	}
> >   
> >   	/* XXX: figure out what to do about RTD3R/RTD3 */
> >   	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
> > diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
> > index e44ef69dffc2..035f6e75a818 100644
> > --- a/drivers/nvme/target/configfs.c
> > +++ b/drivers/nvme/target/configfs.c
> > @@ -1636,6 +1636,36 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
> >   CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable);
> >   #endif
> >   
> > +static ssize_t nvmet_subsys_attr_cqt_show(struct config_item *item,
> > +					  char *page)
> > +{
> > +	return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cqt);
> > +}
> > +
> > +static ssize_t nvmet_subsys_attr_cqt_store(struct config_item *item,
> > +					   const char *page, size_t cnt)
> > +{
> > +	struct nvmet_subsys *subsys = to_subsys(item);
> > +	struct nvmet_ctrl *ctrl;
> > +	u16 cqt;
> > +
> > +	if (sscanf(page, "%hu\n", &cqt) != 1)
> > +		return -EINVAL;
> > +
> > +	down_write(&nvmet_config_sem);
> > +	if (subsys->cqt == cqt)
> > +		goto out;
> > +
> > +	subsys->cqt = cqt;
> > +	/* Force reconnect */
> > +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
> > +		ctrl->ops->delete_ctrl(ctrl);
> > +out:
> > +	up_write(&nvmet_config_sem);
> > +	return cnt;
> > +}
> > +CONFIGFS_ATTR(nvmet_subsys_, attr_cqt);
> > +
> >   static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
> >   					      char *page)
> >   {
> > @@ -1676,6 +1706,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
> >   	&nvmet_subsys_attr_attr_vendor_id,
> >   	&nvmet_subsys_attr_attr_subsys_vendor_id,
> >   	&nvmet_subsys_attr_attr_model,
> > +	&nvmet_subsys_attr_attr_cqt,
> >   	&nvmet_subsys_attr_attr_qid_max,
> >   	&nvmet_subsys_attr_attr_ieee_oui,
> >   	&nvmet_subsys_attr_attr_firmware,
> 
> I do think that TP8028 (ie the CQT defintions) are somewhat independent
> on CCR. So I'm not sure if they should be integrated in this patchset;
> personally I would prefer to have it moved to another patchset.

Agreed that CQT is not directly related to CCR from the target
perspective. But there is a relationship when it comes to how the
initiator uses CQT to calculate the time budget for CCR. As you know, on
the host side, if CCR fails and CQT is supported, the requests need to be
held for a certain amount of time before they are retried. So the CQT
value is needed, and that is why I included it in this patchset.

> 
> > diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
> > index cc88e5a28c8a..0d2a1206e08f 100644
> > --- a/drivers/nvme/target/core.c
> > +++ b/drivers/nvme/target/core.c
> > @@ -1393,6 +1393,10 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
> >   		return;
> >   	}
> >   
> > +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> > +		ctrl->ciu = ((u8)(ctrl->ciu + 1)) ? : 1;
> > +		ctrl->cirn = get_random_u64();
> > +	}
> >   	ctrl->csts = NVME_CSTS_RDY;
> >   
> >   	/*
> > @@ -1661,6 +1665,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
> >   	}
> >   	ctrl->cntlid = ret;
> >   
> > +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
> > +		ctrl->cqt = subsys->cqt;
> > +		ctrl->ciu = get_random_u8() ? : 1;
> > +		ctrl->cirn = get_random_u64();
> > +	}
> > +
> >   	/*
> >   	 * Discovery controllers may use some arbitrary high value
> >   	 * in order to cleanup stale discovery sessions
> > @@ -1853,10 +1863,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
> >   
> >   	switch (type) {
> >   	case NVME_NQN_NVME:
> > +		subsys->cqt = NVMF_CQT_MS;
> >   		subsys->max_qid = NVMET_NR_QUEUES;
> >   		break;
> 
> And I would not set the CQT default here.
> Thing is, implementing CQT to the letter would inflict a CQT delay
> during failover for _every_ installation, thereby resulting in a
> regression to previous implementations where we would fail over
> with _no_ delay.
> So again, we should make it a different patchset.

CQT defaults to 0 to avoid introducing a surprise delay. The initiator will
skip holding requests if it sees CQT set to 0.
Re: [PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields
Posted by Hannes Reinecke 5 days, 1 hour ago
On 2/3/26 19:14, Mohamed Khalfella wrote:
> On Tue 2026-02-03 04:03:22 +0100, Hannes Reinecke wrote:
>> On 1/30/26 23:34, Mohamed Khalfella wrote:
>>> TP8028 Rapid Path Failure Recovery defined new fields in controller
>>> identify response. The newly defined fields are:
>>>
>>> - CIU (Controller Instance UNIQUIFIER): is an 8bit non-zero value that
>>> is assigned a random value when controller first created. The value is
>>> expected to be incremented when RDY bit in CSTS register is asserted
>>> - CIRN (Controller Instance Random Number): is 64bit random value that
>>> gets generated when controller is crated. CIRN is regenerated everytime
>>> RDY bit is CSTS register is asserted.
>>> - CCRL (Cross-Controller Reset Limit) is an 8bit value that defines the
>>> maximum number of in-progress controller reset operations. CCRL is
>>> hardcoded to 4 as recommended by TP8028.
>>>
>>> TP4129 KATO Corrections and Clarifications defined CQT (Command Quiesce
>>> Time) which is used along with KATO (Keep Alive Timeout) to set an upper
>>> time limit for attempting Cross-Controller Recovery. For NVME subsystem
>>> CQT is set to 0 by default to keep the current behavior. The value can
>>> be set from configfs if needed.
>>>
>>> Make the new fields available for IO controllers only since TP8028 is
>>> not very useful for discovery controllers.
>>>
>>> Signed-off-by: Mohamed Khalfella <mkhalfella@purestorage.com>
>>> ---
>>>    drivers/nvme/target/admin-cmd.c |  6 ++++++
>>>    drivers/nvme/target/configfs.c  | 31 +++++++++++++++++++++++++++++++
>>>    drivers/nvme/target/core.c      | 12 ++++++++++++
>>>    drivers/nvme/target/nvmet.h     |  4 ++++
>>>    include/linux/nvme.h            | 15 ++++++++++++---
>>>    5 files changed, 65 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
>>> index 3da31bb1183e..ade1145df72d 100644
>>> --- a/drivers/nvme/target/admin-cmd.c
>>> +++ b/drivers/nvme/target/admin-cmd.c
>>> @@ -696,6 +696,12 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
>>>    
>>>    	id->cntlid = cpu_to_le16(ctrl->cntlid);
>>>    	id->ver = cpu_to_le32(ctrl->subsys->ver);
>>> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
>>> +		id->cqt = cpu_to_le16(ctrl->cqt);
>>> +		id->ciu = ctrl->ciu;
>>> +		id->cirn = cpu_to_le64(ctrl->cirn);
>>> +		id->ccrl = NVMF_CCR_LIMIT;
>>> +	}
>>>    
>>>    	/* XXX: figure out what to do about RTD3R/RTD3 */
>>>    	id->oaes = cpu_to_le32(NVMET_AEN_CFG_OPTIONAL);
>>> diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
>>> index e44ef69dffc2..035f6e75a818 100644
>>> --- a/drivers/nvme/target/configfs.c
>>> +++ b/drivers/nvme/target/configfs.c
>>> @@ -1636,6 +1636,36 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
>>>    CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable);
>>>    #endif
>>>    
>>> +static ssize_t nvmet_subsys_attr_cqt_show(struct config_item *item,
>>> +					  char *page)
>>> +{
>>> +	return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->cqt);
>>> +}
>>> +
>>> +static ssize_t nvmet_subsys_attr_cqt_store(struct config_item *item,
>>> +					   const char *page, size_t cnt)
>>> +{
>>> +	struct nvmet_subsys *subsys = to_subsys(item);
>>> +	struct nvmet_ctrl *ctrl;
>>> +	u16 cqt;
>>> +
>>> +	if (sscanf(page, "%hu\n", &cqt) != 1)
>>> +		return -EINVAL;
>>> +
>>> +	down_write(&nvmet_config_sem);
>>> +	if (subsys->cqt == cqt)
>>> +		goto out;
>>> +
>>> +	subsys->cqt = cqt;
>>> +	/* Force reconnect */
>>> +	list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
>>> +		ctrl->ops->delete_ctrl(ctrl);
>>> +out:
>>> +	up_write(&nvmet_config_sem);
>>> +	return cnt;
>>> +}
>>> +CONFIGFS_ATTR(nvmet_subsys_, attr_cqt);
>>> +
>>>    static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
>>>    					      char *page)
>>>    {
>>> @@ -1676,6 +1706,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
>>>    	&nvmet_subsys_attr_attr_vendor_id,
>>>    	&nvmet_subsys_attr_attr_subsys_vendor_id,
>>>    	&nvmet_subsys_attr_attr_model,
>>> +	&nvmet_subsys_attr_attr_cqt,
>>>    	&nvmet_subsys_attr_attr_qid_max,
>>>    	&nvmet_subsys_attr_attr_ieee_oui,
>>>    	&nvmet_subsys_attr_attr_firmware,
>>
>> I do think that TP8028 (ie the CQT defintions) are somewhat independent
>> on CCR. So I'm not sure if they should be integrated in this patchset;
>> personally I would prefer to have it moved to another patchset.
> 
> Agreed that CQT is not directly related to CCR from the target
> perspective. But there is a relationship when it comes to how the
> initiator uses CQT to calculate the time budget for CCR. As you know on
> the host side if CCR fails and CQT is supported the requests needs to be
> held for certain amount of time before they are retried. So CQT value is
> needed and that I why I included it in this patchset.
> 
Oh, I don't disagree that CQT is needed to get full spec compliance.
But my point here is that there is no requirement that we need to
achieve full spec compliance _with this patchset_.
We are not compliant with the current implementation, and we don't
lose anything if we stay non-compliant after this patch.
 From my POV, CCR is about resetting controllers and aborting commands,
and CQT is about inserting a delay before retrying commands.
And there is no dependency between those two; you can send CCR
without inserting a delay before retrying, and you can insert a
delay before retrying commands without having sent CCR.
So we should have two patchsets here. And of course, the
CQT patchset should be based on the CCR patchset.
But there is no requirement to have both in a single patchset.

>>
>>> diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
>>> index cc88e5a28c8a..0d2a1206e08f 100644
>>> --- a/drivers/nvme/target/core.c
>>> +++ b/drivers/nvme/target/core.c
>>> @@ -1393,6 +1393,10 @@ static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl)
>>>    		return;
>>>    	}
>>>    
>>> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
>>> +		ctrl->ciu = ((u8)(ctrl->ciu + 1)) ? : 1;
>>> +		ctrl->cirn = get_random_u64();
>>> +	}
>>>    	ctrl->csts = NVME_CSTS_RDY;
>>>    
>>>    	/*
>>> @@ -1661,6 +1665,12 @@ struct nvmet_ctrl *nvmet_alloc_ctrl(struct nvmet_alloc_ctrl_args *args)
>>>    	}
>>>    	ctrl->cntlid = ret;
>>>    
>>> +	if (!nvmet_is_disc_subsys(ctrl->subsys)) {
>>> +		ctrl->cqt = subsys->cqt;
>>> +		ctrl->ciu = get_random_u8() ? : 1;
>>> +		ctrl->cirn = get_random_u64();
>>> +	}
>>> +
>>>    	/*
>>>    	 * Discovery controllers may use some arbitrary high value
>>>    	 * in order to cleanup stale discovery sessions
>>> @@ -1853,10 +1863,12 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
>>>    
>>>    	switch (type) {
>>>    	case NVME_NQN_NVME:
>>> +		subsys->cqt = NVMF_CQT_MS;
>>>    		subsys->max_qid = NVMET_NR_QUEUES;
>>>    		break;
>>
>> And I would not set the CQT default here.
>> Thing is, implementing CQT to the letter would inflict a CQT delay
>> during failover for _every_ installation, thereby resulting in a
>> regression to previous implementations where we would fail over
>> with _no_ delay.
>> So again, we should make it a different patchset.
> 
> CQT defaults to 0 to avoid introducing surprise delay. The initiator will
> skip holding requests if it sees CQT set to 0.

Precisely my point. If CQT defaults to zero, no delay will be inserted,
but we _still_ have CCR handling. Just proving that both really are
independent of each other. Hence I would prefer to have two patchsets.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
Re: [PATCH v2 01/14] nvmet: Rapid Path Failure Recovery set controller identify fields
Posted by Sagi Grimberg 1 day, 12 hours ago
> Precisely my point. If CQT defaults to zero no delay will be inserted,
> but we _still_ have CCR handling. Just proving that both really are
> independent on each other. Hence I would prefer to have two patchsets.

Agreed. I think it would be cleaner to review each separately. The CCR
patchset can be based on top of the CQT patchset, for simplicity.