[PATCH v2] nvme/host: add delayed retries upon non-fatal error during ns validation

Alex Tran posted 1 patch 1 month, 1 week ago
There is a newer version of this series
drivers/nvme/host/core.c | 37 +++++++++++++++++++++++++++++--------
drivers/nvme/host/nvme.h |  6 ++++++
2 files changed, 35 insertions(+), 8 deletions(-)
[PATCH v2] nvme/host: add delayed retries upon non-fatal error during ns validation
Posted by Alex Tran 1 month, 1 week ago
If a non-fatal error is received during nvme namespace validation, it
should not be ignored and the namespace should not be removed immediately.
Rather, delayed retries should be performed on the namespace validation
process.

This handles non-fatal issues more robustly, by retrying a few times before
giving up and removing the namespace. The number of retries is set
to 3 and the interval between retries is set to 3 seconds.

The retries are handled locally. Upon success then end. Upon fatal error
then remove the namespace before ending. Upon non-fatal error, retry
until the max retry amount is hit before ending.

Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
---
Changes in v2:
- Simplify retry logic with local loop instead of delayed work.
---
 drivers/nvme/host/core.c | 37 +++++++++++++++++++++++++++++--------
 drivers/nvme/host/nvme.h |  6 ++++++
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7bf228df6001f1f4d0b3c570de285a5eb17bb08e..6cd1ce4a60af55e5673e5fd0ec2053a028fae4f2 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4289,23 +4289,44 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
 static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
 {
 	int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
+	unsigned int i;
 
 	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
-		dev_err(ns->ctrl->device,
-			"identifiers changed for nsid %d\n", ns->head->ns_id);
+		dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n",
+			ns->head->ns_id);
 		goto out;
 	}
 
-	ret = nvme_update_ns_info(ns, info);
+	for (i = 0; i <= NVME_NS_VALIDATION_MAX_RETRIES; i++) {
+		ret = nvme_update_ns_info(ns, info);
+		if (ret == 0)
+			return;
+
+		if (ret > 0 && (ret & NVME_STATUS_DNR))
+			goto out;
+
+		if (i == NVME_NS_VALIDATION_MAX_RETRIES) {
+			dev_err(ns->ctrl->device,
+				"validation failed for nsid %d after %d retries\n",
+				ns->head->ns_id,
+				NVME_NS_VALIDATION_MAX_RETRIES);
+			return;
+		}
+
+		dev_warn(ns->ctrl->device,
+			 "validation failed for nsid %d, retry %d/%d in %dms\n",
+			 ns->head->ns_id, i + 1, NVME_NS_VALIDATION_MAX_RETRIES,
+			 NVME_NS_VALIDATION_RETRY_INTERVAL);
+
+		msleep(NVME_NS_VALIDATION_RETRY_INTERVAL);
+	}
+
 out:
 	/*
 	 * Only remove the namespace if we got a fatal error back from the
-	 * device, otherwise ignore the error and just move on.
-	 *
-	 * TODO: we should probably schedule a delayed retry here.
+	 * device.
 	 */
-	if (ret > 0 && (ret & NVME_STATUS_DNR))
-		nvme_ns_remove(ns);
+	nvme_ns_remove(ns);
 }
 
 static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a5f28c5103c5c42777bd9309a983ef0196c1b95..dcbdc7fa8af0cb838b3f1a774d4c67fa69a00050 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -46,6 +46,12 @@ extern unsigned int admin_timeout;
 #define NVME_CTRL_PAGE_SHIFT	12
 #define NVME_CTRL_PAGE_SIZE	(1 << NVME_CTRL_PAGE_SHIFT)
 
+/*
+ * Default to 3 retries in intervals of 3000ms for namespace validation
+ */
+#define NVME_NS_VALIDATION_MAX_RETRIES 3
+#define NVME_NS_VALIDATION_RETRY_INTERVAL 3000
+
 extern struct workqueue_struct *nvme_wq;
 extern struct workqueue_struct *nvme_reset_wq;
 extern struct workqueue_struct *nvme_delete_wq;

---
base-commit: fa084c35afa13ab07a860ef0936cd987f9aa0460
change-id: 20251225-nvme_ns_validation-310a07aa8b2b

Best regards,
-- 
Alex Tran <alex.t.tran@gmail.com>
Re: [PATCH v2] nvme/host: add delayed retries upon non-fatal error during ns validation
Posted by Sagi Grimberg 1 month, 1 week ago

On 26/12/2025 19:54, Alex Tran wrote:
> If a non-fatal error is received during nvme namespace validation, it
> should not be ignored and the namespace should not be removed immediately.
> Rather, delayed retries should be performed on the namespace validation
> process.
>
> This handles non-fatal issues more robustly, by retrying a few times before
> giving up and removing the namespace. The number of retries is set
> to 3 and the interval between retries is set to 3 seconds.
>
> The retries are handled locally. Upon success then end. Upon fatal error
> then remove the namespace before ending. Upon non-fatal error, retry
> until the max retry amount is hit before ending.
>
> Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
> ---
> Changes in v2:
> - Simplify retry logic with local loop instead of delayed work.
> ---
>   drivers/nvme/host/core.c | 37 +++++++++++++++++++++++++++++--------
>   drivers/nvme/host/nvme.h |  6 ++++++
>   2 files changed, 35 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7bf228df6001f1f4d0b3c570de285a5eb17bb08e..6cd1ce4a60af55e5673e5fd0ec2053a028fae4f2 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -4289,23 +4289,44 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
>   static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
>   {
>   	int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
> +	unsigned int i;
>   
>   	if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
> -		dev_err(ns->ctrl->device,
> -			"identifiers changed for nsid %d\n", ns->head->ns_id);
> +		dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n",
> +			ns->head->ns_id);
>   		goto out;
>   	}
>   
> -	ret = nvme_update_ns_info(ns, info);
> +	for (i = 0; i <= NVME_NS_VALIDATION_MAX_RETRIES; i++) {
> +		ret = nvme_update_ns_info(ns, info);
> +		if (ret == 0)
> +			return;
> +
> +		if (ret > 0 && (ret & NVME_STATUS_DNR))
> +			goto out;
> +
> +		if (i == NVME_NS_VALIDATION_MAX_RETRIES) {
> +			dev_err(ns->ctrl->device,
> +				"validation failed for nsid %d after %d retries\n",
> +				ns->head->ns_id,
> +				NVME_NS_VALIDATION_MAX_RETRIES);
> +			return;
> +		}

This is really redundant. just make the loop (i < 
NVME_NS_VALIDATION_MAX_RETRIES)
and print in the end outside the loop.
BTW, don't you need to remove the ns here?

> +
> +		dev_warn(ns->ctrl->device,
> +			 "validation failed for nsid %d, retry %d/%d in %dms\n",
> +			 ns->head->ns_id, i + 1, NVME_NS_VALIDATION_MAX_RETRIES,
> +			 NVME_NS_VALIDATION_RETRY_INTERVAL);

This can safely be debug print.

> +
> +		msleep(NVME_NS_VALIDATION_RETRY_INTERVAL);
> +	}
> +
>   out:
>   	/*
>   	 * Only remove the namespace if we got a fatal error back from the
> -	 * device, otherwise ignore the error and just move on.
> -	 *
> -	 * TODO: we should probably schedule a delayed retry here.
> +	 * device.
>   	 */
> -	if (ret > 0 && (ret & NVME_STATUS_DNR))
> -		nvme_ns_remove(ns);
> +	nvme_ns_remove(ns);
>   }
>   
>   static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9a5f28c5103c5c42777bd9309a983ef0196c1b95..dcbdc7fa8af0cb838b3f1a774d4c67fa69a00050 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -46,6 +46,12 @@ extern unsigned int admin_timeout;
>   #define NVME_CTRL_PAGE_SHIFT	12
>   #define NVME_CTRL_PAGE_SIZE	(1 << NVME_CTRL_PAGE_SHIFT)
>   
> +/*
> + * Default to 3 retries in intervals of 3000ms for namespace validation
> + */
> +#define NVME_NS_VALIDATION_MAX_RETRIES 3
> +#define NVME_NS_VALIDATION_RETRY_INTERVAL 3000
> +
>   extern struct workqueue_struct *nvme_wq;
>   extern struct workqueue_struct *nvme_reset_wq;
>   extern struct workqueue_struct *nvme_delete_wq;
>
> ---
> base-commit: fa084c35afa13ab07a860ef0936cd987f9aa0460
> change-id: 20251225-nvme_ns_validation-310a07aa8b2b
>
> Best regards,
Re: [PATCH v2] nvme/host: add delayed retries upon non-fatal error during ns validation
Posted by Alex Tran 1 month, 1 week ago
On Sat, Dec 27, 2025 at 2:42 AM Sagi Grimberg <sagi@grimberg.me> wrote:
>
>
>
> On 26/12/2025 19:54, Alex Tran wrote:
> > If a non-fatal error is received during nvme namespace validation, it
> > should not be ignored and the namespace should not be removed immediately.
> > Rather, delayed retries should be performed on the namespace validation
> > process.
> >
> > This handles non-fatal issues more robustly, by retrying a few times before
> > giving up and removing the namespace. The number of retries is set
> > to 3 and the interval between retries is set to 3 seconds.
> >
> > The retries are handled locally. Upon success then end. Upon fatal error
> > then remove the namespace before ending. Upon non-fatal error, retry
> > until the max retry amount is hit before ending.
> >
> > Signed-off-by: Alex Tran <alex.t.tran@gmail.com>
> > ---
> > Changes in v2:
> > - Simplify retry logic with local loop instead of delayed work.
> > ---
> >   drivers/nvme/host/core.c | 37 +++++++++++++++++++++++++++++--------
> >   drivers/nvme/host/nvme.h |  6 ++++++
> >   2 files changed, 35 insertions(+), 8 deletions(-)
> >
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index 7bf228df6001f1f4d0b3c570de285a5eb17bb08e..6cd1ce4a60af55e5673e5fd0ec2053a028fae4f2 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -4289,23 +4289,44 @@ static void nvme_ns_remove_by_nsid(struct nvme_ctrl *ctrl, u32 nsid)
> >   static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
> >   {
> >       int ret = NVME_SC_INVALID_NS | NVME_STATUS_DNR;
> > +     unsigned int i;
> >
> >       if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
> > -             dev_err(ns->ctrl->device,
> > -                     "identifiers changed for nsid %d\n", ns->head->ns_id);
> > +             dev_err(ns->ctrl->device, "identifiers changed for nsid %d\n",
> > +                     ns->head->ns_id);
> >               goto out;
> >       }
> >
> > -     ret = nvme_update_ns_info(ns, info);
> > +     for (i = 0; i <= NVME_NS_VALIDATION_MAX_RETRIES; i++) {
> > +             ret = nvme_update_ns_info(ns, info);
> > +             if (ret == 0)
> > +                     return;
> > +
> > +             if (ret > 0 && (ret & NVME_STATUS_DNR))
> > +                     goto out;
> > +
> > +             if (i == NVME_NS_VALIDATION_MAX_RETRIES) {
> > +                     dev_err(ns->ctrl->device,
> > +                             "validation failed for nsid %d after %d retries\n",
> > +                             ns->head->ns_id,
> > +                             NVME_NS_VALIDATION_MAX_RETRIES);
> > +                     return;
> > +             }
>
> This is really redundant. just make the loop (i <
> NVME_NS_VALIDATION_MAX_RETRIES)
> and print in the end outside the loop.
> BTW, don't you need to remove the ns here?
>
In the original, a non-fatal error was just ignored without removing
the ns, so I assumed the same behavior, even when still receiving a
non-fatal error after the max retries. Unless it is preferred to
remove the namespace after failing max retry times?