From: Hannes Reinecke <hare@suse.de>
Returning an nvme status from nvme_tcp_setup_ctrl() indicates that the
association was established and we have received a status from the
controller; consequently we should honour the DNR bit. If not, any future
reconnect attempts will just return the same error, so we can
short-circuit the reconnect attempts and fail the connection directly.
Signed-off-by: Hannes Reinecke <hare@suse.de>
[dwagner: add helper to decide to reconnect]
Signed-off-by: Daniel Wagner <dwagner@suse.de>
---
drivers/nvme/host/nvme.h | 24 ++++++++++++++++++++++++
drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
2 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9b8904a476b8..dfe103283a3d 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -701,6 +701,30 @@ static inline bool nvme_is_path_error(u16 status)
return (status & 0x700) == 0x300;
}
+/*
+ * Evaluate the status information returned by the LLDD in order to
+ * decide whether a reconnect attempt should be scheduled.
+ *
+ * There are two cases where no reconnect attempt should be made:
+ *
+ * 1) The LLDD reports a negative status. There was an error (e.g. no
+ * memory) on the host side and thus abort the operation.
+ * Note, there are exceptions such as ENOTCONN which are
+ * not internal driver errors, thus we filter these errors
+ * out and retry later.
+ * 2) The DNR bit is set and the specification states no further
+ * connect attempts with the same set of parameters should be
+ * attempted.
+ */
+static inline bool nvme_ctrl_reconnect(int status)
+{
+ if (status < 0 && status != -ENOTCONN)
+ return false;
+ else if (status > 0 && (status & NVME_SC_DNR))
+ return false;
+ return true;
+}
+
/*
* Fill in the status and result information from the CQE, and then figure out
* if blk-mq will need to use IPI magic to complete the request, and if yes do
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index fdbcdcedcee9..7e25a96e9870 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -2155,9 +2155,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
nvme_tcp_destroy_io_queues(ctrl, remove);
}
-static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
+static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
+ int status)
{
enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
+ bool recon = nvme_ctrl_reconnect(status);
/* If we are resetting/deleting then do nothing */
if (state != NVME_CTRL_CONNECTING) {
@@ -2165,13 +2167,14 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
return;
}
- if (nvmf_should_reconnect(ctrl)) {
+ if (recon && nvmf_should_reconnect(ctrl)) {
dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
ctrl->opts->reconnect_delay);
queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
ctrl->opts->reconnect_delay * HZ);
} else {
- dev_info(ctrl->device, "Removing controller...\n");
+ dev_info(ctrl->device, "Removing controller (%d)...\n",
+ status);
nvme_delete_ctrl(ctrl);
}
}
@@ -2252,10 +2255,12 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
struct nvme_tcp_ctrl, connect_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
+ int ret;
++ctrl->nr_reconnects;
- if (nvme_tcp_setup_ctrl(ctrl, false))
+ ret = nvme_tcp_setup_ctrl(ctrl, false);
+ if (ret)
goto requeue;
dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
@@ -2268,7 +2273,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
requeue:
dev_info(ctrl->device, "Failed reconnect attempt %d\n",
ctrl->nr_reconnects);
- nvme_tcp_reconnect_or_remove(ctrl);
+ nvme_tcp_reconnect_or_remove(ctrl, ret);
}
static void nvme_tcp_error_recovery_work(struct work_struct *work)
@@ -2295,7 +2300,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
return;
}
- nvme_tcp_reconnect_or_remove(ctrl);
+ nvme_tcp_reconnect_or_remove(ctrl, -ENOTCONN);
}
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
@@ -2315,6 +2320,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, reset_work);
+ int ret;
nvme_stop_ctrl(ctrl);
nvme_tcp_teardown_ctrl(ctrl, false);
@@ -2328,14 +2334,15 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
return;
}
- if (nvme_tcp_setup_ctrl(ctrl, false))
+ ret = nvme_tcp_setup_ctrl(ctrl, false);
+ if (ret)
goto out_fail;
return;
out_fail:
++ctrl->nr_reconnects;
- nvme_tcp_reconnect_or_remove(ctrl);
+ nvme_tcp_reconnect_or_remove(ctrl, ret);
}
static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
--
2.44.0
On 09/04/2024 12:35, Daniel Wagner wrote:
> From: Hannes Reinecke <hare@suse.de>
>
> Returning an nvme status from nvme_tcp_setup_ctrl() indicates that the
> association was established and we have received a status from the
> controller; consequently we should honour the DNR bit. If not any future
> reconnect attempts will just return the same error, so we can
> short-circuit the reconnect attempts and fail the connection directly.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> [dwagner: add helper to decide to reconnect]
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
> drivers/nvme/host/nvme.h | 24 ++++++++++++++++++++++++
> drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
> 2 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9b8904a476b8..dfe103283a3d 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -701,6 +701,30 @@ static inline bool nvme_is_path_error(u16 status)
> return (status & 0x700) == 0x300;
> }
>
> +/*
> + * Evaluate the status information returned by the LLDD
We usually say transport, less so LLDD.
> in order to
> + * decided if a reconnect attempt should be scheduled.
> + *
> + * There are two cases where no reconnect attempt should be attempted:
> + *
> + * 1) The LLDD reports an negative status. There was an error (e.g. no
> + * memory) on the host side and thus abort the operation.
> + * Note, there are exception such as ENOTCONN which is
> + * not an internal driver error, thus we filter these errors
> + * out and retry later.
What is this ENOTCONN here? From what I see its just a random
escape value to bypass the status check. I think that 0 would do the same
wouldn't it?
> + * 2) The DNR bit is set and the specification states no further
> + * connect attempts with the same set of paramenters should be
> + * attempted.
> + */
> +static inline bool nvme_ctrl_reconnect(int status)
> +{
> + if (status < 0 && status != -ENOTCONN)
> + return false;
> + else if (status > 0 && (status & NVME_SC_DNR))
> + return false;
> + return true;
> +}
> +
> /*
> * Fill in the status and result information from the CQE, and then figure out
> * if blk-mq will need to use IPI magic to complete the request, and if yes do
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index fdbcdcedcee9..7e25a96e9870 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -2155,9 +2155,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
> nvme_tcp_destroy_io_queues(ctrl, remove);
> }
>
> -static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl,
> + int status)
> {
> enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);
> + bool recon = nvme_ctrl_reconnect(status);
>
> /* If we are resetting/deleting then do nothing */
> if (state != NVME_CTRL_CONNECTING) {
> @@ -2165,13 +2167,14 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
> return;
> }
>
> - if (nvmf_should_reconnect(ctrl)) {
> + if (recon && nvmf_should_reconnect(ctrl)) {
its weird to have a condition that checks _and_ condition
of two different "should reconnect" conditions.
Why don't we simply pass the status to nvmf_should_reconnect()
and evaluate there?
> dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
> ctrl->opts->reconnect_delay);
> queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
> ctrl->opts->reconnect_delay * HZ);
> } else {
> - dev_info(ctrl->device, "Removing controller...\n");
> + dev_info(ctrl->device, "Removing controller (%d)...\n",
> + status);
> nvme_delete_ctrl(ctrl);
> }
> }
> @@ -2252,10 +2255,12 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
> struct nvme_tcp_ctrl, connect_work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
> + int ret;
>
> ++ctrl->nr_reconnects;
>
> - if (nvme_tcp_setup_ctrl(ctrl, false))
> + ret = nvme_tcp_setup_ctrl(ctrl, false);
> + if (ret)
> goto requeue;
>
> dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
> @@ -2268,7 +2273,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> requeue:
> dev_info(ctrl->device, "Failed reconnect attempt %d\n",
> ctrl->nr_reconnects);
> - nvme_tcp_reconnect_or_remove(ctrl);
> + nvme_tcp_reconnect_or_remove(ctrl, ret);
> }
>
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> @@ -2295,7 +2300,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
> return;
> }
>
> - nvme_tcp_reconnect_or_remove(ctrl);
> + nvme_tcp_reconnect_or_remove(ctrl, -ENOTCONN);
> }
>
> static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
> @@ -2315,6 +2320,7 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
> {
> struct nvme_ctrl *ctrl =
> container_of(work, struct nvme_ctrl, reset_work);
> + int ret;
>
> nvme_stop_ctrl(ctrl);
> nvme_tcp_teardown_ctrl(ctrl, false);
> @@ -2328,14 +2334,15 @@ static void nvme_reset_ctrl_work(struct work_struct *work)
> return;
> }
>
> - if (nvme_tcp_setup_ctrl(ctrl, false))
> + ret = nvme_tcp_setup_ctrl(ctrl, false);
> + if (ret)
> goto out_fail;
>
> return;
>
> out_fail:
> ++ctrl->nr_reconnects;
> - nvme_tcp_reconnect_or_remove(ctrl);
> + nvme_tcp_reconnect_or_remove(ctrl, ret);
> }
>
> static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
On Tue, Apr 09, 2024 at 11:40:03PM +0300, Sagi Grimberg wrote:
> > +/*
> > + * Evaluate the status information returned by the LLDD
>
> We usually say transport, less so LLDD.
Updated the documentation accordingly.
> > + * 1) The LLDD reports an negative status. There was an error (e.g. no
> > + * memory) on the host side and thus abort the operation.
> > + * Note, there are exception such as ENOTCONN which is
> > + * not an internal driver error, thus we filter these errors
> > + * out and retry later.
>
> What is this ENOTCONN here? From what I see its just a random
> escape value to bypass the status check. I think that 0 would do the same
> wouldn't it?
ENOTCONN is issued by the reset handler. Sure, 0 will work as well, so
if the consensus is that the reset handler should do that, see below.
> > - if (nvmf_should_reconnect(ctrl)) {
> > + if (recon && nvmf_should_reconnect(ctrl)) {
>
> its weird to have a condition that checks _and_ condition
> of two different "should reconnect" conditions.
>
> Why don't we simply pass the status to nvmf_should_reconnect()
> and evaluate there?
Sure, makes sense.
> > static void nvme_tcp_error_recovery_work(struct work_struct *work)
> > @@ -2295,7 +2300,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
> > return;
> > }
> > - nvme_tcp_reconnect_or_remove(ctrl);
> > + nvme_tcp_reconnect_or_remove(ctrl, -ENOTCONN);
Here is the source of ENOTCONN.
On 09/04/2024 12:35, Daniel Wagner wrote:
> From: Hannes Reinecke <hare@suse.de>
>
> Returning an nvme status from nvme_tcp_setup_ctrl() indicates that the
> association was established and we have received a status from the
> controller; consequently we should honour the DNR bit. If not any future
> reconnect attempts will just return the same error, so we can
> short-circuit the reconnect attempts and fail the connection directly.
>
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> [dwagner: add helper to decide to reconnect]
> Signed-off-by: Daniel Wagner <dwagner@suse.de>
> ---
> drivers/nvme/host/nvme.h | 24 ++++++++++++++++++++++++
> drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
> 2 files changed, 39 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 9b8904a476b8..dfe103283a3d 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -701,6 +701,30 @@ static inline bool nvme_is_path_error(u16 status)
> return (status & 0x700) == 0x300;
> }
>
> +/*
> + * Evaluate the status information returned by the LLDD in order to
> + * decided if a reconnect attempt should be scheduled.
> + *
> + * There are two cases where no reconnect attempt should be attempted:
> + *
> + * 1) The LLDD reports an negative status. There was an error (e.g. no
> + * memory) on the host side and thus abort the operation.
> + * Note, there are exception such as ENOTCONN which is
> + * not an internal driver error, thus we filter these errors
> + * out and retry later.
> + * 2) The DNR bit is set and the specification states no further
> + * connect attempts with the same set of paramenters should be
> + * attempted.
> + */
> +static inline bool nvme_ctrl_reconnect(int status)
> +{
> + if (status < 0 && status != -ENOTCONN)
> + return false;
So if the host failed to allocate a buffer it will never attempt
another reconnect? doesn't sound right to me..,
On 4/9/24 22:20, Sagi Grimberg wrote:
>
>
> On 09/04/2024 12:35, Daniel Wagner wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> Returning an nvme status from nvme_tcp_setup_ctrl() indicates that the
>> association was established and we have received a status from the
>> controller; consequently we should honour the DNR bit. If not any future
>> reconnect attempts will just return the same error, so we can
>> short-circuit the reconnect attempts and fail the connection directly.
>>
>> Signed-off-by: Hannes Reinecke <hare@suse.de>
>> [dwagner: add helper to decide to reconnect]
>> Signed-off-by: Daniel Wagner <dwagner@suse.de>
>> ---
>> drivers/nvme/host/nvme.h | 24 ++++++++++++++++++++++++
>> drivers/nvme/host/tcp.c | 23 +++++++++++++++--------
>> 2 files changed, 39 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
>> index 9b8904a476b8..dfe103283a3d 100644
>> --- a/drivers/nvme/host/nvme.h
>> +++ b/drivers/nvme/host/nvme.h
>> @@ -701,6 +701,30 @@ static inline bool nvme_is_path_error(u16 status)
>> return (status & 0x700) == 0x300;
>> }
>> +/*
>> + * Evaluate the status information returned by the LLDD in order to
>> + * decided if a reconnect attempt should be scheduled.
>> + *
>> + * There are two cases where no reconnect attempt should be attempted:
>> + *
>> + * 1) The LLDD reports an negative status. There was an error (e.g. no
>> + * memory) on the host side and thus abort the operation.
>> + * Note, there are exception such as ENOTCONN which is
>> + * not an internal driver error, thus we filter these errors
>> + * out and retry later.
>> + * 2) The DNR bit is set and the specification states no further
>> + * connect attempts with the same set of paramenters should be
>> + * attempted.
>> + */
>> +static inline bool nvme_ctrl_reconnect(int status)
>> +{
>> + if (status < 0 && status != -ENOTCONN)
>> + return false;
>
> So if the host failed to allocate a buffer it will never attempt
> another reconnect? doesn't sound right to me..,
Memory allocation errors are always tricky. If a system is under memory
pressure yours won't be the only process which will exhibit issues,
so it's questionable whether you should retry, or whether it wouldn't
be safer to just abort the operation, and retry manually once the
system has recovered.
Also we just have the interesting case where RDMA goes haywire and
reports a log buffer size of several TB. No amount of retries will
be fixing that.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare@suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
© 2016 - 2026 Red Hat, Inc.