drivers/scsi/libsas/sas_discover.c | 32 +++++++++++++++++++----------- drivers/scsi/libsas/sas_expander.c | 8 ++++++++ drivers/scsi/libsas/sas_internal.h | 6 +++++- 3 files changed, 33 insertions(+), 13 deletions(-)
We found that it is judged as broadcast flutter when the exp-attached end
device reconnects after probe failed, as follows:
[78779.654026] sas: broadcast received: 0
[78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10
[78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed
[78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated BROADCAST(CHANGE)
[78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached
[78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached: 500e004aaaaaaa05 (stp)
[78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found
[78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
[78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
...
[78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
[78835.171344] sas: sas_probe_sata: for exp-attached device 500e004aaaaaaa05 returned -19
[78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone
[78835.187487] sas: broadcast received: 0
[78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10
[78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed
[78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated BROADCAST(CHANGE)
[78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05
[78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached: 500e004aaaaaaa05 (stp)
[78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter
[78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
The cause of the problem is that the related ex_phy's attached_sas_addr was
not cleared after the end device probe failed. In order to solve the above
problem, a function sas_ex_unregister_end_dev() is defined to clear the
ex_phy information and unregister the end device after the exp-attached end
device probe failed.
As devices may probe failed after done REVALIDATING DOMAIN when call
sas_probe_devices(). Then after its port is added to the sas_port_del_list,
the port will not be deleted until the end of the next REVALIDATING DOMAIN
and sas_destruct_ports() is called. A warning about creating a duplicate
port will occur in the new REVALIDATING DOMAIN when the end device
reconnects. Therefore, the previous destroy_list and sas_port_del_list
should be handled after devices probe failed.
Signed-off-by: Xingui Yang <yangxingui@huawei.com>
---
Changes since v2:
- Add a helper for calling sas_destruct_devices() and sas_destruct_ports(),
and put the new call at the end of sas_probe_devices() based on John's
suggestion.
Changes since v1:
- Simplify the process of getting ex_phy id based on Jason's suggestion.
- Update commit information.
---
drivers/scsi/libsas/sas_discover.c | 32 +++++++++++++++++++-----------
drivers/scsi/libsas/sas_expander.c | 8 ++++++++
drivers/scsi/libsas/sas_internal.h | 6 +++++-
3 files changed, 33 insertions(+), 13 deletions(-)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index 8fb7c41c0962..8c517e47d2b9 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -17,6 +17,22 @@
#include <scsi/sas_ata.h>
#include "scsi_sas_internal.h"
+static void sas_destruct_ports(struct asd_sas_port *port)
+{
+ struct sas_port *sas_port, *p;
+
+ list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list, del_list) {
+ list_del_init(&sas_port->del_list);
+ sas_port_delete(sas_port);
+ }
+}
+
+static void sas_destruct_devices_and_ports(struct asd_sas_port *port)
+{
+ sas_destruct_devices(port);
+ sas_destruct_ports(port);
+}
+
/* ---------- Basic task processing for discovery purposes ---------- */
void sas_init_dev(struct domain_device *dev)
@@ -226,6 +242,9 @@ static void sas_probe_devices(struct asd_sas_port *port)
else
list_del_init(&dev->disco_list_node);
}
+
+ /* destruct devices and ports after probe failed */
+ sas_destruct_devices_and_ports(port);
}
static void sas_suspend_devices(struct work_struct *work)
@@ -350,16 +369,6 @@ void sas_destruct_devices(struct asd_sas_port *port)
}
}
-static void sas_destruct_ports(struct asd_sas_port *port)
-{
- struct sas_port *sas_port, *p;
-
- list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list, del_list) {
- list_del_init(&sas_port->del_list);
- sas_port_delete(sas_port);
- }
-}
-
static bool sas_abort_cmd(struct request *req, void *data)
{
struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
@@ -538,8 +547,7 @@ static void sas_revalidate_domain(struct work_struct *work)
out:
mutex_unlock(&ha->disco_mutex);
- sas_destruct_devices(port);
- sas_destruct_ports(port);
+ sas_destruct_devices_and_ports(port);
sas_probe_devices(port);
}
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 4e6bb3d0f163..09be69ea09a2 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1878,6 +1878,14 @@ static void sas_unregister_devs_sas_addr(struct domain_device *parent,
}
}
+void sas_ex_unregister_end_dev(struct domain_device *dev)
+{
+ struct domain_device *parent = dev->parent;
+ struct sas_phy *phy = dev->phy;
+
+ sas_unregister_devs_sas_addr(parent, phy->number, true);
+}
+
static int sas_discover_bfs_by_root_level(struct domain_device *root,
const int level)
{
diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h
index 85948963fb97..caeda847c919 100644
--- a/drivers/scsi/libsas/sas_internal.h
+++ b/drivers/scsi/libsas/sas_internal.h
@@ -50,6 +50,7 @@ void sas_discover_event(struct asd_sas_port *port, enum discover_event ev);
void sas_init_dev(struct domain_device *dev);
void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev);
+void sas_ex_unregister_end_dev(struct domain_device *dev);
void sas_scsi_recover_host(struct Scsi_Host *shost);
@@ -145,7 +146,10 @@ static inline void sas_fail_probe(struct domain_device *dev, const char *func, i
func, dev->parent ? "exp-attached" :
"direct-attached",
SAS_ADDR(dev->sas_addr), err);
- sas_unregister_dev(dev->port, dev);
+ if (dev->parent && !dev_is_expander(dev->dev_type))
+ sas_ex_unregister_end_dev(dev);
+ else
+ sas_unregister_dev(dev->port, dev);
}
static inline void sas_fill_in_rphy(struct domain_device *dev,
--
2.17.1
On 13/06/2024 13:23, Xingui Yang wrote:
Sorry for delay in responding and asking further questions.
> We found that it is judged as broadcast flutter when the exp-attached end
> device reconnects after probe failed, as follows:
>
> [78779.654026] sas: broadcast received: 0
> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10
> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed
> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated BROADCAST(CHANGE)
> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached
> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached: 500e004aaaaaaa05 (stp)
> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found
> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
> ...
> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0 tries: 1
> [78835.171344] sas: sas_probe_sata: for exp-attached device 500e004aaaaaaa05 returned -19
> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone
> [78835.187487] sas: broadcast received: 0
> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10
> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed
> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated BROADCAST(CHANGE)
> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05
> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached: 500e004aaaaaaa05 (stp)
> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter
> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
>
> The cause of the problem is that the related ex_phy's attached_sas_addr was
> not cleared after the end device probe failed. In order to solve the above
> problem, a function sas_ex_unregister_end_dev() is defined to clear the
> ex_phy information and unregister the end device after the exp-attached end
> device probe failed.
Can you just manually clear the ex_phy's attached_sas_addr at the
appropiate point (along with calling sas_unregister_dev())? It seems
that we are using heavy-handed approach in calling
sas_unregister_devs_sas_addr(), which does the clearing and much more.
>
> As devices may probe failed after done REVALIDATING DOMAIN when call
> sas_probe_devices(). Then after its port is added to the sas_port_del_list,
> the port will not be deleted until the end of the next REVALIDATING DOMAIN
> and sas_destruct_ports() is called. A warning about creating a duplicate
> port will occur in the new REVALIDATING DOMAIN when the end device
> reconnects. Therefore, the previous destroy_list and sas_port_del_list
> should be handled after devices probe failed.
>
> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
> ---
> Changes since v2:
> - Add a helper for calling sas_destruct_devices() and sas_destruct_ports(),
> and put the new call at the end of sas_probe_devices() based on John's
> suggestion.
>
> Changes since v1:
> - Simplify the process of getting ex_phy id based on Jason's suggestion.
> - Update commit information.
> ---
> drivers/scsi/libsas/sas_discover.c | 32 +++++++++++++++++++-----------
> drivers/scsi/libsas/sas_expander.c | 8 ++++++++
> drivers/scsi/libsas/sas_internal.h | 6 +++++-
> 3 files changed, 33 insertions(+), 13 deletions(-)
>
> diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
> index 8fb7c41c0962..8c517e47d2b9 100644
> --- a/drivers/scsi/libsas/sas_discover.c
> +++ b/drivers/scsi/libsas/sas_discover.c
> @@ -17,6 +17,22 @@
> #include <scsi/sas_ata.h>
> #include "scsi_sas_internal.h"
>
> +static void sas_destruct_ports(struct asd_sas_port *port)
> +{
> + struct sas_port *sas_port, *p;
> +
> + list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list, del_list) {
> + list_del_init(&sas_port->del_list);
> + sas_port_delete(sas_port);
> + }
> +}
> +
> +static void sas_destruct_devices_and_ports(struct asd_sas_port *port)
"and" in a function name never sounds right.
Can you just call sas_destruct_port(), as it takes a port arg? Maybe
rename sas_destruct_ports() to sas_delete_ports(), as it does "delete" -
this may avoid some confusion in names.
> +{
> + sas_destruct_devices(port);
> + sas_destruct_ports(port);
> +}
> +
> /* ---------- Basic task processing for discovery purposes ---------- */
>
> void sas_init_dev(struct domain_device *dev)
> @@ -226,6 +242,9 @@ static void sas_probe_devices(struct asd_sas_port *port)
> else
> list_del_init(&dev->disco_list_node);
> }
> +
> + /* destruct devices and ports after probe failed */
> + sas_destruct_devices_and_ports(port);
> }
>
> static void sas_suspend_devices(struct work_struct *work)
> @@ -350,16 +369,6 @@ void sas_destruct_devices(struct asd_sas_port *port)
> }
> }
>
> -static void sas_destruct_ports(struct asd_sas_port *port)
> -{
> - struct sas_port *sas_port, *p;
> -
> - list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list, del_list) {
> - list_del_init(&sas_port->del_list);
> - sas_port_delete(sas_port);
> - }
> -}
> -
> static bool sas_abort_cmd(struct request *req, void *data)
> {
> struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req);
> @@ -538,8 +547,7 @@ static void sas_revalidate_domain(struct work_struct *work)
> out:
> mutex_unlock(&ha->disco_mutex);
>
> - sas_destruct_devices(port);
> - sas_destruct_ports(port);
> + sas_destruct_devices_and_ports(port);
> sas_probe_devices(port);
> }
>
> diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
> index 4e6bb3d0f163..09be69ea09a2 100644
> --- a/drivers/scsi/libsas/sas_expander.c
> +++ b/drivers/scsi/libsas/sas_expander.c
> @@ -1878,6 +1878,14 @@ static void sas_unregister_devs_sas_addr(struct domain_device *parent,
> }
> }
>
> +void sas_ex_unregister_end_dev(struct domain_device *dev)
> +{
> + struct domain_device *parent = dev->parent;
> + struct sas_phy *phy = dev->phy;
> +
> + sas_unregister_devs_sas_addr(parent, phy->number, true);
> +}
> +
> static int sas_discover_bfs_by_root_level(struct domain_device *root,
> const int level)
> {
> diff --git a/drivers/scsi/libsas/sas_internal.h b/drivers/scsi/libsas/sas_internal.h
> index 85948963fb97..caeda847c919 100644
> --- a/drivers/scsi/libsas/sas_internal.h
> +++ b/drivers/scsi/libsas/sas_internal.h
> @@ -50,6 +50,7 @@ void sas_discover_event(struct asd_sas_port *port, enum discover_event ev);
>
> void sas_init_dev(struct domain_device *dev);
> void sas_unregister_dev(struct asd_sas_port *port, struct domain_device *dev);
> +void sas_ex_unregister_end_dev(struct domain_device *dev);
>
> void sas_scsi_recover_host(struct Scsi_Host *shost);
>
> @@ -145,7 +146,10 @@ static inline void sas_fail_probe(struct domain_device *dev, const char *func, i
> func, dev->parent ? "exp-attached" :
> "direct-attached",
> SAS_ADDR(dev->sas_addr), err);
> - sas_unregister_dev(dev->port, dev);
> + if (dev->parent && !dev_is_expander(dev->dev_type))
> + sas_ex_unregister_end_dev(dev);
> + else
> + sas_unregister_dev(dev->port, dev);
> }
>
> static inline void sas_fill_in_rphy(struct domain_device *dev,
Hi, John,
Thanks for your reply.
On 2024/6/18 16:55, John Garry wrote:
> On 13/06/2024 13:23, Xingui Yang wrote:
>
> Sorry for delay in responding and asking further questions.
It doesn't matter.
>
>> We found that it is judged as broadcast flutter when the exp-attached end
>> device reconnects after probe failed, as follows:
>>
>> [78779.654026] sas: broadcast received: 0
>> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10
>> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed
>> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated
>> BROADCAST(CHANGE)
>> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached
>> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached:
>> 500e004aaaaaaa05 (stp)
>> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found
>> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
>> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
>> ...
>> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0
>> tries: 1
>> [78835.171344] sas: sas_probe_sata: for exp-attached device
>> 500e004aaaaaaa05 returned -19
>> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone
>> [78835.187487] sas: broadcast received: 0
>> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10
>> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed
>> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated
>> BROADCAST(CHANGE)
>> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05
>> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached:
>> 500e004aaaaaaa05 (stp)
>> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter
>> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
>>
>> The cause of the problem is that the related ex_phy's
>> attached_sas_addr was
>> not cleared after the end device probe failed. In order to solve the
>> above
>> problem, a function sas_ex_unregister_end_dev() is defined to clear the
>> ex_phy information and unregister the end device after the
>> exp-attached end
>> device probe failed.
>
> Can you just manually clear the ex_phy's attached_sas_addr at the
> appropiate point (along with calling sas_unregister_dev())? It seems
> that we are using heavy-handed approach in calling
> sas_unregister_devs_sas_addr(), which does the clearing and much more.
I just tried it and it worked. If we only clear ex_phy's
attached_sas_addr, there is no need to call sas_destruct_ports(). We are
currently using sas_unregister_devs_sas_addr() which will add the port
to sas_port_del_list, so we need to call sas_destruct_ports() separately
to delete the port.
Should we also delete the port after the devices probe failed?
Maybe I can update another version and only clear ex_phy's
attached_sas_addr based on your suggestions.
>
>>
>> As devices may probe failed after done REVALIDATING DOMAIN when call
>> sas_probe_devices(). Then after its port is added to the
>> sas_port_del_list,
>> the port will not be deleted until the end of the next REVALIDATING
>> DOMAIN
>> and sas_destruct_ports() is called. A warning about creating a duplicate
>> port will occur in the new REVALIDATING DOMAIN when the end device
>> reconnects. Therefore, the previous destroy_list and sas_port_del_list
>> should be handled after devices probe failed.
>>
>> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
>> ---
>> Changes since v2:
>> - Add a helper for calling sas_destruct_devices() and
>> sas_destruct_ports(),
>> and put the new call at the end of sas_probe_devices() based on John's
>> suggestion.
>>
>> Changes since v1:
>> - Simplify the process of getting ex_phy id based on Jason's suggestion.
>> - Update commit information.
>> ---
>> drivers/scsi/libsas/sas_discover.c | 32 +++++++++++++++++++-----------
>> drivers/scsi/libsas/sas_expander.c | 8 ++++++++
>> drivers/scsi/libsas/sas_internal.h | 6 +++++-
>> 3 files changed, 33 insertions(+), 13 deletions(-)
>>
>> diff --git a/drivers/scsi/libsas/sas_discover.c
>> b/drivers/scsi/libsas/sas_discover.c
>> index 8fb7c41c0962..8c517e47d2b9 100644
>> --- a/drivers/scsi/libsas/sas_discover.c
>> +++ b/drivers/scsi/libsas/sas_discover.c
>> @@ -17,6 +17,22 @@
>> #include <scsi/sas_ata.h>
>> #include "scsi_sas_internal.h"
>> +static void sas_destruct_ports(struct asd_sas_port *port)
>> +{
>> + struct sas_port *sas_port, *p;
>> +
>> + list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list,
>> del_list) {
>> + list_del_init(&sas_port->del_list);
>> + sas_port_delete(sas_port);
>> + }
>> +}
>> +
>> +static void sas_destruct_devices_and_ports(struct asd_sas_port *port)
>
> "and" in a function name never sounds right.
>
> Can you just call sas_destruct_port(), as it takes a port arg? Maybe
> rename sas_destruct_ports() to sas_delete_ports(), as it does "delete" -
> this may avoid some confusion in names.
As described above, if we only clear ex_phy's attached_sas_addr, we do
not need to call sas_destruct_ports().
Thanks,
Xingui
On 18/06/2024 12:45, yangxingui wrote:
> Hi, John,
>
> Thanks for your reply.
>
> On 2024/6/18 16:55, John Garry wrote:
>> On 13/06/2024 13:23, Xingui Yang wrote:
>>
>> Sorry for delay in responding and asking further questions.
> It doesn't matter.
>>
>>> We found that it is judged as broadcast flutter when the exp-attached
>>> end
>>> device reconnects after probe failed, as follows:
>>>
>>> [78779.654026] sas: broadcast received: 0
>>> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10
>>> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed
>>> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated
>>> BROADCAST(CHANGE)
>>> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached
>>> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached:
>>> 500e004aaaaaaa05 (stp)
>>> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found
>>> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
>>> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0
>>> ...
>>> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: 0
>>> tries: 1
>>> [78835.171344] sas: sas_probe_sata: for exp-attached device
>>> 500e004aaaaaaa05 returned -19
>>> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone
>>> [78835.187487] sas: broadcast received: 0
>>> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10
>>> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed
>>> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated
>>> BROADCAST(CHANGE)
>>> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05
>>> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached:
>>> 500e004aaaaaaa05 (stp)
>>> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter
>>> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0
>>>
>>> The cause of the problem is that the related ex_phy's
>>> attached_sas_addr was
>>> not cleared after the end device probe failed. In order to solve the
>>> above
>>> problem, a function sas_ex_unregister_end_dev() is defined to clear the
>>> ex_phy information and unregister the end device after the
>>> exp-attached end
>>> device probe failed.
>>
>> Can you just manually clear the ex_phy's attached_sas_addr at the
>> appropiate point (along with calling sas_unregister_dev())? It seems
>> that we are using heavy-handed approach in calling
>> sas_unregister_devs_sas_addr(), which does the clearing and much more.
>
> I just tried it and it worked. If we only clear ex_phy's
> attached_sas_addr, there is no need to call sas_destruct_ports(). We are
> currently using sas_unregister_devs_sas_addr() which will add the port
> to sas_port_del_list, so we need to call sas_destruct_ports() separately
> to delete the port.
>
> Should we also delete the port after the devices probe failed?
I'm not sure. Please check it.
sas_fail_probe() would still call sas_unregister_dev(), as required.
And you said that the sas_fail_probe() probe call would be asynchronous
to sas_revalidate_domainin(). I actually expected you to have the new
call to sas_destruct_ports() at the top of sas_revalidate_domainin(),
like v2, but it is in sas_probe_devices().
Anyway, please check whether you require this additional call to delete
the port.
>
> Maybe I can update another version and only clear ex_phy's
> attached_sas_addr based on your suggestions.
>>
>>>
>>> As devices may probe failed after done REVALIDATING DOMAIN when call
>>> sas_probe_devices(). Then after its port is added to the
>>> sas_port_del_list,
>>> the port will not be deleted until the end of the next REVALIDATING
>>> DOMAIN
>>> and sas_destruct_ports() is called. A warning about creating a duplicate
>>> port will occur in the new REVALIDATING DOMAIN when the end device
>>> reconnects. Therefore, the previous destroy_list and sas_port_del_list
>>> should be handled after devices probe failed.
>>>
>>> Signed-off-by: Xingui Yang <yangxingui@huawei.com>
>>> ---
>>> Changes since v2:
>>> - Add a helper for calling sas_destruct_devices() and
>>> sas_destruct_ports(),
>>> and put the new call at the end of sas_probe_devices() based on
>>> John's
>>> suggestion.
>>>
>>> Changes since v1:
>>> - Simplify the process of getting ex_phy id based on Jason's suggestion.
>>> - Update commit information.
>>> ---
>>> drivers/scsi/libsas/sas_discover.c | 32 +++++++++++++++++++-----------
>>> drivers/scsi/libsas/sas_expander.c | 8 ++++++++
>>> drivers/scsi/libsas/sas_internal.h | 6 +++++-
>>> 3 files changed, 33 insertions(+), 13 deletions(-)
>>>
>>> diff --git a/drivers/scsi/libsas/sas_discover.c
>>> b/drivers/scsi/libsas/sas_discover.c
>>> index 8fb7c41c0962..8c517e47d2b9 100644
>>> --- a/drivers/scsi/libsas/sas_discover.c
>>> +++ b/drivers/scsi/libsas/sas_discover.c
>>> @@ -17,6 +17,22 @@
>>> #include <scsi/sas_ata.h>
>>> #include "scsi_sas_internal.h"
>>> +static void sas_destruct_ports(struct asd_sas_port *port)
>>> +{
>>> + struct sas_port *sas_port, *p;
>>> +
>>> + list_for_each_entry_safe(sas_port, p, &port->sas_port_del_list,
>>> del_list) {
>>> + list_del_init(&sas_port->del_list);
>>> + sas_port_delete(sas_port);
>>> + }
>>> +}
>>> +
>>> +static void sas_destruct_devices_and_ports(struct asd_sas_port *port)
>>
>> "and" in a function name never sounds right.
>>
>> Can you just call sas_destruct_port(), as it takes a port arg? Maybe
>> rename sas_destruct_ports() to sas_delete_ports(), as it does "delete"
>> - this may avoid some confusion in names.
> As described above, if we only clear ex_phy's attached_sas_addr, we do
> not need to call sas_destruct_ports().
Thanks,
John
Hi John, On 2024/6/18 20:08, John Garry wrote: > On 18/06/2024 12:45, yangxingui wrote: >> Hi, John, >> >> Thanks for your reply. >> >> On 2024/6/18 16:55, John Garry wrote: >>> On 13/06/2024 13:23, Xingui Yang wrote: >>> >>> Sorry for delay in responding and asking further questions. >> It doesn't matter. >>> >>>> We found that it is judged as broadcast flutter when the >>>> exp-attached end >>>> device reconnects after probe failed, as follows: >>>> >>>> [78779.654026] sas: broadcast received: 0 >>>> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed >>>> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated >>>> BROADCAST(CHANGE) >>>> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached >>>> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached: >>>> 500e004aaaaaaa05 (stp) >>>> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found >>>> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0 >>>> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0 >>>> ... >>>> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: >>>> 0 tries: 1 >>>> [78835.171344] sas: sas_probe_sata: for exp-attached device >>>> 500e004aaaaaaa05 returned -19 >>>> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone >>>> [78835.187487] sas: broadcast received: 0 >>>> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed >>>> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated >>>> BROADCAST(CHANGE) >>>> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05 >>>> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached: >>>> 500e004aaaaaaa05 (stp) >>>> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter >>>> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res 0x0 >>>> >>>> The cause of the problem is that the related ex_phy's >>>> attached_sas_addr was >>>> not cleared after the end device probe failed. In order to solve the >>>> above >>>> problem, a function sas_ex_unregister_end_dev() is defined to clear the >>>> ex_phy information and unregister the end device after the >>>> exp-attached end >>>> device probe failed. >>> >>> Can you just manually clear the ex_phy's attached_sas_addr at the >>> appropiate point (along with calling sas_unregister_dev())? It seems >>> that we are using heavy-handed approach in calling >>> sas_unregister_devs_sas_addr(), which does the clearing and much more. >> >> I just tried it and it worked. If we only clear ex_phy's >> attached_sas_addr, there is no need to call sas_destruct_ports(). We >> are currently using sas_unregister_devs_sas_addr() which will add the >> port to sas_port_del_list, so we need to call sas_destruct_ports() >> separately to delete the port. >> >> Should we also delete the port after the devices probe failed? > > I'm not sure. Please check it. > > sas_fail_probe() would still call sas_unregister_dev(), as required. > > And you said that the sas_fail_probe() probe call would be asynchronous > to sas_revalidate_domainin(). I actually expected you to have the new > call to sas_destruct_ports() at the top of sas_revalidate_domainin(), > like v2, but it is in sas_probe_devices(). > > Anyway, please check whether you require this additional call to delete > the port. > Sorry, there was something wrong with the previous process description. the correct is: 1. REVALIDATING DOMAIN 2. new device attached, create port,etc. 4. done REVALIDATING DOMAIN 5. @out, handle parent->port->sas_port_del_list 6. sas_probe_devices() 7. if device probe failed in step 6 and call sas_unregister_devs_sas_addr(), then add phy->port->list to parent->port->sas_port_del_list // port won't delete 8. next, REVALIDATING DOMAIN 9. new device attached 10. new port create failed, as port already exits. So, v3 delete port at then end of sas_probe_devices(). And if we don't use sas_unregister_devs_sas_addr() follow your suggestion then we don't need to call sas_destruct_ports(). Thanks, Xingui
On 18/06/2024 14:10, yangxingui wrote: >>>> >>>>> We found that it is judged as broadcast flutter when the >>>>> exp-attached end >>>>> device reconnects after probe failed, as follows: >>>>> >>>>> [78779.654026] sas: broadcast received: 0 >>>>> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>>> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has changed >>>>> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated >>>>> BROADCAST(CHANGE) >>>>> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached >>>>> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached: >>>>> 500e004aaaaaaa05 (stp) >>>>> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found >>>>> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, res >>>>> 0x0 >>>>> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0 >>>>> ... >>>>> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 failed: >>>>> 0 tries: 1 >>>>> [78835.171344] sas: sas_probe_sata: for exp-attached device >>>>> 500e004aaaaaaa05 returned -19 >>>>> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone >>>>> [78835.187487] sas: broadcast received: 0 >>>>> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>>> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has changed >>>>> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated >>>>> BROADCAST(CHANGE) >>>>> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05 >>>>> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached: >>>>> 500e004aaaaaaa05 (stp) >>>>> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter >>>>> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, res >>>>> 0x0 >>>>> >>>>> The cause of the problem is that the related ex_phy's >>>>> attached_sas_addr was >>>>> not cleared after the end device probe failed. In order to solve >>>>> the above >>>>> problem, a function sas_ex_unregister_end_dev() is defined to clear >>>>> the >>>>> ex_phy information and unregister the end device after the >>>>> exp-attached end >>>>> device probe failed. >>>> >>>> Can you just manually clear the ex_phy's attached_sas_addr at the >>>> appropiate point (along with calling sas_unregister_dev())? It seems >>>> that we are using heavy-handed approach in calling >>>> sas_unregister_devs_sas_addr(), which does the clearing and much more. >>> >>> I just tried it and it worked. If we only clear ex_phy's >>> attached_sas_addr, there is no need to call sas_destruct_ports(). We >>> are currently using sas_unregister_devs_sas_addr() which will add the >>> port to sas_port_del_list, so we need to call sas_destruct_ports() >>> separately to delete the port. >>> >>> Should we also delete the port after the devices probe failed? >> >> I'm not sure. Please check it. >> >> sas_fail_probe() would still call sas_unregister_dev(), as required. >> >> And you said that the sas_fail_probe() probe call would be >> asynchronous to sas_revalidate_domainin(). I actually expected you to >> have the new call to sas_destruct_ports() at the top of >> sas_revalidate_domainin(), like v2, but it is in sas_probe_devices(). >> >> Anyway, please check whether you require this additional call to >> delete the port. >> > Sorry, there was something wrong with the previous process description. > the correct is: > > 1. REVALIDATING DOMAIN > 2. new device attached, create port,etc. > 4. done REVALIDATING DOMAIN > 5. @out, handle parent->port->sas_port_del_list > 6. sas_probe_devices() > 7. if device probe failed in step 6 and call > sas_unregister_devs_sas_addr(), then add phy->port->list to > parent->port->sas_port_del_list // port won't delete > > 8. next, REVALIDATING DOMAIN > 9. new device attached > 10. new port create failed, as port already exits. > > > So, v3 delete port at then end of sas_probe_devices(). And if we don't > use sas_unregister_devs_sas_addr() follow your suggestion then we don't > need to call sas_destruct_ports(). I am finding it hard to follow you now. Can you show the complete change which you think that we now require to fix this issue?
Hi, John On 2024/6/18 23:21, John Garry wrote: > On 18/06/2024 14:10, yangxingui wrote: >>>>> >>>>>> We found that it is judged as broadcast flutter when the >>>>>> exp-attached end >>>>>> device reconnects after probe failed, as follows: >>>>>> >>>>>> [78779.654026] sas: broadcast received: 0 >>>>>> [78779.654037] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>>>> [78779.654680] sas: ex 500e004aaaaaaa1f phy05 change count has >>>>>> changed >>>>>> [78779.662977] sas: ex 500e004aaaaaaa1f phy05 originated >>>>>> BROADCAST(CHANGE) >>>>>> [78779.662986] sas: ex 500e004aaaaaaa1f phy05 new device attached >>>>>> [78779.663079] sas: ex 500e004aaaaaaa1f phy05:U:8 attached: >>>>>> 500e004aaaaaaa05 (stp) >>>>>> [78779.693542] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] found >>>>>> [78779.701155] sas: done REVALIDATING DOMAIN on port 0, pid:10, >>>>>> res 0x0 >>>>>> [78779.707864] sas: Enter sas_scsi_recover_host busy: 0 failed: 0 >>>>>> ... >>>>>> [78835.161307] sas: --- Exit sas_scsi_recover_host: busy: 0 >>>>>> failed: 0 tries: 1 >>>>>> [78835.171344] sas: sas_probe_sata: for exp-attached device >>>>>> 500e004aaaaaaa05 returned -19 >>>>>> [78835.180879] hisi_sas_v3_hw 0000:b4:02.0: dev[16:5] is gone >>>>>> [78835.187487] sas: broadcast received: 0 >>>>>> [78835.187504] sas: REVALIDATING DOMAIN on port 0, pid:10 >>>>>> [78835.188263] sas: ex 500e004aaaaaaa1f phy05 change count has >>>>>> changed >>>>>> [78835.195870] sas: ex 500e004aaaaaaa1f phy05 originated >>>>>> BROADCAST(CHANGE) >>>>>> [78835.195875] sas: ex 500e004aaaaaaa1f rediscovering phy05 >>>>>> [78835.196022] sas: ex 500e004aaaaaaa1f phy05:U:A attached: >>>>>> 500e004aaaaaaa05 (stp) >>>>>> [78835.196026] sas: ex 500e004aaaaaaa1f phy05 broadcast flutter >>>>>> [78835.197615] sas: done REVALIDATING DOMAIN on port 0, pid:10, >>>>>> res 0x0 >>>>>> >>>>>> The cause of the problem is that the related ex_phy's >>>>>> attached_sas_addr was >>>>>> not cleared after the end device probe failed. In order to solve >>>>>> the above >>>>>> problem, a function sas_ex_unregister_end_dev() is defined to >>>>>> clear the >>>>>> ex_phy information and unregister the end device after the >>>>>> exp-attached end >>>>>> device probe failed. >>>>> >>>>> Can you just manually clear the ex_phy's attached_sas_addr at the >>>>> appropiate point (along with calling sas_unregister_dev())? It >>>>> seems that we are using heavy-handed approach in calling >>>>> sas_unregister_devs_sas_addr(), which does the clearing and much more. >>>> >>>> I just tried it and it worked. If we only clear ex_phy's >>>> attached_sas_addr, there is no need to call sas_destruct_ports(). We >>>> are currently using sas_unregister_devs_sas_addr() which will add >>>> the port to sas_port_del_list, so we need to call >>>> sas_destruct_ports() separately to delete the port. >>>> >>>> Should we also delete the port after the devices probe failed? >>> >>> I'm not sure. Please check it. >>> >>> sas_fail_probe() would still call sas_unregister_dev(), as required. >>> >>> And you said that the sas_fail_probe() probe call would be >>> asynchronous to sas_revalidate_domainin(). I actually expected you to >>> have the new call to sas_destruct_ports() at the top of >>> sas_revalidate_domainin(), like v2, but it is in sas_probe_devices(). >>> >>> Anyway, please check whether you require this additional call to >>> delete the port. >>> >> Sorry, there was something wrong with the previous process description. >> the correct is: >> >> 1. REVALIDATING DOMAIN >> 2. new device attached, create port,etc. >> 4. done REVALIDATING DOMAIN >> 5. @out, handle parent->port->sas_port_del_list >> 6. sas_probe_devices() >> 7. if device probe failed in step 6 and call >> sas_unregister_devs_sas_addr(), then add phy->port->list to >> parent->port->sas_port_del_list // port won't delete >> >> 8. next, REVALIDATING DOMAIN >> 9. new device attached >> 10. new port create failed, as port already exits. >> >> >> So, v3 delete port at then end of sas_probe_devices(). And if we don't >> use sas_unregister_devs_sas_addr() follow your suggestion then we >> don't need to call sas_destruct_ports(). > > I am finding it hard to follow you now. I'm sorry for that. ^-^ > > Can you show the complete change which you think that we now require to > fix this issue? > Okay, I'll update a new version. Thanks, Xingui
© 2016 - 2026 Red Hat, Inc.