drivers/scsi/libsas/sas_discover.c | 14 +++++++++++++ drivers/scsi/libsas/sas_expander.c | 32 +++++++++++++++++++++++------- drivers/scsi/libsas/sas_port.c | 10 ++++++++++ 3 files changed, 49 insertions(+), 7 deletions(-)
Multiple functions in libsas were accessing port->dev_list without
proper locking, leading to potential race conditions that could cause:
- Use-after-free when devices are removed during list traversal
- List corruption from concurrent modifications
- System crashes from accessing freed memory
This patch adds proper dev_list_lock protection to the following functions:
1. sas_ex_level_discovery(): Added locking around list traversal with
safe iteration and reference counting for devices. The lock is
released before calling functions that may sleep
(sas_ex_discover_devices).
2. sas_dev_present_in_domain(): Added locking for read-only list access
to prevent reading inconsistent list state.
3. sas_suspend_devices(): Added locking around list traversal to prevent
concurrent modifications during device suspension.
4. sas_unregister_domain_devices(): Added proper locking with reference
counting. The lock is released before calling sas_unregister_dev()
which may sleep, but device references are held to prevent premature
removal.
5. sas_port_event_worker(): Added locking around list traversal with
reference counting for devices accessed outside the lock.
All modifications follow the pattern of:
- Hold dev_list_lock during list traversal
- Use list_for_each_entry_safe where list may be modified
- Take device reference (kref_get) before releasing lock
- Release lock before calling functions that may sleep
- Release device reference (sas_put_device) after use
Signed-off-by: Chaohai Chen <wdhh6@aliyun.com>
---
drivers/scsi/libsas/sas_discover.c | 14 +++++++++++++
drivers/scsi/libsas/sas_expander.c | 32 +++++++++++++++++++++++-------
drivers/scsi/libsas/sas_port.c | 10 ++++++++++
3 files changed, 49 insertions(+), 7 deletions(-)
diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
index b07062db50b2..3c18fdfde8c2 100644
--- a/drivers/scsi/libsas/sas_discover.c
+++ b/drivers/scsi/libsas/sas_discover.c
@@ -245,8 +245,10 @@ static void sas_suspend_devices(struct work_struct *work)
* suspension, we force the issue here to keep the reference
* counts aligned
*/
+ spin_lock_irq(&port->dev_list_lock);
list_for_each_entry(dev, &port->dev_list, dev_list_node)
sas_notify_lldd_dev_gone(dev);
+ spin_unlock_irq(&port->dev_list_lock);
/* we are suspending, so we know events are disabled and
* phy_list is not being mutated
@@ -410,11 +412,23 @@ void sas_unregister_domain_devices(struct asd_sas_port *port, bool gone)
{
struct domain_device *dev, *n;
+ /* Lock while iterating to prevent concurrent modifications.
+ * We need to unlock before calling sas_unregister_dev() as it
+ * may sleep, but we hold a reference to prevent device removal.
+ */
+ spin_lock_irq(&port->dev_list_lock);
list_for_each_entry_safe_reverse(dev, n, &port->dev_list, dev_list_node) {
if (gone)
set_bit(SAS_DEV_GONE, &dev->state);
+ kref_get(&dev->kref);
+ spin_unlock_irq(&port->dev_list_lock);
+
sas_unregister_dev(port, dev);
+ sas_put_device(dev);
+
+ spin_lock_irq(&port->dev_list_lock);
}
+ spin_unlock_irq(&port->dev_list_lock);
list_for_each_entry_safe(dev, n, &port->disco_list, disco_list_node)
sas_unregister_dev(port, dev);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index d953225f6cc2..c82c9b3d5103 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -643,14 +643,21 @@ static int sas_dev_present_in_domain(struct asd_sas_port *port,
u8 *sas_addr)
{
struct domain_device *dev;
+ int found = 0;
if (SAS_ADDR(port->sas_addr) == SAS_ADDR(sas_addr))
return 1;
+
+ spin_lock_irq(&port->dev_list_lock);
list_for_each_entry(dev, &port->dev_list, dev_list_node) {
- if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr))
- return 1;
+ if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr)) {
+ found = 1;
+ break;
+ }
}
- return 0;
+ spin_unlock_irq(&port->dev_list_lock);
+
+ return found;
}
#define RPEL_REQ_SIZE 16
@@ -1579,20 +1586,31 @@ static int sas_discover_expander(struct domain_device *dev)
static int sas_ex_level_discovery(struct asd_sas_port *port, const int level)
{
int res = 0;
- struct domain_device *dev;
+ struct domain_device *dev, *n;
- list_for_each_entry(dev, &port->dev_list, dev_list_node) {
+ spin_lock_irq(&port->dev_list_lock);
+ list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
if (dev_is_expander(dev->dev_type)) {
struct sas_expander_device *ex =
rphy_to_expander_device(dev->rphy);
- if (level == ex->level)
+ if (level == ex->level) {
+ kref_get(&dev->kref);
+ spin_unlock_irq(&port->dev_list_lock);
res = sas_ex_discover_devices(dev, -1);
- else if (level > 0)
+ sas_put_device(dev);
+ spin_lock_irq(&port->dev_list_lock);
+ } else if (level > 0) {
+ kref_get(&port->port_dev->kref);
+ spin_unlock_irq(&port->dev_list_lock);
res = sas_ex_discover_devices(port->port_dev, -1);
+ sas_put_device(port->port_dev);
+ spin_lock_irq(&port->dev_list_lock);
+ }
}
}
+ spin_unlock_irq(&port->dev_list_lock);
return res;
}
diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c
index de7556070048..491c9f7104c6 100644
--- a/drivers/scsi/libsas/sas_port.c
+++ b/drivers/scsi/libsas/sas_port.c
@@ -44,13 +44,19 @@ static void sas_resume_port(struct asd_sas_phy *phy)
* 1/ presume every device came back
* 2/ force the next revalidation to check all expander phys
*/
+ spin_lock_irq(&port->dev_list_lock);
list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
int i, rc;
+ kref_get(&dev->kref);
+ spin_unlock_irq(&port->dev_list_lock);
+
rc = sas_notify_lldd_dev_found(dev);
if (rc) {
sas_unregister_dev(port, dev);
sas_destruct_devices(port);
+ sas_put_device(dev);
+ spin_lock_irq(&port->dev_list_lock);
continue;
}
@@ -62,7 +68,11 @@ static void sas_resume_port(struct asd_sas_phy *phy)
phy->phy_change_count = -1;
}
}
+
+ sas_put_device(dev);
+ spin_lock_irq(&port->dev_list_lock);
}
+ spin_unlock_irq(&port->dev_list_lock);
sas_discover_event(port, DISCE_RESUME);
}
--
2.43.7
Hi, 在 2026/1/29 17:38, Chaohai Chen 写道: > Multiple functions in libsas were accessing port->dev_list without > proper locking, leading to potential race conditions that could cause: > - Use-after-free when devices are removed during list traversal > - List corruption from concurrent modifications > - System crashes from accessing freed memory libsas events are processed in orderd workqueue. Do you have a crash log? Thanks, 祝一切顺利
On Fri, Jan 30, 2026 at 04:27:11PM +0800, Jason Yan wrote: > Hi, > > 在 2026/1/29 17:38, Chaohai Chen 写道: > > Multiple functions in libsas were accessing port->dev_list without > > proper locking, leading to potential race conditions that could cause: > > - Use-after-free when devices are removed during list traversal > > - List corruption from concurrent modifications > > - System crashes from accessing freed memory > > libsas events are processed in orderd workqueue. Do you have a crash log? No crash log. I noticed the missing locks while watching the code. But there are tow queues, event_q and disco_q which may cause conflicts. And I think the dev_ist_lock is designed to prevent conflicts. > > Thanks, > 祝一切顺利 -- Chaohai Chen
On 1/31/26 19:47, Chaohai Chen wrote: > On Fri, Jan 30, 2026 at 04:27:11PM +0800, Jason Yan wrote: >> Hi, >> >> 在 2026/1/29 17:38, Chaohai Chen 写道: >>> Multiple functions in libsas were accessing port->dev_list without >>> proper locking, leading to potential race conditions that could cause: >>> - Use-after-free when devices are removed during list traversal >>> - List corruption from concurrent modifications >>> - System crashes from accessing freed memory >> >> libsas events are processed in orderd workqueue. Do you have a crash log? > No crash log. I noticed the missing locks while watching the code. But > there are tow queues, event_q and disco_q which may cause conflicts. > And I think the dev_ist_lock is designed to prevent conflicts. I am not so sure this is true in general. E.g. when sas_suspend_devices() is called form a disco_q work context, events are disabled, so there cannot be changes to a port dev_list. If you have no crash log, please provide a more detailed analysis of the potential problems. Given that no one has reported an issue in this area (that I know of), if there is a problem, it is definitely not obvious and thus requires detailed explanation to be understood. -- Damien Le Moal Western Digital Research
On 1/29/26 18:38, Chaohai Chen wrote:
> Multiple functions in libsas were accessing port->dev_list without
> proper locking, leading to potential race conditions that could cause:
> - Use-after-free when devices are removed during list traversal
> - List corruption from concurrent modifications
> - System crashes from accessing freed memory
There si a lot going on in this patch with reference counting changes that are
not very clear. Ideally, this patch should be split to address reference
counting and exclusive access to the list with a spinlock.
More comments below.
>
> This patch adds proper dev_list_lock protection to the following functions:
>
> 1. sas_ex_level_discovery(): Added locking around list traversal with
> safe iteration and reference counting for devices. The lock is
> released before calling functions that may sleep
> (sas_ex_discover_devices).
>
> 2. sas_dev_present_in_domain(): Added locking for read-only list access
> to prevent reading inconsistent list state.
>
> 3. sas_suspend_devices(): Added locking around list traversal to prevent
> concurrent modifications during device suspension.
>
> 4. sas_unregister_domain_devices(): Added proper locking with reference
> counting. The lock is released before calling sas_unregister_dev()
> which may sleep, but device references are held to prevent premature
> removal.
>
> 5. sas_port_event_worker(): Added locking around list traversal with
> reference counting for devices accessed outside the lock.
>
> All modifications follow the pattern of:
> - Hold dev_list_lock during list traversal
> - Use list_for_each_entry_safe where list may be modified
> - Take device reference (kref_get) before releasing lock
> - Release lock before calling functions that may sleep
> - Release device reference (sas_put_device) after use
>
> Signed-off-by: Chaohai Chen <wdhh6@aliyun.com>
> ---
> drivers/scsi/libsas/sas_discover.c | 14 +++++++++++++
> drivers/scsi/libsas/sas_expander.c | 32 +++++++++++++++++++++++-------
> drivers/scsi/libsas/sas_port.c | 10 ++++++++++
> 3 files changed, 49 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
> index b07062db50b2..3c18fdfde8c2 100644
> --- a/drivers/scsi/libsas/sas_discover.c
> +++ b/drivers/scsi/libsas/sas_discover.c
> @@ -245,8 +245,10 @@ static void sas_suspend_devices(struct work_struct *work)
> * suspension, we force the issue here to keep the reference
> * counts aligned
> */
> + spin_lock_irq(&port->dev_list_lock);
> list_for_each_entry(dev, &port->dev_list, dev_list_node)
> sas_notify_lldd_dev_gone(dev);
> + spin_unlock_irq(&port->dev_list_lock);
>
> /* we are suspending, so we know events are disabled and
> * phy_list is not being mutated
> @@ -410,11 +412,23 @@ void sas_unregister_domain_devices(struct asd_sas_port *port, bool gone)
> {
> struct domain_device *dev, *n;
>
> + /* Lock while iterating to prevent concurrent modifications.
Please use the correct kernel style: multi-line comments start with a "/*" line
with no text.
> + * We need to unlock before calling sas_unregister_dev() as it
> + * may sleep, but we hold a reference to prevent device removal.
And why is that necessary ?
> + */
> + spin_lock_irq(&port->dev_list_lock);
> list_for_each_entry_safe_reverse(dev, n, &port->dev_list, dev_list_node) {
> if (gone)
> set_bit(SAS_DEV_GONE, &dev->state);
> + kref_get(&dev->kref);
> + spin_unlock_irq(&port->dev_list_lock);
> +
> sas_unregister_dev(port, dev);
> + sas_put_device(dev);
> +
> + spin_lock_irq(&port->dev_list_lock);
> }
> + spin_unlock_irq(&port->dev_list_lock);
>
> list_for_each_entry_safe(dev, n, &port->disco_list, disco_list_node)
> sas_unregister_dev(port, dev);
> diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
> index d953225f6cc2..c82c9b3d5103 100644
> --- a/drivers/scsi/libsas/sas_expander.c
> +++ b/drivers/scsi/libsas/sas_expander.c
> @@ -643,14 +643,21 @@ static int sas_dev_present_in_domain(struct asd_sas_port *port,
> u8 *sas_addr)
> {
> struct domain_device *dev;
> + int found = 0;
Use a bool please. That menas changing the function to return a bool as well.
>
> if (SAS_ADDR(port->sas_addr) == SAS_ADDR(sas_addr))
> return 1;
> +
> + spin_lock_irq(&port->dev_list_lock);
> list_for_each_entry(dev, &port->dev_list, dev_list_node) {
> - if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr))
> - return 1;
> + if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr)) {
> + found = 1;
> + break;
> + }
> }
> - return 0;
> + spin_unlock_irq(&port->dev_list_lock);
> +
> + return found;
> }
>
> #define RPEL_REQ_SIZE 16
> @@ -1579,20 +1586,31 @@ static int sas_discover_expander(struct domain_device *dev)
> static int sas_ex_level_discovery(struct asd_sas_port *port, const int level)
> {
> int res = 0;
> - struct domain_device *dev;
> + struct domain_device *dev, *n;
>
> - list_for_each_entry(dev, &port->dev_list, dev_list_node) {
> + spin_lock_irq(&port->dev_list_lock);
> + list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
> if (dev_is_expander(dev->dev_type)) {
> struct sas_expander_device *ex =
> rphy_to_expander_device(dev->rphy);
>
> - if (level == ex->level)
> + if (level == ex->level) {
> + kref_get(&dev->kref);
> + spin_unlock_irq(&port->dev_list_lock);
> res = sas_ex_discover_devices(dev, -1);
> - else if (level > 0)
> + sas_put_device(dev);
> + spin_lock_irq(&port->dev_list_lock);
This was locked already before the loop. So you will deadlock here... no ?
> + } else if (level > 0) {
> + kref_get(&port->port_dev->kref);
> + spin_unlock_irq(&port->dev_list_lock);
> res = sas_ex_discover_devices(port->port_dev, -1);
> + sas_put_device(port->port_dev);
> + spin_lock_irq(&port->dev_list_lock);
> + }
>
> }
> }
> + spin_unlock_irq(&port->dev_list_lock);
>
> return res;
> }
> diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c
> index de7556070048..491c9f7104c6 100644
> --- a/drivers/scsi/libsas/sas_port.c
> +++ b/drivers/scsi/libsas/sas_port.c
> @@ -44,13 +44,19 @@ static void sas_resume_port(struct asd_sas_phy *phy)
> * 1/ presume every device came back
> * 2/ force the next revalidation to check all expander phys
> */
> + spin_lock_irq(&port->dev_list_lock);
> list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
> int i, rc;
>
> + kref_get(&dev->kref);
> + spin_unlock_irq(&port->dev_list_lock);
> +
> rc = sas_notify_lldd_dev_found(dev);
> if (rc) {
> sas_unregister_dev(port, dev);
> sas_destruct_devices(port);
> + sas_put_device(dev);
Same here. Why is the extra reference count needed ?
> + spin_lock_irq(&port->dev_list_lock);
> continue;
> }
>
> @@ -62,7 +68,11 @@ static void sas_resume_port(struct asd_sas_phy *phy)
> phy->phy_change_count = -1;
> }
> }
> +
> + sas_put_device(dev);
> + spin_lock_irq(&port->dev_list_lock);
> }
> + spin_unlock_irq(&port->dev_list_lock);
>
> sas_discover_event(port, DISCE_RESUME);
> }
--
Damien Le Moal
Western Digital Research
On Fri, Jan 30, 2026 at 12:02:01PM +0900, Damien Le Moal wrote:
> On 1/29/26 18:38, Chaohai Chen wrote:
> > Multiple functions in libsas were accessing port->dev_list without
> > proper locking, leading to potential race conditions that could cause:
> > - Use-after-free when devices are removed during list traversal
> > - List corruption from concurrent modifications
> > - System crashes from accessing freed memory
>
> There si a lot going on in this patch with reference counting changes that are
> not very clear. Ideally, this patch should be split to address reference
> counting and exclusive access to the list with a spinlock.
>
> More comments below.
>
> >
> > This patch adds proper dev_list_lock protection to the following functions:
> >
> > 1. sas_ex_level_discovery(): Added locking around list traversal with
> > safe iteration and reference counting for devices. The lock is
> > released before calling functions that may sleep
> > (sas_ex_discover_devices).
> >
> > 2. sas_dev_present_in_domain(): Added locking for read-only list access
> > to prevent reading inconsistent list state.
> >
> > 3. sas_suspend_devices(): Added locking around list traversal to prevent
> > concurrent modifications during device suspension.
> >
> > 4. sas_unregister_domain_devices(): Added proper locking with reference
> > counting. The lock is released before calling sas_unregister_dev()
> > which may sleep, but device references are held to prevent premature
> > removal.
> >
> > 5. sas_port_event_worker(): Added locking around list traversal with
> > reference counting for devices accessed outside the lock.
> >
> > All modifications follow the pattern of:
> > - Hold dev_list_lock during list traversal
> > - Use list_for_each_entry_safe where list may be modified
> > - Take device reference (kref_get) before releasing lock
> > - Release lock before calling functions that may sleep
> > - Release device reference (sas_put_device) after use
> >
> > Signed-off-by: Chaohai Chen <wdhh6@aliyun.com>
> > ---
> > drivers/scsi/libsas/sas_discover.c | 14 +++++++++++++
> > drivers/scsi/libsas/sas_expander.c | 32 +++++++++++++++++++++++-------
> > drivers/scsi/libsas/sas_port.c | 10 ++++++++++
> > 3 files changed, 49 insertions(+), 7 deletions(-)
> >
> > diff --git a/drivers/scsi/libsas/sas_discover.c b/drivers/scsi/libsas/sas_discover.c
> > index b07062db50b2..3c18fdfde8c2 100644
> > --- a/drivers/scsi/libsas/sas_discover.c
> > +++ b/drivers/scsi/libsas/sas_discover.c
> > @@ -245,8 +245,10 @@ static void sas_suspend_devices(struct work_struct *work)
> > * suspension, we force the issue here to keep the reference
> > * counts aligned
> > */
> > + spin_lock_irq(&port->dev_list_lock);
> > list_for_each_entry(dev, &port->dev_list, dev_list_node)
> > sas_notify_lldd_dev_gone(dev);
> > + spin_unlock_irq(&port->dev_list_lock);
> >
> > /* we are suspending, so we know events are disabled and
> > * phy_list is not being mutated
> > @@ -410,11 +412,23 @@ void sas_unregister_domain_devices(struct asd_sas_port *port, bool gone)
> > {
> > struct domain_device *dev, *n;
> >
> > + /* Lock while iterating to prevent concurrent modifications.
>
> Please use the correct kernel style: multi-line comments start with a "/*" line
> with no text.
>
> > + * We need to unlock before calling sas_unregister_dev() as it
> > + * may sleep, but we hold a reference to prevent device removal.
>
> And why is that necessary ?
>
Because when unlocked, it is possible that the device has already been
released by another thread. If there is no reference count, it will lead
to used after free.
> > + */
> > + spin_lock_irq(&port->dev_list_lock);
> > list_for_each_entry_safe_reverse(dev, n, &port->dev_list, dev_list_node) {
> > if (gone)
> > set_bit(SAS_DEV_GONE, &dev->state);
> > + kref_get(&dev->kref);
> > + spin_unlock_irq(&port->dev_list_lock);
> > +
> > sas_unregister_dev(port, dev);
> > + sas_put_device(dev);
> > +
> > + spin_lock_irq(&port->dev_list_lock);
> > }
> > + spin_unlock_irq(&port->dev_list_lock);
> >
> > list_for_each_entry_safe(dev, n, &port->disco_list, disco_list_node)
> > sas_unregister_dev(port, dev);
> > diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
> > index d953225f6cc2..c82c9b3d5103 100644
> > --- a/drivers/scsi/libsas/sas_expander.c
> > +++ b/drivers/scsi/libsas/sas_expander.c
> > @@ -643,14 +643,21 @@ static int sas_dev_present_in_domain(struct asd_sas_port *port,
> > u8 *sas_addr)
> > {
> > struct domain_device *dev;
> > + int found = 0;
>
> Use a bool please. That menas changing the function to return a bool as well.
>
> >
> > if (SAS_ADDR(port->sas_addr) == SAS_ADDR(sas_addr))
> > return 1;
> > +
> > + spin_lock_irq(&port->dev_list_lock);
> > list_for_each_entry(dev, &port->dev_list, dev_list_node) {
> > - if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr))
> > - return 1;
> > + if (SAS_ADDR(dev->sas_addr) == SAS_ADDR(sas_addr)) {
> > + found = 1;
> > + break;
> > + }
> > }
> > - return 0;
> > + spin_unlock_irq(&port->dev_list_lock);
> > +
> > + return found;
> > }
> >
> > #define RPEL_REQ_SIZE 16
> > @@ -1579,20 +1586,31 @@ static int sas_discover_expander(struct domain_device *dev)
> > static int sas_ex_level_discovery(struct asd_sas_port *port, const int level)
> > {
> > int res = 0;
> > - struct domain_device *dev;
> > + struct domain_device *dev, *n;
> >
> > - list_for_each_entry(dev, &port->dev_list, dev_list_node) {
> > + spin_lock_irq(&port->dev_list_lock);
> > + list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
> > if (dev_is_expander(dev->dev_type)) {
> > struct sas_expander_device *ex =
> > rphy_to_expander_device(dev->rphy);
> >
> > - if (level == ex->level)
> > + if (level == ex->level) {
> > + kref_get(&dev->kref);
> > + spin_unlock_irq(&port->dev_list_lock);
This will be unlocked here, so there will be no deadlock below.
> > res = sas_ex_discover_devices(dev, -1);
> > - else if (level > 0)
> > + sas_put_device(dev);
> > + spin_lock_irq(&port->dev_list_lock);
>
> This was locked already before the loop. So you will deadlock here... no ?
>
No deadlock here.
> > + } else if (level > 0) {
> > + kref_get(&port->port_dev->kref);
> > + spin_unlock_irq(&port->dev_list_lock);
> > res = sas_ex_discover_devices(port->port_dev, -1);
> > + sas_put_device(port->port_dev);
> > + spin_lock_irq(&port->dev_list_lock);
> > + }
> >
> > }
> > }
> > + spin_unlock_irq(&port->dev_list_lock);
> >
> > return res;
> > }
> > diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c
> > index de7556070048..491c9f7104c6 100644
> > --- a/drivers/scsi/libsas/sas_port.c
> > +++ b/drivers/scsi/libsas/sas_port.c
> > @@ -44,13 +44,19 @@ static void sas_resume_port(struct asd_sas_phy *phy)
> > * 1/ presume every device came back
> > * 2/ force the next revalidation to check all expander phys
> > */
> > + spin_lock_irq(&port->dev_list_lock);
> > list_for_each_entry_safe(dev, n, &port->dev_list, dev_list_node) {
> > int i, rc;
> >
> > + kref_get(&dev->kref);
> > + spin_unlock_irq(&port->dev_list_lock);
> > +
> > rc = sas_notify_lldd_dev_found(dev);
> > if (rc) {
> > sas_unregister_dev(port, dev);
> > sas_destruct_devices(port);
> > + sas_put_device(dev);
>
> Same here. Why is the extra reference count needed ?
>
The use of reference counting is to unlock the logic for device discovery
and exception handling which may involve scheduling. There cannot be
scheduling inside the spin lock.
> > + spin_lock_irq(&port->dev_list_lock);
> > continue;
> > }
> >
> > @@ -62,7 +68,11 @@ static void sas_resume_port(struct asd_sas_phy *phy)
> > phy->phy_change_count = -1;
> > }
> > }
> > +
> > + sas_put_device(dev);
> > + spin_lock_irq(&port->dev_list_lock);
> > }
> > + spin_unlock_irq(&port->dev_list_lock);
> >
> > sas_discover_event(port, DISCE_RESUME);
> > }
>
>
> --
> Damien Le Moal
> Western Digital Research
--
Chaohai Chen
On 1/31/26 20:19, Chaohai Chen wrote:
>>> + * We need to unlock before calling sas_unregister_dev() as it
>>> + * may sleep, but we hold a reference to prevent device removal.
>>
>> And why is that necessary ?
>>
> Because when unlocked, it is possible that the device has already been
> released by another thread. If there is no reference count, it will lead
> to used after free.
Please clearly explain the problem path. Your statements about "another thread"
is too vague.
>>> + */
>>> + spin_lock_irq(&port->dev_list_lock);
>>> list_for_each_entry_safe_reverse(dev, n, &port->dev_list, dev_list_node) {
>>> if (gone)
>>> set_bit(SAS_DEV_GONE, &dev->state);
>>> + kref_get(&dev->kref);
>>> + spin_unlock_irq(&port->dev_list_lock);
>>> +
>>> sas_unregister_dev(port, dev);
>>> + sas_put_device(dev);
>>> +
>>> + spin_lock_irq(&port->dev_list_lock);
>>> }
>>> + spin_unlock_irq(&port->dev_list_lock);
--
Damien Le Moal
Western Digital Research
On Mon, Feb 02, 2026 at 10:21:43AM +0900, Damien Le Moal wrote:
> On 1/31/26 20:19, Chaohai Chen wrote:
> >>> + * We need to unlock before calling sas_unregister_dev() as it
> >>> + * may sleep, but we hold a reference to prevent device removal.
> >>
> >> And why is that necessary ?
> >>
> > Because when unlocked, it is possible that the device has already been
> > released by another thread. If there is no reference count, it will lead
> > to used after free.
>
> Please clearly explain the problem path. Your statements about "another thread"
> is too vague.
>
1.
CPU 1: disco_q CPU 2: event_q
============================== ==============================
sas_discover_domain() sas_phye_loss_of_signal()
sas_ex_level_discovery() sas_deform_port(phy, true)
list_for_each_entry(dev, sas_unregister_domain_devices()
&port->dev_list, ...)
NOP list_for_each_entry_safe_reverse(
dev, n, &port->dev_list, ...)
NOP sas_unregister_dev(port, dev)
NOP kfree(dev)
if (dev_is_expander(dev->dev_type))(UAF)
...
2.
CPU 1: disco_q CPU 2: event_q
============================== ==============================
sas_resume_devices() sas_porte_link_reset_err()
sas_resume_port() sas_deform_port(phy, true)
list_for_each_entry_safe(dev, sas_unregister_domain_devices()
&port->dev_list, ...)
NOP free dev
visit dev->ex_dev(UAF)
在 2026/2/2 15:36, Chaohai Chen 写道: > On Mon, Feb 02, 2026 at 10:21:43AM +0900, Damien Le Moal wrote: >> On 1/31/26 20:19, Chaohai Chen wrote: >>>>> + * We need to unlock before calling sas_unregister_dev() as it >>>>> + * may sleep, but we hold a reference to prevent device removal. >>>> >>>> And why is that necessary ? >>>> >>> Because when unlocked, it is possible that the device has already been >>> released by another thread. If there is no reference count, it will lead >>> to used after free. >> >> Please clearly explain the problem path. Your statements about "another thread" >> is too vague. >> > 1. > CPU 1: disco_q CPU 2: event_q > ============================== ============================== > sas_discover_domain() sas_phye_loss_of_signal() > > sas_ex_level_discovery() sas_deform_port(phy, true) > > list_for_each_entry(dev, sas_unregister_domain_devices() > &port->dev_list, ...) > > NOP list_for_each_entry_safe_reverse( > dev, n, &port->dev_list, ...) > > NOP sas_unregister_dev(port, dev) > > NOP kfree(dev) > > if (dev_is_expander(dev->dev_type))(UAF) > ... DISCE_DISCOVER_DOMAIN(sas_discover_domain) event is queued and executed in sas_form_port() synchronously, and sas_form_port() is executed in event_q. So I don't think the race condition above will happen. > > 2. > CPU 1: disco_q CPU 2: event_q > ============================== ============================== > sas_resume_devices() sas_porte_link_reset_err() > > sas_resume_port() sas_deform_port(phy, true) > > list_for_each_entry_safe(dev, sas_unregister_domain_devices() > &port->dev_list, ...) > > NOP free dev > > visit dev->ex_dev(UAF) This may happen in theory. Thanks, 祝一切顺利
On 2/2/26 16:36, Chaohai Chen wrote: > On Mon, Feb 02, 2026 at 10:21:43AM +0900, Damien Le Moal wrote: >> On 1/31/26 20:19, Chaohai Chen wrote: >>>>> + * We need to unlock before calling sas_unregister_dev() as it >>>>> + * may sleep, but we hold a reference to prevent device removal. >>>> >>>> And why is that necessary ? >>>> >>> Because when unlocked, it is possible that the device has already been >>> released by another thread. If there is no reference count, it will lead >>> to used after free. >> >> Please clearly explain the problem path. Your statements about "another thread" >> is too vague. >> > 1. > CPU 1: disco_q CPU 2: event_q > ============================== ============================== > sas_discover_domain() sas_phye_loss_of_signal() > > sas_ex_level_discovery() sas_deform_port(phy, true) > > list_for_each_entry(dev, sas_unregister_domain_devices() > &port->dev_list, ...) > > NOP list_for_each_entry_safe_reverse( > dev, n, &port->dev_list, ...) > > NOP sas_unregister_dev(port, dev) > > NOP kfree(dev) > > if (dev_is_expander(dev->dev_type))(UAF) > ... > > 2. > CPU 1: disco_q CPU 2: event_q > ============================== ============================== > sas_resume_devices() sas_porte_link_reset_err() > > sas_resume_port() sas_deform_port(phy, true) > > list_for_each_entry_safe(dev, sas_unregister_domain_devices() > &port->dev_list, ...) > > NOP free dev > > visit dev->ex_dev(UAF) OK. Then add this to the commit message to make everything clear. Also, the examples above are only for device removal, and given that list_for_each_entry_safe() is safe against removals, that is likely the only change needed (in case 1, disco_q side), together with a device get/put inside the loop. Not sure that the spinlock to protect the list is actually needed. -- Damien Le Moal Western Digital Research
© 2016 - 2026 Red Hat, Inc.