drivers/base/base.h | 9 +++++++++ drivers/base/core.c | 2 +- fs/kernfs/file.c | 24 +++++++++++++++++++----- fs/sysfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ fs/sysfs/group.c | 4 ++-- fs/sysfs/sysfs.h | 18 ++++++++++++++++++ include/linux/kernfs.h | 1 + include/linux/sysfs.h | 1 + 8 files changed, 98 insertions(+), 8 deletions(-)
Commit "driver core: Fix uevent_show() vs driver detach race" [1]
attempted to fix a lockdep report in uevent_show() by making the lookup
of driver information for a given device lockless. It turns out that
userspace likely depends on the side-effect of uevent_show() flushing
device probing, as evidenced by the removal of the lock causing a
regression initializing USB devices [2].
Introduce a new "locked" type for sysfs attributes that arranges for the
attribute to be called under the device-lock, but without setting up a
reverse locking order dependency with the kernfs_get_active() lock.
This new facility holds a reference on a device while any locked-sysfs
attribute of that device is open. It then takes the lock around sysfs
read/write operations in the following order:
of->mutex
of->op_mutex <= device_lock()
kernfs_get_active()
<operation>
Compare that to problematic locking order of:
of->mutex
kernfs_get_active()
<operation>
device_lock()
...which causes potential deadlocks with kernfs_drain() that may be
called while the device_lock() is held.
Note that the synchronize_rcu() in module_remove_driver(), introduced in
the precedubg fix, likely needs to remain since dev_uevent() is not
always called with the device_lock held. Userspace can potentially
trigger use after free by racing module removal against kernel
invocations of kobject_uevent().
Note2, this change effectively allows userspace to test for
CONFIG_DEBUG_KOBJECT_RELEASE-style driver bugs as the lifetime of open
file descriptors on the @uevent attribute keeps the device reference
count elevated indefinitely.
Reported-by: brmails+k
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219244 [2]
Tested-by: brmails+k
Fixes: 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach race") [1]
Cc: stable@vger.kernel.org
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
Changes since v1 [1]:
- Move the new "locked" infrastructure to private header files to make
it clear it is not approved for general usage (Greg)
[1]: http://lore.kernel.org/172772671177.1003633.7355063154793624707.stgit@dwillia2-xfh.jf.intel.com
drivers/base/base.h | 9 +++++++++
drivers/base/core.c | 2 +-
fs/kernfs/file.c | 24 +++++++++++++++++++-----
fs/sysfs/file.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
fs/sysfs/group.c | 4 ++--
fs/sysfs/sysfs.h | 18 ++++++++++++++++++
include/linux/kernfs.h | 1 +
include/linux/sysfs.h | 1 +
8 files changed, 98 insertions(+), 8 deletions(-)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0b53593372d7..0405e7667184 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -246,3 +246,12 @@ static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
void software_node_notify(struct device *dev);
void software_node_notify_remove(struct device *dev);
+
+/*
+ * "locked" device attributes are an internal exception for the @uevent
+ * attribute.
+ */
+#include "../../fs/sysfs/sysfs.h"
+
+#define DEVICE_ATTR_LOCKED_RW(_name) \
+ struct device_attribute dev_attr_##_name = __ATTR_LOCKED_RW(_name)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8c0733d3aad8..1fd5a18cbb62 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2772,7 +2772,7 @@ static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
return count;
}
-static DEVICE_ATTR_RW(uevent);
+static DEVICE_ATTR_LOCKED_RW(uevent);
static ssize_t online_show(struct device *dev, struct device_attribute *attr,
char *buf)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8502ef68459b..eb5c2167beb9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -142,6 +142,20 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
kernfs_put_active(of->kn);
}
+static void kernfs_open_file_lock(struct kernfs_open_file *of)
+{
+ mutex_lock(&of->mutex);
+ if (of->op_mutex)
+ mutex_lock(of->op_mutex);
+}
+
+static void kernfs_open_file_unlock(struct kernfs_open_file *of)
+{
+ if (of->op_mutex)
+ mutex_unlock(of->op_mutex);
+ mutex_unlock(&of->mutex);
+}
+
static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
{
struct kernfs_open_file *of = sf->private;
@@ -151,7 +165,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
* @of->mutex nests outside active ref and is primarily to ensure that
* the ops aren't called concurrently for the same open file.
*/
- mutex_lock(&of->mutex);
+ kernfs_open_file_lock(of);
if (!kernfs_get_active(of->kn))
return ERR_PTR(-ENODEV);
@@ -193,7 +207,7 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
if (v != ERR_PTR(-ENODEV))
kernfs_seq_stop_active(sf, v);
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
}
static int kernfs_seq_show(struct seq_file *sf, void *v)
@@ -322,9 +336,9 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* @of->mutex nests outside active ref and is used both to ensure that
* the ops aren't called concurrently for the same open file.
*/
- mutex_lock(&of->mutex);
+ kernfs_open_file_lock(of);
if (!kernfs_get_active(of->kn)) {
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
len = -ENODEV;
goto out_free;
}
@@ -336,7 +350,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
len = -EINVAL;
kernfs_put_active(of->kn);
- mutex_unlock(&of->mutex);
+ kernfs_open_file_unlock(of);
if (len > 0)
iocb->ki_pos += len;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d1995e2d6c94..1bb878efcf00 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/seq_file.h>
+#include <linux/device.h>
#include <linux/mm.h>
#include "sysfs.h"
@@ -189,6 +190,26 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of)
return 0;
}
+/* locked attributes are always device attributes */
+static int sysfs_kf_setup_lock(struct kernfs_open_file *of)
+{
+ struct kobject *kobj = of->kn->parent->priv;
+ struct device *dev = kobj_to_dev(kobj);
+
+ get_device(dev);
+ of->op_mutex = &dev->mutex;
+
+ return 0;
+}
+
+static void sysfs_kf_free_lock(struct kernfs_open_file *of)
+{
+ struct kobject *kobj = of->kn->parent->priv;
+ struct device *dev = kobj_to_dev(kobj);
+
+ put_device(dev);
+}
+
void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
{
struct kernfs_node *kn = kobj->sd, *tmp;
@@ -227,6 +248,25 @@ static const struct kernfs_ops sysfs_file_kfops_rw = {
.write = sysfs_kf_write,
};
+static const struct kernfs_ops sysfs_locked_kfops_ro = {
+ .seq_show = sysfs_kf_seq_show,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_wo = {
+ .write = sysfs_kf_write,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_rw = {
+ .seq_show = sysfs_kf_seq_show,
+ .write = sysfs_kf_write,
+ .open = sysfs_kf_setup_lock,
+ .release = sysfs_kf_free_lock,
+};
+
static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
.read = sysfs_kf_read,
.prealloc = true,
@@ -287,6 +327,13 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
ops = &sysfs_prealloc_kfops_ro;
else if (sysfs_ops->store)
ops = &sysfs_prealloc_kfops_wo;
+ } else if (mode & SYSFS_LOCKED) {
+ if (sysfs_ops->show && sysfs_ops->store)
+ ops = &sysfs_locked_kfops_rw;
+ else if (sysfs_ops->show)
+ ops = &sysfs_locked_kfops_ro;
+ else if (sysfs_ops->store)
+ ops = &sysfs_locked_kfops_wo;
} else {
if (sysfs_ops->show && sysfs_ops->store)
ops = &sysfs_file_kfops_rw;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d22ad67a0f32..0158367866be 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -68,11 +68,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
continue;
}
- WARN(mode & ~(SYSFS_PREALLOC | 0664),
+ WARN(mode & ~(SYSFS_PREALLOC | SYSFS_LOCKED | 0664),
"Attribute %s: Invalid permissions 0%o\n",
(*attr)->name, mode);
- mode &= SYSFS_PREALLOC | 0664;
+ mode &= SYSFS_PREALLOC | SYSFS_LOCKED | 0664;
error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
gid, NULL);
if (unlikely(error))
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3f28c9af5756..51d5b6af243f 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -40,4 +40,22 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
const char *name);
+#define SYSFS_LOCKED 040000
+
+/*
+ * uevent_show() needs this due to userspace expectations of reading
+ * that attribute flushing device probing, it is not intended to be a
+ * general semantic for other device sysfs attributes. It is broken to
+ * use this with non-device / pure-kobject sysfs attributes, see
+ * sysfs_kf_setup_lock().
+ */
+#define __ATTR_LOCKED(_name, _mode, _show, _store) { \
+ .attr = {.name = __stringify(_name), \
+ .mode = SYSFS_LOCKED | VERIFY_OCTAL_PERMISSIONS(_mode) },\
+ .show = _show, \
+ .store = _store, \
+}
+
+#define __ATTR_LOCKED_RW(_name) __ATTR_LOCKED(_name, 0644, _name##_show, _name##_store)
+
#endif /* __SYSFS_INTERNAL_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 87c79d076d6d..df6828a7cd3e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -257,6 +257,7 @@ struct kernfs_open_file {
/* private fields, do not use outside kernfs proper */
struct mutex mutex;
+ struct mutex *op_mutex;
struct mutex prealloc_mutex;
int event;
struct list_head list;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c4e64dc11206..dd28c6238b64 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -103,6 +103,7 @@ struct attribute_group {
#define SYSFS_PREALLOC 010000
#define SYSFS_GROUP_INVISIBLE 020000
+/* see fs/sysfs/sysfs.h for private mode-flags */
/*
* DEFINE_SYSFS_GROUP_VISIBLE(name):
Hi, Dan, Greg,
We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
$ lspci -n -s 0000:00:03.0
00:03.0 0108: 8086:5845 (rev 02)
$ echo 8086 5845 > /sys/bus/pci/drivers/vfio-pci/new_id
$ ls /dev/vfio
3 vfio
$ ./a.out & (Binary program compiled with the following vfio demo program)
$ echo 1 > /sys/devices/pci0000:00/0000:00:03.0/remove
The root cause of the hung task is due to the device_lock() in uevent_show() is blocked by device_realease_driver() in pci remove routeine which is also blocked to wait for vfio_device->refcount == 0, which can never come because it's refcount has been lifted in the same user process.
Besides, the patch proposed by Dan [1] can not prevent hungtask from happening. For 6.13-rc5, in the following case:
ioctl(..,VFIO_GROUP_GET_DEVICE_FD,..)
...
vfio_device_get_from_name()
vfio_devcie->refcount -> 2
pci_stop_and_remove_bus_device()
pci_remove_bus()
device_unregister()
...
device_release_driver()
device_lock()
device_remove()
vfio_unregister_group_dev()
vfio_devcie_put()
vfio_devcie->refcount -> 1
wait_for(vfio_devcie->refcount = 0)
uevent_show()
device_lock()
[1] https://lore.kernel.org/all/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com/#R
-------------------->8---------------------------------
int main(int argc, char *argv[])
{
char buf[128];
int container, group, device, uevent, ret;
struct vfio_group_status group_status =
{ .argsz = sizeof(group_status) };
struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };
container = open("/dev/vfio/vfio", O_RDWR);
group = open("/dev/vfio/3", O_RDWR);
ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);
ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:00:03.0");
uevent = open("/sys/devices/pci0000:00/0000:00:03.0/uevent", O_RDWR);
sleep(10); /* Remove the pci device here */
ret = read(uevent, buf, 128); /* We will get hung task here */
printf("ret %d\n", ret);
return 0;
}
Thanks,
Zekun
On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> Hi, Dan, Greg,
>
> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
The revert just puts the original logic back in place, so this is not
anything new that has been introduced, right? It's just that the
attempted fix didn't work, so a different fix needs to happen.
If you could help debug the original fix, that would be most appreciated
as none of us here are arguing that there isn't a bug present, only that
the fix caused different problems which required it to be reverted.
thanks,
greg k-h
在 2024/12/31 16:26, Greg KH 写道:
> On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
>> Hi, Dan, Greg,
>>
>> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
>
> The revert just puts the original logic back in place, so this is not
> anything new that has been introduced, right? It's just that the
> attempted fix didn't work, so a different fix needs to happen.
Hi, Greg,
Yes, there is nothing new introduced here. We have been testing the rcu
fix (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver
detach race")) for monthes but has not obersved problems.
Noticing that, brmails+k write in bugzilla "Also, I can't say why the
issue appeared in the past even without this commit being present, as I
haven't bisected any kernel version before v6.6.45.". I doubt that there
could still have problem described in bugzilla [1] even if commit
c0a40097f0bc ("drivers: core: synchronize really_probe() and
dev_uevent()") has been reverted, and the problem is not directly
introduced by rcu in dev_uevent.
[1] https://bugzilla.kernel.org/show_bug.cgi?id=219244
Thanks,
Zekun
zhangzekun (A) wrote:
>
>
> 在 2024/12/31 16:26, Greg KH 写道:
> > On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> >> Hi, Dan, Greg,
> >>
> >> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
> >
> > The revert just puts the original logic back in place, so this is not
> > anything new that has been introduced, right? It's just that the
> > attempted fix didn't work, so a different fix needs to happen.
> Hi, Greg,
>
> Yes, there is nothing new introduced here. We have been testing the rcu
> fix (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver
> detach race")) for monthes but has not obersved problems.
Hi Zekun,
If you have some cycles to help investigate the replacement fix that
would be much appreciated.
So far I came up with this:
http://lore.kernel.org/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com
...but have not had time to debug the 0day report.
My worry is still that Linux has long since shipped the expectation that
reading 'uevent' bounces the device_lock() which, among other things,
makes sure that any in-flight driver probing has completed.
The report of USB devices disappearing is consistent with a udev
operation failing due to the driver not being done attaching, or
something similar.
So even though you have not seen any issues, I suspect small differences
in the devices on your system and the reporter's system, or udev rule
differences could result in a failure to trigger the regression.
> Hi Zekun,
>
> If you have some cycles to help investigate the replacement fix that
> would be much appreciated.
>
> So far I came up with this:
>
> http://lore.kernel.org/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com
>
> ...but have not had time to debug the 0day report.
>
> My worry is still that Linux has long since shipped the expectation that
> reading 'uevent' bounces the device_lock() which, among other things,
> makes sure that any in-flight driver probing has completed.
>
> The report of USB devices disappearing is consistent with a udev
> operation failing due to the driver not being done attaching, or
> something similar.
>
> So even though you have not seen any issues, I suspect small differences
> in the devices on your system and the reporter's system, or udev rule
> differences could result in a failure to trigger the regression.
>
>
Hi, Dan
The patch trys to prevent the potential dead lock by letting
device_lock() gets before kernfs_get_active(), but I think it might not
be enough to solve all the potential dead lock. The device_lock() will
be held while removing the driver in device_release_driver(), which
means will be held through driver's .remove(). If .remove() is waiting
for resources to be released by userspace process, and then the
userspace process call device_lock() in uevent_show(), we will have dead
locks here.
In the following case:
device_release_driver
device_lock() blocked by
<operation 1> ---------------------------> <operation 2>
uevent_show()
device_lock()
In [1], It is beacause that pci_stop_and_remove_bus_device() will call
device_release_driver() first and then call device_del() to remove the
uevent sysfs attributes (Sorry for mistakes made in [1],
device_release_driver() is called by pci_stop_bus_device()). So, I think
it might not be a good idea to hold the device_lock() through uevent_show().
ioctl(..,VFIO_GROUP_GET_DEVICE_FD,..)
...
vfio_device_get_from_name()
vfio_devcie->refcount -> 2
pci_stop_and_remove_bus_device()
pci_stop_bus_device()
device_release_driver()
device_lock()
device_remove()
vfio_unregister_group_dev()
vfio_device_put_registration()
vfio_devcie->refcount -> 1
wait for refcount == 0
uevent_show()
device_lock()
[1]
https://lore.kernel.org/lkml/20241231075608.84009-1-zhangzekun11@huawei.com/
Best Regards,
Zekun
On Sat, Jan 04, 2025 at 02:02:54PM +0800, zhangzekun (A) wrote:
>
>
> 在 2024/12/31 16:26, Greg KH 写道:
> > On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> > > Hi, Dan, Greg,
> > >
> > > We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
> >
> > The revert just puts the original logic back in place, so this is not
> > anything new that has been introduced, right? It's just that the
> > attempted fix didn't work, so a different fix needs to happen.
> Hi, Greg,
>
> Yes, there is nothing new introduced here. We have been testing the rcu fix
> (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
> race")) for monthes but has not obersved problems.
That's great, but other people did report problems, which can't be
ignored.
> Noticing that, brmails+k write in bugzilla "Also, I can't say why the issue
> appeared in the past even without this commit being present, as I haven't
> bisected any kernel version before v6.6.45.". I doubt that there could still
> have problem described in bugzilla [1] even if commit c0a40097f0bc
> ("drivers: core: synchronize really_probe() and dev_uevent()") has been
> reverted, and the problem is not directly introduced by rcu in dev_uevent.
>
> [1] https://bugzilla.kernel.org/show_bug.cgi?id=219244
So you think that the bug report is wrong? Ok, then you might need to
take the time to prove it :)
If you feel this should be added back, please feel free to resubmit it
with the new information as to why it should be ok now to apply it.
thanks,
greg k-h
Dan Williams wrote: > Changes since v1 [1]: > - Move the new "locked" infrastructure to private header files to make > it clear it is not approved for general usage (Greg) Greg, per the 0day report and further testing I am missing something subtle in using kernfs open files to pin device objects. So hold off on this for now until I can get that root caused. If someone else can spot what I missed feel free to chime in, but otherwise I will circle back. If I don't get back to this before -rc6 I think the theoretical deadlock that would be re-introduced by a revert of 15fffc6a5624 would be preferable to this reported regression. I am not aware of any reports of that deadlock triggering in practice.
On 08.10.24 02:26, Dan Williams wrote: > Dan Williams wrote: >> Changes since v1 [1]: >> - Move the new "locked" infrastructure to private header files to make >> it clear it is not approved for general usage (Greg) > > Greg, per the 0day report and further testing I am missing something > subtle in using kernfs open files to pin device objects. So hold off on > this for now until I can get that root caused. If someone else can spot > what I missed feel free to chime in, but otherwise I will circle back. > > If I don't get back to this before -rc6 I think the theoretical deadlock > that would be re-introduced by a revert of 15fffc6a5624 would be > preferable to this reported regression. I am not aware of any reports of > that deadlock triggering in practice. Was there any progress? If not: given that Linus prefers to have things fixed by -rc6 I wonder if now would be a good time to get the revert on track for a merge later this week. Ciao, Thorsten #regzbot poke
Thorsten Leemhuis wrote:
> On 08.10.24 02:26, Dan Williams wrote:
> > Dan Williams wrote:
> >> Changes since v1 [1]:
> >> - Move the new "locked" infrastructure to private header files to make
> >> it clear it is not approved for general usage (Greg)
> >
> > Greg, per the 0day report and further testing I am missing something
> > subtle in using kernfs open files to pin device objects. So hold off on
> > this for now until I can get that root caused. If someone else can spot
> > what I missed feel free to chime in, but otherwise I will circle back.
> >
> > If I don't get back to this before -rc6 I think the theoretical deadlock
> > that would be re-introduced by a revert of 15fffc6a5624 would be
> > preferable to this reported regression. I am not aware of any reports of
> > that deadlock triggering in practice.
> Was there any progress? If not: given that Linus prefers to have things
> fixed by -rc6 I wonder if now would be a good time to get the revert on
> track for a merge later this week.
Revert 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
race"), which reintroduces a theoretical lockdep splat, is my preference
at this point.
Even if I had a new version of this replacement patch in hand today I
would still want it to be v6.13 material, not v6.12-rc. It deserves a
full kernel cycle soak time to shake out issues before release.
On Mon, Oct 28, 2024 at 03:57:55PM -0700, Dan Williams wrote:
> Thorsten Leemhuis wrote:
> > On 08.10.24 02:26, Dan Williams wrote:
> > > Dan Williams wrote:
> > >> Changes since v1 [1]:
> > >> - Move the new "locked" infrastructure to private header files to make
> > >> it clear it is not approved for general usage (Greg)
> > >
> > > Greg, per the 0day report and further testing I am missing something
> > > subtle in using kernfs open files to pin device objects. So hold off on
> > > this for now until I can get that root caused. If someone else can spot
> > > what I missed feel free to chime in, but otherwise I will circle back.
> > >
> > > If I don't get back to this before -rc6 I think the theoretical deadlock
> > > that would be re-introduced by a revert of 15fffc6a5624 would be
> > > preferable to this reported regression. I am not aware of any reports of
> > > that deadlock triggering in practice.
> > Was there any progress? If not: given that Linus prefers to have things
> > fixed by -rc6 I wonder if now would be a good time to get the revert on
> > track for a merge later this week.
>
> Revert 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
> race"), which reintroduces a theoretical lockdep splat, is my preference
> at this point.
>
> Even if I had a new version of this replacement patch in hand today I
> would still want it to be v6.13 material, not v6.12-rc. It deserves a
> full kernel cycle soak time to shake out issues before release.
Ok, I'll do the revert in my tree and push it for the next -rc release
thanks.
greg k-h
On Mon, Oct 07, 2024 at 05:26:57PM -0700, Dan Williams wrote: > Dan Williams wrote: > > Changes since v1 [1]: > > - Move the new "locked" infrastructure to private header files to make > > it clear it is not approved for general usage (Greg) > > Greg, per the 0day report and further testing I am missing something > subtle in using kernfs open files to pin device objects. So hold off on > this for now until I can get that root caused. If someone else can spot > what I missed feel free to chime in, but otherwise I will circle back. Ok, I'll drop this for now and wait for a follow-on patch to come for whatever you decide. thanks, greg k-h
Hello,
kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__mutex_lock" on:
commit: cfb789a84ee6fe1368d091941af50e0eb6381d30 ("[PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier")
url: https://github.com/intel-lab-lkp/linux/commits/Dan-Williams/driver-core-Fix-userspace-expectations-of-uevent_show-as-a-probe-barrier/20241003-055515
base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/driver-core.git 9852d85ec9d492ebef56dc5f229416c925758edc
patch link: https://lore.kernel.org/all/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com/
patch subject: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
in testcase: blktests
version: blktests-x86_64-80430af-1_20240910
with following parameters:
disk: 1SSD
test: block-001
compiler: gcc-12
test machine: 4 threads Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz (Skylake) with 32G memory
(please refer to attached dmesg/kmsg for entire log/backtrace)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202410071741.4aa3984e-oliver.sang@intel.com
[ 51.504416][ T130] BUG: KASAN: slab-use-after-free in __mutex_lock+0x1003/0x1170
[ 51.504423][ T130] Read of size 8 at addr ffff8887ff562250 by task systemd-journal/130
[ 51.504427][ T130]
[ 51.504429][ T130] CPU: 2 UID: 0 PID: 130 Comm: systemd-journal Not tainted 6.12.0-rc1-00001-gcfb789a84ee6 #1
[ 51.504434][ T130] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.8.1 12/05/2017
[ 51.504436][ T130] Call Trace:
[ 51.504438][ T130] <TASK>
[ 51.504439][ T130] dump_stack_lvl (lib/dump_stack.c:123 (discriminator 1))
[ 51.504445][ T130] print_address_description+0x2c/0x3a0
[ 51.504451][ T130] ? __mutex_lock+0x1003/0x1170
[ 51.504455][ T130] print_report (mm/kasan/report.c:489)
[ 51.504459][ T130] ? kasan_addr_to_slab (mm/kasan/common.c:37)
[ 51.504462][ T130] ? __mutex_lock+0x1003/0x1170
[ 51.504466][ T130] kasan_report (mm/kasan/report.c:603)
[ 51.504470][ T130] ? __mutex_lock+0x1003/0x1170
[ 51.504475][ T130] __mutex_lock+0x1003/0x1170
[ 51.504480][ T130] ? __pfx___mutex_lock+0x10/0x10
[ 51.504485][ T130] ? __pfx___might_resched (kernel/sched/core.c:8586)
[ 51.504491][ T130] mutex_lock (kernel/locking/mutex.c:286)
[ 51.504495][ T130] ? __pfx_mutex_lock (kernel/locking/mutex.c:282)
[ 51.504499][ T130] ? __kmalloc_node_noprof (include/linux/kasan.h:257 mm/slub.c:4265 mm/slub.c:4271)
[ 51.504503][ T130] ? seq_read_iter (fs/seq_file.c:210)
[ 51.504507][ T130] kernfs_seq_start (fs/kernfs/file.c:169)
[ 51.504511][ T130] seq_read_iter (fs/seq_file.c:225)
[ 51.504516][ T130] ? rw_verify_area (fs/read_write.c:470)
[ 51.504521][ T130] vfs_read (fs/read_write.c:488 fs/read_write.c:569)
[ 51.504524][ T130] ? kernfs_iop_getattr (fs/kernfs/inode.c:197)
[ 51.504528][ T130] ? __pfx_vfs_read (fs/read_write.c:550)
[ 51.504533][ T130] ? fdget_pos (arch/x86/include/asm/atomic64_64.h:15 include/linux/atomic/atomic-arch-fallback.h:2583 include/linux/atomic/atomic-long.h:38 include/linux/atomic/atomic-instrumented.h:3189 fs/file.c:1177 fs/file.c:1185)
[ 51.504537][ T130] ? __pfx___seccomp_filter (kernel/seccomp.c:1218)
[ 51.504543][ T130] ksys_read (fs/read_write.c:712)
[ 51.504546][ T130] ? __pfx_ksys_read (fs/read_write.c:702)
[ 51.504550][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83)
[ 51.504555][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 51.504558][ T130] RIP: 0033:0x7fddff64e1dc
[ 51.504562][ T130] Code: ec 28 48 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 d9 d5 f8 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 34 44 89 c7 48 89 44 24 08 e8 2f d6 f8 ff 48
All code
========
0: ec in (%dx),%al
1: 28 48 89 sub %cl,-0x77(%rax)
4: 54 push %rsp
5: 24 18 and $0x18,%al
7: 48 89 74 24 10 mov %rsi,0x10(%rsp)
c: 89 7c 24 08 mov %edi,0x8(%rsp)
10: e8 d9 d5 f8 ff callq 0xfffffffffff8d5ee
15: 48 8b 54 24 18 mov 0x18(%rsp),%rdx
1a: 48 8b 74 24 10 mov 0x10(%rsp),%rsi
1f: 41 89 c0 mov %eax,%r8d
22: 8b 7c 24 08 mov 0x8(%rsp),%edi
26: 31 c0 xor %eax,%eax
28: 0f 05 syscall
2a:* 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax <-- trapping instruction
30: 77 34 ja 0x66
32: 44 89 c7 mov %r8d,%edi
35: 48 89 44 24 08 mov %rax,0x8(%rsp)
3a: e8 2f d6 f8 ff callq 0xfffffffffff8d66e
3f: 48 rex.W
Code starting with the faulting instruction
===========================================
0: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax
6: 77 34 ja 0x3c
8: 44 89 c7 mov %r8d,%edi
b: 48 89 44 24 08 mov %rax,0x8(%rsp)
10: e8 2f d6 f8 ff callq 0xfffffffffff8d644
15: 48 rex.W
[ 51.504565][ T130] RSP: 002b:00007ffe3e501df0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[ 51.504569][ T130] RAX: ffffffffffffffda RBX: 000055d748fcdac0 RCX: 00007fddff64e1dc
[ 51.504572][ T130] RDX: 0000000000001008 RSI: 000055d748fcdac0 RDI: 000000000000001c
[ 51.504574][ T130] RBP: 000000000000001c R08: 0000000000000000 R09: 00007fddff728ce0
[ 51.504576][ T130] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000001008
[ 51.504578][ T130] R13: 0000000000001007 R14: ffffffffffffffff R15: 0000000000000002
[ 51.504583][ T130] </TASK>
[ 51.504585][ T130]
[ 51.504586][ T130] Allocated by task 1044:
[ 51.504588][ T130] kasan_save_stack (mm/kasan/common.c:48)
[ 51.504591][ T130] kasan_save_track (arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69)
[ 51.504594][ T130] __kasan_kmalloc (mm/kasan/common.c:377 mm/kasan/common.c:394)
[ 51.504597][ T130] __kmalloc_noprof (include/linux/kasan.h:257 mm/slub.c:4265 mm/slub.c:4277)
[ 51.504600][ T130] scsi_alloc_sdev (include/linux/slab.h:882 include/linux/slab.h:1014 drivers/scsi/scsi_scan.c:288)
[ 51.504604][ T130] scsi_probe_and_add_lun (drivers/scsi/scsi_scan.c:1210)
[ 51.504607][ T130] __scsi_scan_target (drivers/scsi/scsi_scan.c:1769)
[ 51.504610][ T130] scsi_scan_host_selected (drivers/scsi/scsi_scan.c:1877)
[ 51.504614][ T130] store_scan (drivers/scsi/scsi_sysfs.c:151 drivers/scsi/scsi_sysfs.c:191)
[ 51.504617][ T130] kernfs_fop_write_iter (fs/kernfs/file.c:348)
[ 51.504619][ T130] vfs_write (fs/read_write.c:590 fs/read_write.c:683)
[ 51.504622][ T130] ksys_write (fs/read_write.c:736)
[ 51.504624][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83)
[ 51.504627][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 51.504629][ T130]
[ 51.504630][ T130] Freed by task 1044:
[ 51.504632][ T130] kasan_save_stack (mm/kasan/common.c:48)
[ 51.504635][ T130] kasan_save_track (arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69)
[ 51.504638][ T130] kasan_save_free_info (mm/kasan/generic.c:582)
[ 51.504640][ T130] __kasan_slab_free (mm/kasan/common.c:271)
[ 51.504643][ T130] kfree (mm/slub.c:4580 mm/slub.c:4728)
[ 51.504646][ T130] scsi_device_dev_release (drivers/scsi/scsi_sysfs.c:521 (discriminator 5))
[ 51.504649][ T130] device_release (drivers/base/core.c:2575)
[ 51.504653][ T130] kobject_cleanup (lib/kobject.c:689)
[ 51.504656][ T130] scsi_device_put (drivers/scsi/scsi.c:794)
[ 51.504658][ T130] sdev_store_delete (drivers/scsi/scsi_sysfs.c:791)
[ 51.504661][ T130] kernfs_fop_write_iter (fs/kernfs/file.c:348)
[ 51.504664][ T130] vfs_write (fs/read_write.c:590 fs/read_write.c:683)
[ 51.504666][ T130] ksys_write (fs/read_write.c:736)
[ 51.504668][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83)
[ 51.504671][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
[ 51.504674][ T130]
[ 51.504675][ T130] Last potentially related work creation:
[ 51.504676][ T130] kasan_save_stack (mm/kasan/common.c:48)
[ 51.504679][ T130] __kasan_record_aux_stack (mm/kasan/generic.c:541)
[ 51.504681][ T130] insert_work (include/linux/instrumented.h:68 include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186)
[ 51.504685][ T130] __queue_work (kernel/workqueue.c:6723)
[ 51.504689][ T130] queue_work_on (kernel/workqueue.c:2391)
[ 51.504692][ T130] sdev_evt_send (include/linux/spinlock.h:406 drivers/scsi/scsi_lib.c:2645)
[ 51.504694][ T130] scsi_evt_thread (drivers/scsi/scsi_lib.c:2596)
[ 51.504697][ T130] process_one_work (kernel/workqueue.c:3229)
[ 51.504701][ T130] worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391)
[ 51.504704][ T130] kthread (kernel/kthread.c:389)
[ 51.504707][ T130] ret_from_fork (arch/x86/kernel/process.c:147)
[ 51.504710][ T130] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
[ 51.504713][ T130]
[ 51.504714][ T130] Second to last potentially related work creation:
[ 51.504716][ T130] kasan_save_stack (mm/kasan/common.c:48)
[ 51.504719][ T130] __kasan_record_aux_stack (mm/kasan/generic.c:541)
[ 51.504721][ T130] insert_work (include/linux/instrumented.h:68 include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186)
[ 51.504724][ T130] __queue_work (kernel/workqueue.c:6723)
[ 51.504727][ T130] queue_work_on (kernel/workqueue.c:2391)
[ 51.504731][ T130] scsi_check_sense (include/scsi/scsi_eh.h:24 drivers/scsi/scsi_error.c:550)
[ 51.504734][ T130] scsi_decide_disposition (drivers/scsi/scsi_error.c:2024)
[ 51.504737][ T130] scsi_complete (drivers/scsi/scsi_lib.c:1515)
[ 51.504740][ T130] blk_complete_reqs (block/blk-mq.c:1126 (discriminator 3))
[ 51.504744][ T130] handle_softirqs (kernel/softirq.c:554)
[ 51.504747][ T130] run_ksoftirqd (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:97 kernel/softirq.c:411 kernel/softirq.c:928 kernel/softirq.c:919)
[ 51.504750][ T130] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 3))
[ 51.504753][ T130] kthread (kernel/kthread.c:389)
[ 51.504755][ T130] ret_from_fork (arch/x86/kernel/process.c:147)
[ 51.504758][ T130] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
[ 51.504760][ T130]
[ 51.504761][ T130] The buggy address belongs to the object at ffff8887ff562000
[ 51.504761][ T130] which belongs to the cache kmalloc-4k of size 4096
[ 51.504764][ T130] The buggy address is located 592 bytes inside of
[ 51.504764][ T130] freed 4096-byte region [ffff8887ff562000, ffff8887ff563000)
[ 51.504767][ T130]
[ 51.504768][ T130] The buggy address belongs to the physical page:
[ 51.504770][ T130] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7ff560
[ 51.504774][ T130] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[ 51.504776][ T130] flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff)
[ 51.504780][ T130] page_type: f5(slab)
[ 51.504783][ T130] raw: 0017ffffc0000040 ffff88810c843040 dead000000000122 0000000000000000
[ 51.504786][ T130] raw: 0000000000000000 0000000000040004 00000001f5000000 0000000000000000
[ 51.504789][ T130] head: 0017ffffc0000040 ffff88810c843040 dead000000000122 0000000000000000
The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20241007/202410071741.4aa3984e-oliver.sang@intel.com
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.