[PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier

Dan Williams posted 1 patch 1 year, 4 months ago
drivers/base/base.h    |    9 +++++++++
drivers/base/core.c    |    2 +-
fs/kernfs/file.c       |   24 +++++++++++++++++++-----
fs/sysfs/file.c        |   47 +++++++++++++++++++++++++++++++++++++++++++++++
fs/sysfs/group.c       |    4 ++--
fs/sysfs/sysfs.h       |   18 ++++++++++++++++++
include/linux/kernfs.h |    1 +
include/linux/sysfs.h  |    1 +
8 files changed, 98 insertions(+), 8 deletions(-)
[PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Dan Williams 1 year, 4 months ago
Commit "driver core: Fix uevent_show() vs driver detach race" [1]
attempted to fix a lockdep report in uevent_show() by making the lookup
of driver information for a given device lockless. It turns out that
userspace likely depends on the side-effect of uevent_show() flushing
device probing, as evidenced by the removal of the lock causing a
regression initializing USB devices [2].

Introduce a new "locked" type for sysfs attributes that arranges for the
attribute to be called under the device-lock, but without setting up a
reverse locking order dependency with the kernfs_get_active() lock.

This new facility holds a reference on a device while any locked-sysfs
attribute of that device is open. It then takes the lock around sysfs
read/write operations in the following order:

    of->mutex
    of->op_mutex <= device_lock()
    kernfs_get_active()
    <operation>

Compare that to problematic locking order of:

    of->mutex
    kernfs_get_active()
    <operation>
        device_lock()

...which causes potential deadlocks with kernfs_drain() that may be
called while the device_lock() is held.

Note that the synchronize_rcu() in module_remove_driver(), introduced in
the precedubg fix, likely needs to remain since dev_uevent() is not
always called with the device_lock held. Userspace can potentially
trigger use after free by racing module removal against kernel
invocations of kobject_uevent().

Note2, this change effectively allows userspace to test for
CONFIG_DEBUG_KOBJECT_RELEASE-style driver bugs as the lifetime of open
file descriptors on the @uevent attribute keeps the device reference
count elevated indefinitely.

Reported-by: brmails+k
Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219244 [2]
Tested-by: brmails+k
Fixes: 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach race") [1]
Cc: stable@vger.kernel.org
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
---
Changes since v1 [1]:
- Move the new "locked" infrastructure to private header files to make
  it clear it is not approved for general usage (Greg)

[1]: http://lore.kernel.org/172772671177.1003633.7355063154793624707.stgit@dwillia2-xfh.jf.intel.com

 drivers/base/base.h    |    9 +++++++++
 drivers/base/core.c    |    2 +-
 fs/kernfs/file.c       |   24 +++++++++++++++++++-----
 fs/sysfs/file.c        |   47 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/sysfs/group.c       |    4 ++--
 fs/sysfs/sysfs.h       |   18 ++++++++++++++++++
 include/linux/kernfs.h |    1 +
 include/linux/sysfs.h  |    1 +
 8 files changed, 98 insertions(+), 8 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0b53593372d7..0405e7667184 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -246,3 +246,12 @@ static inline int devtmpfs_delete_node(struct device *dev) { return 0; }
 
 void software_node_notify(struct device *dev);
 void software_node_notify_remove(struct device *dev);
+
+/*
+ * "locked" device attributes are an internal exception for the @uevent
+ * attribute.
+ */
+#include "../../fs/sysfs/sysfs.h"
+
+#define DEVICE_ATTR_LOCKED_RW(_name) \
+	struct device_attribute dev_attr_##_name = __ATTR_LOCKED_RW(_name)
diff --git a/drivers/base/core.c b/drivers/base/core.c
index 8c0733d3aad8..1fd5a18cbb62 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -2772,7 +2772,7 @@ static ssize_t uevent_store(struct device *dev, struct device_attribute *attr,
 
 	return count;
 }
-static DEVICE_ATTR_RW(uevent);
+static DEVICE_ATTR_LOCKED_RW(uevent);
 
 static ssize_t online_show(struct device *dev, struct device_attribute *attr,
 			   char *buf)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 8502ef68459b..eb5c2167beb9 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -142,6 +142,20 @@ static void kernfs_seq_stop_active(struct seq_file *sf, void *v)
 	kernfs_put_active(of->kn);
 }
 
+static void kernfs_open_file_lock(struct kernfs_open_file *of)
+{
+	mutex_lock(&of->mutex);
+	if (of->op_mutex)
+		mutex_lock(of->op_mutex);
+}
+
+static void kernfs_open_file_unlock(struct kernfs_open_file *of)
+{
+	if (of->op_mutex)
+		mutex_unlock(of->op_mutex);
+	mutex_unlock(&of->mutex);
+}
+
 static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
 {
 	struct kernfs_open_file *of = sf->private;
@@ -151,7 +165,7 @@ static void *kernfs_seq_start(struct seq_file *sf, loff_t *ppos)
 	 * @of->mutex nests outside active ref and is primarily to ensure that
 	 * the ops aren't called concurrently for the same open file.
 	 */
-	mutex_lock(&of->mutex);
+	kernfs_open_file_lock(of);
 	if (!kernfs_get_active(of->kn))
 		return ERR_PTR(-ENODEV);
 
@@ -193,7 +207,7 @@ static void kernfs_seq_stop(struct seq_file *sf, void *v)
 
 	if (v != ERR_PTR(-ENODEV))
 		kernfs_seq_stop_active(sf, v);
-	mutex_unlock(&of->mutex);
+	kernfs_open_file_unlock(of);
 }
 
 static int kernfs_seq_show(struct seq_file *sf, void *v)
@@ -322,9 +336,9 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	 * @of->mutex nests outside active ref and is used both to ensure that
 	 * the ops aren't called concurrently for the same open file.
 	 */
-	mutex_lock(&of->mutex);
+	kernfs_open_file_lock(of);
 	if (!kernfs_get_active(of->kn)) {
-		mutex_unlock(&of->mutex);
+		kernfs_open_file_unlock(of);
 		len = -ENODEV;
 		goto out_free;
 	}
@@ -336,7 +350,7 @@ static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 		len = -EINVAL;
 
 	kernfs_put_active(of->kn);
-	mutex_unlock(&of->mutex);
+	kernfs_open_file_unlock(of);
 
 	if (len > 0)
 		iocb->ki_pos += len;
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index d1995e2d6c94..1bb878efcf00 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/mutex.h>
 #include <linux/seq_file.h>
+#include <linux/device.h>
 #include <linux/mm.h>
 
 #include "sysfs.h"
@@ -189,6 +190,26 @@ static int sysfs_kf_bin_open(struct kernfs_open_file *of)
 	return 0;
 }
 
+/* locked attributes are always device attributes */
+static int sysfs_kf_setup_lock(struct kernfs_open_file *of)
+{
+	struct kobject *kobj = of->kn->parent->priv;
+	struct device *dev = kobj_to_dev(kobj);
+
+	get_device(dev);
+	of->op_mutex = &dev->mutex;
+
+	return 0;
+}
+
+static void sysfs_kf_free_lock(struct kernfs_open_file *of)
+{
+	struct kobject *kobj = of->kn->parent->priv;
+	struct device *dev = kobj_to_dev(kobj);
+
+	put_device(dev);
+}
+
 void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr)
 {
 	struct kernfs_node *kn = kobj->sd, *tmp;
@@ -227,6 +248,25 @@ static const struct kernfs_ops sysfs_file_kfops_rw = {
 	.write		= sysfs_kf_write,
 };
 
+static const struct kernfs_ops sysfs_locked_kfops_ro = {
+	.seq_show	= sysfs_kf_seq_show,
+	.open		= sysfs_kf_setup_lock,
+	.release	= sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_wo = {
+	.write		= sysfs_kf_write,
+	.open		= sysfs_kf_setup_lock,
+	.release	= sysfs_kf_free_lock,
+};
+
+static const struct kernfs_ops sysfs_locked_kfops_rw = {
+	.seq_show	= sysfs_kf_seq_show,
+	.write		= sysfs_kf_write,
+	.open		= sysfs_kf_setup_lock,
+	.release	= sysfs_kf_free_lock,
+};
+
 static const struct kernfs_ops sysfs_prealloc_kfops_ro = {
 	.read		= sysfs_kf_read,
 	.prealloc	= true,
@@ -287,6 +327,13 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
 			ops = &sysfs_prealloc_kfops_ro;
 		else if (sysfs_ops->store)
 			ops = &sysfs_prealloc_kfops_wo;
+	} else if (mode & SYSFS_LOCKED) {
+		if (sysfs_ops->show && sysfs_ops->store)
+			ops = &sysfs_locked_kfops_rw;
+		else if (sysfs_ops->show)
+			ops = &sysfs_locked_kfops_ro;
+		else if (sysfs_ops->store)
+			ops = &sysfs_locked_kfops_wo;
 	} else {
 		if (sysfs_ops->show && sysfs_ops->store)
 			ops = &sysfs_file_kfops_rw;
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index d22ad67a0f32..0158367866be 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -68,11 +68,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 					continue;
 			}
 
-			WARN(mode & ~(SYSFS_PREALLOC | 0664),
+			WARN(mode & ~(SYSFS_PREALLOC | SYSFS_LOCKED | 0664),
 			     "Attribute %s: Invalid permissions 0%o\n",
 			     (*attr)->name, mode);
 
-			mode &= SYSFS_PREALLOC | 0664;
+			mode &= SYSFS_PREALLOC | SYSFS_LOCKED | 0664;
 			error = sysfs_add_file_mode_ns(parent, *attr, mode, uid,
 						       gid, NULL);
 			if (unlikely(error))
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h
index 3f28c9af5756..51d5b6af243f 100644
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -40,4 +40,22 @@ int sysfs_add_bin_file_mode_ns(struct kernfs_node *parent,
 int sysfs_create_link_sd(struct kernfs_node *kn, struct kobject *target,
 			 const char *name);
 
+#define SYSFS_LOCKED		040000
+
+/*
+ * uevent_show() needs this due to userspace expectations of reading
+ * that attribute flushing device probing, it is not intended to be a
+ * general semantic for other device sysfs attributes. It is broken to
+ * use this with non-device / pure-kobject sysfs attributes, see
+ * sysfs_kf_setup_lock().
+ */
+#define __ATTR_LOCKED(_name, _mode, _show, _store) {			\
+	.attr = {.name = __stringify(_name),				\
+		 .mode = SYSFS_LOCKED | VERIFY_OCTAL_PERMISSIONS(_mode) },\
+	.show	= _show,						\
+	.store	= _store,						\
+}
+
+#define __ATTR_LOCKED_RW(_name) __ATTR_LOCKED(_name, 0644, _name##_show, _name##_store)
+
 #endif	/* __SYSFS_INTERNAL_H */
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 87c79d076d6d..df6828a7cd3e 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -257,6 +257,7 @@ struct kernfs_open_file {
 
 	/* private fields, do not use outside kernfs proper */
 	struct mutex		mutex;
+	struct mutex		*op_mutex;
 	struct mutex		prealloc_mutex;
 	int			event;
 	struct list_head	list;
diff --git a/include/linux/sysfs.h b/include/linux/sysfs.h
index c4e64dc11206..dd28c6238b64 100644
--- a/include/linux/sysfs.h
+++ b/include/linux/sysfs.h
@@ -103,6 +103,7 @@ struct attribute_group {
 
 #define SYSFS_PREALLOC		010000
 #define SYSFS_GROUP_INVISIBLE	020000
+/* see fs/sysfs/sysfs.h for private mode-flags */
 
 /*
  * DEFINE_SYSFS_GROUP_VISIBLE(name):
Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by Zhang Zekun 1 year, 1 month ago
Hi, Dan, Greg,

We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:

$ lspci -n -s 0000:00:03.0
00:03.0 0108: 8086:5845 (rev 02)
$ echo 8086 5845 > /sys/bus/pci/drivers/vfio-pci/new_id
$ ls /dev/vfio
3  vfio
$ ./a.out &  (Binary program compiled with the following vfio demo program)
$ echo 1 > /sys/devices/pci0000:00/0000:00:03.0/remove

The root cause of the hung task is due to the device_lock() in uevent_show() is blocked by device_realease_driver() in pci remove routeine which is also blocked to wait for vfio_device->refcount == 0, which can never come because it's refcount has been lifted in the same user process.
Besides, the patch proposed by Dan [1] can not prevent hungtask from happening. For 6.13-rc5, in the following case:

ioctl(..,VFIO_GROUP_GET_DEVICE_FD,..)
        ...
        vfio_device_get_from_name()
        vfio_devcie->refcount -> 2
                                        pci_stop_and_remove_bus_device()
                                        pci_remove_bus()
                                        device_unregister()
                                        ...
                                        device_release_driver()
                                        device_lock()
                                        device_remove()
                                                vfio_unregister_group_dev()
                                                vfio_devcie_put()
                                                vfio_devcie->refcount -> 1
                                                wait_for(vfio_devcie->refcount = 0)
uevent_show()
        device_lock()

[1] https://lore.kernel.org/all/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com/#R
                                                        
-------------------->8---------------------------------
int main(int argc, char *argv[])
{
        char buf[128];
        int container, group, device, uevent, ret;
        struct vfio_group_status group_status =
                                        { .argsz = sizeof(group_status) };
        struct vfio_iommu_type1_info iommu_info = { .argsz = sizeof(iommu_info) };

        container = open("/dev/vfio/vfio", O_RDWR);
        group = open("/dev/vfio/3", O_RDWR);
        ioctl(group, VFIO_GROUP_GET_STATUS, &group_status);
        ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
        ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);
        ioctl(container, VFIO_IOMMU_GET_INFO, &iommu_info);
        device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:00:03.0");
        uevent = open("/sys/devices/pci0000:00/0000:00:03.0/uevent", O_RDWR);

        sleep(10);  /* Remove the pci device here */
        ret = read(uevent, buf, 128); /* We will get hung task here */
        printf("ret %d\n", ret);
        return 0;
}

Thanks,
Zekun
Re: Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by Greg KH 1 year, 1 month ago
On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> Hi, Dan, Greg,
> 
> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:

The revert just puts the original logic back in place, so this is not
anything new that has been introduced, right?  It's just that the
attempted fix didn't work, so a different fix needs to happen.

If you could help debug the original fix, that would be most appreciated
as none of us here are arguing that there isn't a bug present, only that
the fix caused different problems which required it to be reverted.

thanks,

greg k-h
Re: Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by zhangzekun (A) 1 year, 1 month ago

在 2024/12/31 16:26, Greg KH 写道:
> On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
>> Hi, Dan, Greg,
>>
>> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
> 
> The revert just puts the original logic back in place, so this is not
> anything new that has been introduced, right?  It's just that the
> attempted fix didn't work, so a different fix needs to happen.
Hi, Greg,

Yes, there is nothing new introduced here. We have been testing the rcu 
fix (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver 
detach race")) for monthes but has not obersved problems.

Noticing that, brmails+k write in bugzilla "Also, I can't say why the 
issue appeared in the past even without this commit being present, as I 
haven't bisected any kernel version before v6.6.45.". I doubt that there 
could still have problem described in bugzilla [1] even if commit 
c0a40097f0bc ("drivers: core: synchronize really_probe() and 
dev_uevent()") has been reverted, and the problem is not directly 
introduced by rcu in dev_uevent.

[1] https://bugzilla.kernel.org/show_bug.cgi?id=219244

Thanks,
Zekun
Re: Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by Dan Williams 1 year ago
zhangzekun (A) wrote:
> 
> 
> 在 2024/12/31 16:26, Greg KH 写道:
> > On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> >> Hi, Dan, Greg,
> >>
> >> We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
> > 
> > The revert just puts the original logic back in place, so this is not
> > anything new that has been introduced, right?  It's just that the
> > attempted fix didn't work, so a different fix needs to happen.
> Hi, Greg,
> 
> Yes, there is nothing new introduced here. We have been testing the rcu 
> fix (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver 
> detach race")) for monthes but has not obersved problems.

Hi Zekun,

If you have some cycles to help investigate the replacement fix that
would be much appreciated.

So far I came up with this:

http://lore.kernel.org/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com

...but have not had time to debug the 0day report.

My worry is still that Linux has long since shipped the expectation that
reading 'uevent' bounces the device_lock() which, among other things,
makes sure that any in-flight driver probing has completed.

The report of USB devices disappearing is consistent with a udev
operation failing due to the driver not being done attaching, or
something similar.

So even though you have not seen any issues, I suspect small differences
in the devices on your system and the reporter's system, or udev rule
differences could result in a failure to trigger the regression.
Re: Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by zhangzekun (A) 1 year ago
> Hi Zekun,
> 
> If you have some cycles to help investigate the replacement fix that
> would be much appreciated.
> 
> So far I came up with this:
> 
> http://lore.kernel.org/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com
> 
> ...but have not had time to debug the 0day report.
> 
> My worry is still that Linux has long since shipped the expectation that
> reading 'uevent' bounces the device_lock() which, among other things,
> makes sure that any in-flight driver probing has completed.
> 
> The report of USB devices disappearing is consistent with a udev
> operation failing due to the driver not being done attaching, or
> something similar.
> 
> So even though you have not seen any issues, I suspect small differences
> in the devices on your system and the reporter's system, or udev rule
> differences could result in a failure to trigger the regression.
> 
> 
Hi, Dan

The patch trys to prevent the potential dead lock by letting 
device_lock() gets before kernfs_get_active(), but I think it might not 
be enough to solve all the potential dead lock. The device_lock() will 
be held while removing the driver in device_release_driver(), which 
means will be held through driver's .remove(). If .remove() is waiting 
for resources to be released by userspace process, and then the 
userspace process call device_lock() in uevent_show(), we will have dead 
locks here.

In the following case:

device_release_driver
device_lock()          blocked by
<operation 1> ---------------------------> <operation 2>
                                            uevent_show()
                                            device_lock()


In [1], It is beacause that pci_stop_and_remove_bus_device() will call 
device_release_driver() first and then call device_del() to remove the 
uevent sysfs attributes (Sorry for mistakes made in [1], 
device_release_driver() is called by pci_stop_bus_device()). So, I think 
it might not be a good idea to hold the device_lock() through uevent_show().

ioctl(..,VFIO_GROUP_GET_DEVICE_FD,..)
         ...
         vfio_device_get_from_name()
         vfio_devcie->refcount -> 2
                                         pci_stop_and_remove_bus_device()
                                         pci_stop_bus_device()
                                         device_release_driver()
                                         device_lock()
                                         device_remove()
												vfio_unregister_group_dev()
												vfio_device_put_registration()
												vfio_devcie->refcount -> 1
												wait for refcount == 0
uevent_show()
         device_lock()

[1] 
https://lore.kernel.org/lkml/20241231075608.84009-1-zhangzekun11@huawei.com/

Best Regards,
Zekun
Re: Possible hungtask issue will be introduced with device_lock() in uevent_show()
Posted by Greg KH 1 year, 1 month ago
On Sat, Jan 04, 2025 at 02:02:54PM +0800, zhangzekun (A) wrote:
> 
> 
> 在 2024/12/31 16:26, Greg KH 写道:
> > On Tue, Dec 31, 2024 at 03:56:08PM +0800, Zhang Zekun wrote:
> > > Hi, Dan, Greg,
> > > 
> > > We have found a potential tungtask issue has been introduce by commit 9a71892cbcdb ("Revert "driver core: Fix uevent_show() vs driver detach race""), which revert the rcu in device_uevent but reintroduce the device_lock() in uevent_show(). The reproduce procedure is quite simple:
> > 
> > The revert just puts the original logic back in place, so this is not
> > anything new that has been introduced, right?  It's just that the
> > attempted fix didn't work, so a different fix needs to happen.
> Hi, Greg,
> 
> Yes, there is nothing new introduced here. We have been testing the rcu fix
> (commit 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
> race")) for monthes but has not obersved problems.

That's great, but other people did report problems, which can't be
ignored.

> Noticing that, brmails+k write in bugzilla "Also, I can't say why the issue
> appeared in the past even without this commit being present, as I haven't
> bisected any kernel version before v6.6.45.". I doubt that there could still
> have problem described in bugzilla [1] even if commit c0a40097f0bc
> ("drivers: core: synchronize really_probe() and dev_uevent()") has been
> reverted, and the problem is not directly introduced by rcu in dev_uevent.
> 
> [1] https://bugzilla.kernel.org/show_bug.cgi?id=219244

So you think that the bug report is wrong?  Ok, then you might need to
take the time to prove it :)

If you feel this should be added back, please feel free to resubmit it
with the new information as to why it should be ok now to apply it.

thanks,

greg k-h
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Dan Williams 1 year, 4 months ago
Dan Williams wrote:
> Changes since v1 [1]:
> - Move the new "locked" infrastructure to private header files to make
>   it clear it is not approved for general usage (Greg)

Greg, per the 0day report and further testing I am missing something
subtle in using kernfs open files to pin device objects. So hold off on
this for now until I can get that root caused. If someone else can spot
what I missed feel free to chime in, but otherwise I will circle back.

If I don't get back to this before -rc6 I think the theoretical deadlock
that would be re-introduced by a revert of 15fffc6a5624 would be
preferable to this reported regression. I am not aware of any reports of
that deadlock triggering in practice.
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Thorsten Leemhuis 1 year, 3 months ago
On 08.10.24 02:26, Dan Williams wrote:
> Dan Williams wrote:
>> Changes since v1 [1]:
>> - Move the new "locked" infrastructure to private header files to make
>>   it clear it is not approved for general usage (Greg)
> 
> Greg, per the 0day report and further testing I am missing something
> subtle in using kernfs open files to pin device objects. So hold off on
> this for now until I can get that root caused. If someone else can spot
> what I missed feel free to chime in, but otherwise I will circle back.
> 
> If I don't get back to this before -rc6 I think the theoretical deadlock
> that would be re-introduced by a revert of 15fffc6a5624 would be
> preferable to this reported regression. I am not aware of any reports of
> that deadlock triggering in practice.
Was there any progress? If not: given that Linus prefers to have things
fixed by -rc6 I wonder if now would be a good time to get the revert on
track for a merge later this week.

Ciao, Thorsten

#regzbot poke
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Dan Williams 1 year, 3 months ago
Thorsten Leemhuis wrote:
> On 08.10.24 02:26, Dan Williams wrote:
> > Dan Williams wrote:
> >> Changes since v1 [1]:
> >> - Move the new "locked" infrastructure to private header files to make
> >>   it clear it is not approved for general usage (Greg)
> > 
> > Greg, per the 0day report and further testing I am missing something
> > subtle in using kernfs open files to pin device objects. So hold off on
> > this for now until I can get that root caused. If someone else can spot
> > what I missed feel free to chime in, but otherwise I will circle back.
> > 
> > If I don't get back to this before -rc6 I think the theoretical deadlock
> > that would be re-introduced by a revert of 15fffc6a5624 would be
> > preferable to this reported regression. I am not aware of any reports of
> > that deadlock triggering in practice.
> Was there any progress? If not: given that Linus prefers to have things
> fixed by -rc6 I wonder if now would be a good time to get the revert on
> track for a merge later this week.

Revert 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
race"), which reintroduces a theoretical lockdep splat, is my preference
at this point.

Even if I had a new version of this replacement patch in hand today I
would still want it to be v6.13 material, not v6.12-rc. It deserves a
full kernel cycle soak time to shake out issues before release.
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Greg KH 1 year, 3 months ago
On Mon, Oct 28, 2024 at 03:57:55PM -0700, Dan Williams wrote:
> Thorsten Leemhuis wrote:
> > On 08.10.24 02:26, Dan Williams wrote:
> > > Dan Williams wrote:
> > >> Changes since v1 [1]:
> > >> - Move the new "locked" infrastructure to private header files to make
> > >>   it clear it is not approved for general usage (Greg)
> > > 
> > > Greg, per the 0day report and further testing I am missing something
> > > subtle in using kernfs open files to pin device objects. So hold off on
> > > this for now until I can get that root caused. If someone else can spot
> > > what I missed feel free to chime in, but otherwise I will circle back.
> > > 
> > > If I don't get back to this before -rc6 I think the theoretical deadlock
> > > that would be re-introduced by a revert of 15fffc6a5624 would be
> > > preferable to this reported regression. I am not aware of any reports of
> > > that deadlock triggering in practice.
> > Was there any progress? If not: given that Linus prefers to have things
> > fixed by -rc6 I wonder if now would be a good time to get the revert on
> > track for a merge later this week.
> 
> Revert 15fffc6a5624 ("driver core: Fix uevent_show() vs driver detach
> race"), which reintroduces a theoretical lockdep splat, is my preference
> at this point.
> 
> Even if I had a new version of this replacement patch in hand today I
> would still want it to be v6.13 material, not v6.12-rc. It deserves a
> full kernel cycle soak time to shake out issues before release.

Ok, I'll do the revert in my tree and push it for the next -rc release
thanks.

greg k-h
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by Greg KH 1 year, 3 months ago
On Mon, Oct 07, 2024 at 05:26:57PM -0700, Dan Williams wrote:
> Dan Williams wrote:
> > Changes since v1 [1]:
> > - Move the new "locked" infrastructure to private header files to make
> >   it clear it is not approved for general usage (Greg)
> 
> Greg, per the 0day report and further testing I am missing something
> subtle in using kernfs open files to pin device objects. So hold off on
> this for now until I can get that root caused. If someone else can spot
> what I missed feel free to chime in, but otherwise I will circle back.

Ok, I'll drop this for now and wait for a follow-on patch to come for
whatever you decide.

thanks,

greg k-h
Re: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier
Posted by kernel test robot 1 year, 4 months ago

Hello,

kernel test robot noticed "BUG:KASAN:slab-use-after-free_in__mutex_lock" on:

commit: cfb789a84ee6fe1368d091941af50e0eb6381d30 ("[PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier")
url: https://github.com/intel-lab-lkp/linux/commits/Dan-Williams/driver-core-Fix-userspace-expectations-of-uevent_show-as-a-probe-barrier/20241003-055515
base: https://git.kernel.org/cgit/linux/kernel/git/gregkh/driver-core.git 9852d85ec9d492ebef56dc5f229416c925758edc
patch link: https://lore.kernel.org/all/172790598832.1168608.4519484276671503678.stgit@dwillia2-xfh.jf.intel.com/
patch subject: [PATCH v2] driver core: Fix userspace expectations of uevent_show() as a probe barrier

in testcase: blktests
version: blktests-x86_64-80430af-1_20240910
with following parameters:

	disk: 1SSD
	test: block-001



compiler: gcc-12
test machine: 4 threads Intel(R) Core(TM) i5-6500 CPU @ 3.20GHz (Skylake) with 32G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202410071741.4aa3984e-oliver.sang@intel.com


[ 51.504416][ T130] BUG: KASAN: slab-use-after-free in __mutex_lock+0x1003/0x1170 
[   51.504423][  T130] Read of size 8 at addr ffff8887ff562250 by task systemd-journal/130
[   51.504427][  T130]
[   51.504429][  T130] CPU: 2 UID: 0 PID: 130 Comm: systemd-journal Not tainted 6.12.0-rc1-00001-gcfb789a84ee6 #1
[   51.504434][  T130] Hardware name: Dell Inc. OptiPlex 7040/0Y7WYT, BIOS 1.8.1 12/05/2017
[   51.504436][  T130] Call Trace:
[   51.504438][  T130]  <TASK>
[ 51.504439][ T130] dump_stack_lvl (lib/dump_stack.c:123 (discriminator 1)) 
[ 51.504445][ T130] print_address_description+0x2c/0x3a0 
[ 51.504451][ T130] ? __mutex_lock+0x1003/0x1170 
[ 51.504455][ T130] print_report (mm/kasan/report.c:489) 
[ 51.504459][ T130] ? kasan_addr_to_slab (mm/kasan/common.c:37) 
[ 51.504462][ T130] ? __mutex_lock+0x1003/0x1170 
[ 51.504466][ T130] kasan_report (mm/kasan/report.c:603) 
[ 51.504470][ T130] ? __mutex_lock+0x1003/0x1170 
[ 51.504475][ T130] __mutex_lock+0x1003/0x1170 
[ 51.504480][ T130] ? __pfx___mutex_lock+0x10/0x10 
[ 51.504485][ T130] ? __pfx___might_resched (kernel/sched/core.c:8586) 
[ 51.504491][ T130] mutex_lock (kernel/locking/mutex.c:286) 
[ 51.504495][ T130] ? __pfx_mutex_lock (kernel/locking/mutex.c:282) 
[ 51.504499][ T130] ? __kmalloc_node_noprof (include/linux/kasan.h:257 mm/slub.c:4265 mm/slub.c:4271) 
[ 51.504503][ T130] ? seq_read_iter (fs/seq_file.c:210) 
[ 51.504507][ T130] kernfs_seq_start (fs/kernfs/file.c:169) 
[ 51.504511][ T130] seq_read_iter (fs/seq_file.c:225) 
[ 51.504516][ T130] ? rw_verify_area (fs/read_write.c:470) 
[ 51.504521][ T130] vfs_read (fs/read_write.c:488 fs/read_write.c:569) 
[ 51.504524][ T130] ? kernfs_iop_getattr (fs/kernfs/inode.c:197) 
[ 51.504528][ T130] ? __pfx_vfs_read (fs/read_write.c:550) 
[ 51.504533][ T130] ? fdget_pos (arch/x86/include/asm/atomic64_64.h:15 include/linux/atomic/atomic-arch-fallback.h:2583 include/linux/atomic/atomic-long.h:38 include/linux/atomic/atomic-instrumented.h:3189 fs/file.c:1177 fs/file.c:1185) 
[ 51.504537][ T130] ? __pfx___seccomp_filter (kernel/seccomp.c:1218) 
[ 51.504543][ T130] ksys_read (fs/read_write.c:712) 
[ 51.504546][ T130] ? __pfx_ksys_read (fs/read_write.c:702) 
[ 51.504550][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) 
[ 51.504555][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[   51.504558][  T130] RIP: 0033:0x7fddff64e1dc
[ 51.504562][ T130] Code: ec 28 48 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 d9 d5 f8 ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 31 c0 0f 05 <48> 3d 00 f0 ff ff 77 34 44 89 c7 48 89 44 24 08 e8 2f d6 f8 ff 48
All code
========
   0:	ec                   	in     (%dx),%al
   1:	28 48 89             	sub    %cl,-0x77(%rax)
   4:	54                   	push   %rsp
   5:	24 18                	and    $0x18,%al
   7:	48 89 74 24 10       	mov    %rsi,0x10(%rsp)
   c:	89 7c 24 08          	mov    %edi,0x8(%rsp)
  10:	e8 d9 d5 f8 ff       	callq  0xfffffffffff8d5ee
  15:	48 8b 54 24 18       	mov    0x18(%rsp),%rdx
  1a:	48 8b 74 24 10       	mov    0x10(%rsp),%rsi
  1f:	41 89 c0             	mov    %eax,%r8d
  22:	8b 7c 24 08          	mov    0x8(%rsp),%edi
  26:	31 c0                	xor    %eax,%eax
  28:	0f 05                	syscall 
  2a:*	48 3d 00 f0 ff ff    	cmp    $0xfffffffffffff000,%rax		<-- trapping instruction
  30:	77 34                	ja     0x66
  32:	44 89 c7             	mov    %r8d,%edi
  35:	48 89 44 24 08       	mov    %rax,0x8(%rsp)
  3a:	e8 2f d6 f8 ff       	callq  0xfffffffffff8d66e
  3f:	48                   	rex.W

Code starting with the faulting instruction
===========================================
   0:	48 3d 00 f0 ff ff    	cmp    $0xfffffffffffff000,%rax
   6:	77 34                	ja     0x3c
   8:	44 89 c7             	mov    %r8d,%edi
   b:	48 89 44 24 08       	mov    %rax,0x8(%rsp)
  10:	e8 2f d6 f8 ff       	callq  0xfffffffffff8d644
  15:	48                   	rex.W
[   51.504565][  T130] RSP: 002b:00007ffe3e501df0 EFLAGS: 00000246 ORIG_RAX: 0000000000000000
[   51.504569][  T130] RAX: ffffffffffffffda RBX: 000055d748fcdac0 RCX: 00007fddff64e1dc
[   51.504572][  T130] RDX: 0000000000001008 RSI: 000055d748fcdac0 RDI: 000000000000001c
[   51.504574][  T130] RBP: 000000000000001c R08: 0000000000000000 R09: 00007fddff728ce0
[   51.504576][  T130] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000001008
[   51.504578][  T130] R13: 0000000000001007 R14: ffffffffffffffff R15: 0000000000000002
[   51.504583][  T130]  </TASK>
[   51.504585][  T130]
[   51.504586][  T130] Allocated by task 1044:
[ 51.504588][ T130] kasan_save_stack (mm/kasan/common.c:48) 
[ 51.504591][ T130] kasan_save_track (arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) 
[ 51.504594][ T130] __kasan_kmalloc (mm/kasan/common.c:377 mm/kasan/common.c:394) 
[ 51.504597][ T130] __kmalloc_noprof (include/linux/kasan.h:257 mm/slub.c:4265 mm/slub.c:4277) 
[ 51.504600][ T130] scsi_alloc_sdev (include/linux/slab.h:882 include/linux/slab.h:1014 drivers/scsi/scsi_scan.c:288) 
[ 51.504604][ T130] scsi_probe_and_add_lun (drivers/scsi/scsi_scan.c:1210) 
[ 51.504607][ T130] __scsi_scan_target (drivers/scsi/scsi_scan.c:1769) 
[ 51.504610][ T130] scsi_scan_host_selected (drivers/scsi/scsi_scan.c:1877) 
[ 51.504614][ T130] store_scan (drivers/scsi/scsi_sysfs.c:151 drivers/scsi/scsi_sysfs.c:191) 
[ 51.504617][ T130] kernfs_fop_write_iter (fs/kernfs/file.c:348) 
[ 51.504619][ T130] vfs_write (fs/read_write.c:590 fs/read_write.c:683) 
[ 51.504622][ T130] ksys_write (fs/read_write.c:736) 
[ 51.504624][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) 
[ 51.504627][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[   51.504629][  T130]
[   51.504630][  T130] Freed by task 1044:
[ 51.504632][ T130] kasan_save_stack (mm/kasan/common.c:48) 
[ 51.504635][ T130] kasan_save_track (arch/x86/include/asm/current.h:49 mm/kasan/common.c:60 mm/kasan/common.c:69) 
[ 51.504638][ T130] kasan_save_free_info (mm/kasan/generic.c:582) 
[ 51.504640][ T130] __kasan_slab_free (mm/kasan/common.c:271) 
[ 51.504643][ T130] kfree (mm/slub.c:4580 mm/slub.c:4728) 
[ 51.504646][ T130] scsi_device_dev_release (drivers/scsi/scsi_sysfs.c:521 (discriminator 5)) 
[ 51.504649][ T130] device_release (drivers/base/core.c:2575) 
[ 51.504653][ T130] kobject_cleanup (lib/kobject.c:689) 
[ 51.504656][ T130] scsi_device_put (drivers/scsi/scsi.c:794) 
[ 51.504658][ T130] sdev_store_delete (drivers/scsi/scsi_sysfs.c:791) 
[ 51.504661][ T130] kernfs_fop_write_iter (fs/kernfs/file.c:348) 
[ 51.504664][ T130] vfs_write (fs/read_write.c:590 fs/read_write.c:683) 
[ 51.504666][ T130] ksys_write (fs/read_write.c:736) 
[ 51.504668][ T130] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) 
[ 51.504671][ T130] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) 
[   51.504674][  T130]
[   51.504675][  T130] Last potentially related work creation:
[ 51.504676][ T130] kasan_save_stack (mm/kasan/common.c:48) 
[ 51.504679][ T130] __kasan_record_aux_stack (mm/kasan/generic.c:541) 
[ 51.504681][ T130] insert_work (include/linux/instrumented.h:68 include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) 
[ 51.504685][ T130] __queue_work (kernel/workqueue.c:6723) 
[ 51.504689][ T130] queue_work_on (kernel/workqueue.c:2391) 
[ 51.504692][ T130] sdev_evt_send (include/linux/spinlock.h:406 drivers/scsi/scsi_lib.c:2645) 
[ 51.504694][ T130] scsi_evt_thread (drivers/scsi/scsi_lib.c:2596) 
[ 51.504697][ T130] process_one_work (kernel/workqueue.c:3229) 
[ 51.504701][ T130] worker_thread (kernel/workqueue.c:3304 kernel/workqueue.c:3391) 
[ 51.504704][ T130] kthread (kernel/kthread.c:389) 
[ 51.504707][ T130] ret_from_fork (arch/x86/kernel/process.c:147) 
[ 51.504710][ T130] ret_from_fork_asm (arch/x86/entry/entry_64.S:257) 
[   51.504713][  T130]
[   51.504714][  T130] Second to last potentially related work creation:
[ 51.504716][ T130] kasan_save_stack (mm/kasan/common.c:48) 
[ 51.504719][ T130] __kasan_record_aux_stack (mm/kasan/generic.c:541) 
[ 51.504721][ T130] insert_work (include/linux/instrumented.h:68 include/asm-generic/bitops/instrumented-non-atomic.h:141 kernel/workqueue.c:788 kernel/workqueue.c:795 kernel/workqueue.c:2186) 
[ 51.504724][ T130] __queue_work (kernel/workqueue.c:6723) 
[ 51.504727][ T130] queue_work_on (kernel/workqueue.c:2391) 
[ 51.504731][ T130] scsi_check_sense (include/scsi/scsi_eh.h:24 drivers/scsi/scsi_error.c:550) 
[ 51.504734][ T130] scsi_decide_disposition (drivers/scsi/scsi_error.c:2024) 
[ 51.504737][ T130] scsi_complete (drivers/scsi/scsi_lib.c:1515) 
[ 51.504740][ T130] blk_complete_reqs (block/blk-mq.c:1126 (discriminator 3)) 
[ 51.504744][ T130] handle_softirqs (kernel/softirq.c:554) 
[ 51.504747][ T130] run_ksoftirqd (arch/x86/include/asm/irqflags.h:42 arch/x86/include/asm/irqflags.h:97 kernel/softirq.c:411 kernel/softirq.c:928 kernel/softirq.c:919) 
[ 51.504750][ T130] smpboot_thread_fn (kernel/smpboot.c:164 (discriminator 3)) 
[ 51.504753][ T130] kthread (kernel/kthread.c:389) 
[ 51.504755][ T130] ret_from_fork (arch/x86/kernel/process.c:147) 
[ 51.504758][ T130] ret_from_fork_asm (arch/x86/entry/entry_64.S:257) 
[   51.504760][  T130]
[   51.504761][  T130] The buggy address belongs to the object at ffff8887ff562000
[   51.504761][  T130]  which belongs to the cache kmalloc-4k of size 4096
[   51.504764][  T130] The buggy address is located 592 bytes inside of
[   51.504764][  T130]  freed 4096-byte region [ffff8887ff562000, ffff8887ff563000)
[   51.504767][  T130]
[   51.504768][  T130] The buggy address belongs to the physical page:
[   51.504770][  T130] page: refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x7ff560
[   51.504774][  T130] head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0
[   51.504776][  T130] flags: 0x17ffffc0000040(head|node=0|zone=2|lastcpupid=0x1fffff)
[   51.504780][  T130] page_type: f5(slab)
[   51.504783][  T130] raw: 0017ffffc0000040 ffff88810c843040 dead000000000122 0000000000000000
[   51.504786][  T130] raw: 0000000000000000 0000000000040004 00000001f5000000 0000000000000000
[   51.504789][  T130] head: 0017ffffc0000040 ffff88810c843040 dead000000000122 0000000000000000


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20241007/202410071741.4aa3984e-oliver.sang@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki