[PATCH v5] devcoredump : Serialize devcd_del work

Mukesh Ojha posted 1 patch 3 years, 11 months ago
There is a newer version of this series
drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 81 insertions(+), 2 deletions(-)
[PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 11 months ago
In following scenario(diagram), when one thread X running dev_coredumpm()
adds devcd device to the framework which sends uevent notification to
userspace and another thread Y reads this uevent and call to
devcd_data_write() which eventually try to delete the queued timer that
is not initialized/queued yet.

So, debug object reports some warning and in the meantime, timer is
initialized and queued from X path. and from Y path, it gets reinitialized
again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.

To fix this, introduce mutex and a boolean flag to serialize the behaviour.

 	cpu0(X)			                cpu1(Y)

    dev_coredump() uevent sent to user space
    device_add()  ======================> user space process Y reads the
                                          uevents writes to devcd fd
                                          which results into writes to

                                         devcd_data_write()
                                           mod_delayed_work()
                                             try_to_grab_pending()
                                               del_timer()
                                                 debug_assert_init()
   INIT_DELAYED_WORK()
   schedule_delayed_work()
                                                   debug_object_fixup()
                                                     timer_fixup_assert_init()
                                                       timer_setup()
                                                         do_init_timer()
                                                       /*
                                                        Above call reinitializes
                                                        the timer to
                                                        timer->entry.pprev=NULL
                                                        and this will be checked
                                                        later in timer_pending() call.
                                                       */
                                                 timer_pending()
                                                  !hlist_unhashed_lockless(&timer->entry)
                                                    !h->pprev
                                                /*
                                                  del_timer() checks h->pprev and finds
                                                  it to be NULL due to which
                                                  try_to_grab_pending() stucks.
                                                */

Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
---
v4->v5:
 - Rebased it.

v3->v4:
 - flg variable renamed to delete_work.

v2->v3:
 Addressed comments from gregkh
 - Wrapped the commit text and corrected the alignment.
 - Described the reason to introduce new variables.
 - Restored the blank line.
 - rename the del_wk_queued to flg.
 Addressed comments from tglx
 - Added a comment which explains the race which looks obvious however
   would not occur between disabled_store and devcd_del work.


v1->v2:
 - Added del_wk_queued flag to serialize the race between devcd_data_write()
   and disabled_store() => devcd_free().
 drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 81 insertions(+), 2 deletions(-)

diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index f4d794d..1c06781 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -25,6 +25,47 @@ struct devcd_entry {
 	struct device devcd_dev;
 	void *data;
 	size_t datalen;
+	/*
+	 * Here, mutex is required to serialize the calls to del_wk work between
+	 * user/kernel space which happens when devcd is added with device_add()
+	 * and that sends uevent to user space. User space reads the uevents,
+	 * and calls to devcd_data_write() which try to modify the work which is
+	 * not even initialized/queued from devcoredump.
+	 *
+	 *
+	 *
+	 *        cpu0(X)                                 cpu1(Y)
+	 *
+	 *        dev_coredump() uevent sent to user space
+	 *        device_add()  ======================> user space process Y reads the
+	 *                                              uevents writes to devcd fd
+	 *                                              which results into writes to
+	 *
+	 *                                             devcd_data_write()
+	 *                                               mod_delayed_work()
+	 *                                                 try_to_grab_pending()
+	 *                                                   del_timer()
+	 *                                                     debug_assert_init()
+	 *       INIT_DELAYED_WORK()
+	 *       schedule_delayed_work()
+	 *
+	 *
+	 * Also, mutex alone would not be enough to avoid scheduling of
+	 * del_wk work after it get flush from a call to devcd_free()
+	 * mentioned as below.
+	 *
+	 *	disabled_store()
+	 *        devcd_free()
+	 *          mutex_lock()             devcd_data_write()
+	 *          flush_delayed_work()
+	 *          mutex_unlock()
+	 *                                   mutex_lock()
+	 *                                   mod_delayed_work()
+	 *                                   mutex_unlock()
+	 * So, delete_work flag is required.
+	 */
+	struct mutex mutex;
+	bool delete_work;
 	struct module *owner;
 	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
 			void *data, size_t datalen);
@@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
-	mod_delayed_work(system_wq, &devcd->del_wk, 0);
+	mutex_lock(&devcd->mutex);
+	if (!devcd->delete_work) {
+		devcd->delete_work = true;
+		mod_delayed_work(system_wq, &devcd->del_wk, 0);
+	}
+	mutex_unlock(&devcd->mutex);
 
 	return count;
 }
@@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
 {
 	struct devcd_entry *devcd = dev_to_devcd(dev);
 
+	mutex_lock(&devcd->mutex);
+	if (!devcd->delete_work)
+		devcd->delete_work = true;
+
 	flush_delayed_work(&devcd->del_wk);
+	mutex_unlock(&devcd->mutex);
 	return 0;
 }
 
@@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
 	return sysfs_emit(buf, "%d\n", devcd_disabled);
 }
 
+/*
+ *
+ *	disabled_store()                                	worker()
+ *	 class_for_each_device(&devcd_class,
+ *		NULL, NULL, devcd_free)
+ *         ...
+ *         ...
+ *	   while ((dev = class_dev_iter_next(&iter))
+ *                                                             devcd_del()
+ *                                                               device_del()
+ *                                                                 put_device() <- last reference
+ *             error = fn(dev, data)                           devcd_dev_release()
+ *             devcd_free(dev, data)                           kfree(devcd)
+ *             mutex_lock(&devcd->mutex);
+ *
+ *
+ * In the above diagram, It looks like disabled_store() would be racing with parallely
+ * running devcd_del() and result in memory abort while acquiring devcd->mutex which
+ * is called after kfree of devcd memory  after dropping its last reference with
+ * put_device(). However, this will not happens as fn(dev, data) runs
+ * with its own reference to device via klist_node so it is not its last reference.
+ * so, above situation would not occur.
+ */
+
 static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
 			      const char *buf, size_t count)
 {
@@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 	devcd->read = read;
 	devcd->free = free;
 	devcd->failing_dev = get_device(dev);
+	devcd->delete_work = false;
 
+	mutex_init(&devcd->mutex);
 	device_initialize(&devcd->devcd_dev);
 
 	dev_set_name(&devcd->devcd_dev, "devcd%d",
 		     atomic_inc_return(&devcd_count));
 	devcd->devcd_dev.class = &devcd_class;
 
+	mutex_lock(&devcd->mutex);
 	if (device_add(&devcd->devcd_dev))
 		goto put_device;
 
@@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
 
 	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
 	schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
-
+	mutex_unlock(&devcd->mutex);
 	return;
  put_device:
 	put_device(&devcd->devcd_dev);
+	mutex_unlock(&devcd->mutex);
  put_module:
 	module_put(owner);
  free:
-- 
2.7.4
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 8 months ago
Hi Johannes/Kees,

Sorry for reminding on it again.
Any hope of this one to get into devcoredump ?

-Mukesh


On 5/27/2022 7:33 PM, Mukesh Ojha wrote:
> In following scenario(diagram), when one thread X running dev_coredumpm()
> adds devcd device to the framework which sends uevent notification to
> userspace and another thread Y reads this uevent and call to
> devcd_data_write() which eventually try to delete the queued timer that
> is not initialized/queued yet.
> 
> So, debug object reports some warning and in the meantime, timer is
> initialized and queued from X path. and from Y path, it gets reinitialized
> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
> 
> To fix this, introduce mutex and a boolean flag to serialize the behaviour.
> 
>   	cpu0(X)			                cpu1(Y)
> 
>      dev_coredump() uevent sent to user space
>      device_add()  ======================> user space process Y reads the
>                                            uevents writes to devcd fd
>                                            which results into writes to
> 
>                                           devcd_data_write()
>                                             mod_delayed_work()
>                                               try_to_grab_pending()
>                                                 del_timer()
>                                                   debug_assert_init()
>     INIT_DELAYED_WORK()
>     schedule_delayed_work()
>                                                     debug_object_fixup()
>                                                       timer_fixup_assert_init()
>                                                         timer_setup()
>                                                           do_init_timer()
>                                                         /*
>                                                          Above call reinitializes
>                                                          the timer to
>                                                          timer->entry.pprev=NULL
>                                                          and this will be checked
>                                                          later in timer_pending() call.
>                                                         */
>                                                   timer_pending()
>                                                    !hlist_unhashed_lockless(&timer->entry)
>                                                      !h->pprev
>                                                  /*
>                                                    del_timer() checks h->pprev and finds
>                                                    it to be NULL due to which
>                                                    try_to_grab_pending() stucks.
>                                                  */
> 
> Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
> ---
> v4->v5:
>   - Rebased it.
> 
> v3->v4:
>   - flg variable renamed to delete_work.
> 
> v2->v3:
>   Addressed comments from gregkh
>   - Wrapped the commit text and corrected the alignment.
>   - Described the reason to introduce new variables.
>   - Restored the blank line.
>   - rename the del_wk_queued to flg.
>   Addressed comments from tglx
>   - Added a comment which explains the race which looks obvious however
>     would not occur between disabled_store and devcd_del work.
> 
> 
> v1->v2:
>   - Added del_wk_queued flag to serialize the race between devcd_data_write()
>     and disabled_store() => devcd_free().
>   drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 81 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
> index f4d794d..1c06781 100644
> --- a/drivers/base/devcoredump.c
> +++ b/drivers/base/devcoredump.c
> @@ -25,6 +25,47 @@ struct devcd_entry {
>   	struct device devcd_dev;
>   	void *data;
>   	size_t datalen;
> +	/*
> +	 * Here, mutex is required to serialize the calls to del_wk work between
> +	 * user/kernel space which happens when devcd is added with device_add()
> +	 * and that sends uevent to user space. User space reads the uevents,
> +	 * and calls to devcd_data_write() which try to modify the work which is
> +	 * not even initialized/queued from devcoredump.
> +	 *
> +	 *
> +	 *
> +	 *        cpu0(X)                                 cpu1(Y)
> +	 *
> +	 *        dev_coredump() uevent sent to user space
> +	 *        device_add()  ======================> user space process Y reads the
> +	 *                                              uevents writes to devcd fd
> +	 *                                              which results into writes to
> +	 *
> +	 *                                             devcd_data_write()
> +	 *                                               mod_delayed_work()
> +	 *                                                 try_to_grab_pending()
> +	 *                                                   del_timer()
> +	 *                                                     debug_assert_init()
> +	 *       INIT_DELAYED_WORK()
> +	 *       schedule_delayed_work()
> +	 *
> +	 *
> +	 * Also, mutex alone would not be enough to avoid scheduling of
> +	 * del_wk work after it get flush from a call to devcd_free()
> +	 * mentioned as below.
> +	 *
> +	 *	disabled_store()
> +	 *        devcd_free()
> +	 *          mutex_lock()             devcd_data_write()
> +	 *          flush_delayed_work()
> +	 *          mutex_unlock()
> +	 *                                   mutex_lock()
> +	 *                                   mod_delayed_work()
> +	 *                                   mutex_unlock()
> +	 * So, delete_work flag is required.
> +	 */
> +	struct mutex mutex;
> +	bool delete_work;
>   	struct module *owner;
>   	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
>   			void *data, size_t datalen);
> @@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
>   	struct device *dev = kobj_to_dev(kobj);
>   	struct devcd_entry *devcd = dev_to_devcd(dev);
>   
> -	mod_delayed_work(system_wq, &devcd->del_wk, 0);
> +	mutex_lock(&devcd->mutex);
> +	if (!devcd->delete_work) {
> +		devcd->delete_work = true;
> +		mod_delayed_work(system_wq, &devcd->del_wk, 0);
> +	}
> +	mutex_unlock(&devcd->mutex);
>   
>   	return count;
>   }
> @@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
>   {
>   	struct devcd_entry *devcd = dev_to_devcd(dev);
>   
> +	mutex_lock(&devcd->mutex);
> +	if (!devcd->delete_work)
> +		devcd->delete_work = true;
> +
>   	flush_delayed_work(&devcd->del_wk);
> +	mutex_unlock(&devcd->mutex);
>   	return 0;
>   }
>   
> @@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
>   	return sysfs_emit(buf, "%d\n", devcd_disabled);
>   }
>   
> +/*
> + *
> + *	disabled_store()                                	worker()
> + *	 class_for_each_device(&devcd_class,
> + *		NULL, NULL, devcd_free)
> + *         ...
> + *         ...
> + *	   while ((dev = class_dev_iter_next(&iter))
> + *                                                             devcd_del()
> + *                                                               device_del()
> + *                                                                 put_device() <- last reference
> + *             error = fn(dev, data)                           devcd_dev_release()
> + *             devcd_free(dev, data)                           kfree(devcd)
> + *             mutex_lock(&devcd->mutex);
> + *
> + *
> + * In the above diagram, It looks like disabled_store() would be racing with parallely
> + * running devcd_del() and result in memory abort while acquiring devcd->mutex which
> + * is called after kfree of devcd memory  after dropping its last reference with
> + * put_device(). However, this will not happens as fn(dev, data) runs
> + * with its own reference to device via klist_node so it is not its last reference.
> + * so, above situation would not occur.
> + */
> +
>   static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
>   			      const char *buf, size_t count)
>   {
> @@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>   	devcd->read = read;
>   	devcd->free = free;
>   	devcd->failing_dev = get_device(dev);
> +	devcd->delete_work = false;
>   
> +	mutex_init(&devcd->mutex);
>   	device_initialize(&devcd->devcd_dev);
>   
>   	dev_set_name(&devcd->devcd_dev, "devcd%d",
>   		     atomic_inc_return(&devcd_count));
>   	devcd->devcd_dev.class = &devcd_class;
>   
> +	mutex_lock(&devcd->mutex);
>   	if (device_add(&devcd->devcd_dev))
>   		goto put_device;
>   
> @@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>   
>   	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
>   	schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
> -
> +	mutex_unlock(&devcd->mutex);
>   	return;
>    put_device:
>   	put_device(&devcd->devcd_dev);
> +	mutex_unlock(&devcd->mutex);
>    put_module:
>   	module_put(owner);
>    free:
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Kees Cook 3 years, 8 months ago
On Thu, Aug 11, 2022 at 09:43:48PM +0530, Mukesh Ojha wrote:
> Hi Johannes/Kees,

Hi!

> 
> Sorry for reminding on it again.
> Any hope of this one to get into devcoredump ?

I don't know this code well enough to comment on the solution, but it
seems designed and justified correctly, at least. :)

I'll leave it to Johannes for review.

-Kees

> 
> -Mukesh
> 
> 
> On 5/27/2022 7:33 PM, Mukesh Ojha wrote:
> > In following scenario(diagram), when one thread X running dev_coredumpm()
> > adds devcd device to the framework which sends uevent notification to
> > userspace and another thread Y reads this uevent and call to
> > devcd_data_write() which eventually try to delete the queued timer that
> > is not initialized/queued yet.
> > 
> > So, debug object reports some warning and in the meantime, timer is
> > initialized and queued from X path. and from Y path, it gets reinitialized
> > again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
> > 
> > To fix this, introduce mutex and a boolean flag to serialize the behaviour.
> > 
> >   	cpu0(X)			                cpu1(Y)
> > 
> >      dev_coredump() uevent sent to user space
> >      device_add()  ======================> user space process Y reads the
> >                                            uevents writes to devcd fd
> >                                            which results into writes to
> > 
> >                                           devcd_data_write()
> >                                             mod_delayed_work()
> >                                               try_to_grab_pending()
> >                                                 del_timer()
> >                                                   debug_assert_init()
> >     INIT_DELAYED_WORK()
> >     schedule_delayed_work()
> >                                                     debug_object_fixup()
> >                                                       timer_fixup_assert_init()
> >                                                         timer_setup()
> >                                                           do_init_timer()
> >                                                         /*
> >                                                          Above call reinitializes
> >                                                          the timer to
> >                                                          timer->entry.pprev=NULL
> >                                                          and this will be checked
> >                                                          later in timer_pending() call.
> >                                                         */
> >                                                   timer_pending()
> >                                                    !hlist_unhashed_lockless(&timer->entry)
> >                                                      !h->pprev
> >                                                  /*
> >                                                    del_timer() checks h->pprev and finds
> >                                                    it to be NULL due to which
> >                                                    try_to_grab_pending() stucks.
> >                                                  */
> > 
> > Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
> > Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
> > ---
> > v4->v5:
> >   - Rebased it.
> > 
> > v3->v4:
> >   - flg variable renamed to delete_work.
> > 
> > v2->v3:
> >   Addressed comments from gregkh
> >   - Wrapped the commit text and corrected the alignment.
> >   - Described the reason to introduce new variables.
> >   - Restored the blank line.
> >   - rename the del_wk_queued to flg.
> >   Addressed comments from tglx
> >   - Added a comment which explains the race which looks obvious however
> >     would not occur between disabled_store and devcd_del work.
> > 
> > 
> > v1->v2:
> >   - Added del_wk_queued flag to serialize the race between devcd_data_write()
> >     and disabled_store() => devcd_free().
> >   drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
> >   1 file changed, 81 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
> > index f4d794d..1c06781 100644
> > --- a/drivers/base/devcoredump.c
> > +++ b/drivers/base/devcoredump.c
> > @@ -25,6 +25,47 @@ struct devcd_entry {
> >   	struct device devcd_dev;
> >   	void *data;
> >   	size_t datalen;
> > +	/*
> > +	 * Here, mutex is required to serialize the calls to del_wk work between
> > +	 * user/kernel space which happens when devcd is added with device_add()
> > +	 * and that sends uevent to user space. User space reads the uevents,
> > +	 * and calls to devcd_data_write() which try to modify the work which is
> > +	 * not even initialized/queued from devcoredump.
> > +	 *
> > +	 *
> > +	 *
> > +	 *        cpu0(X)                                 cpu1(Y)
> > +	 *
> > +	 *        dev_coredump() uevent sent to user space
> > +	 *        device_add()  ======================> user space process Y reads the
> > +	 *                                              uevents writes to devcd fd
> > +	 *                                              which results into writes to
> > +	 *
> > +	 *                                             devcd_data_write()
> > +	 *                                               mod_delayed_work()
> > +	 *                                                 try_to_grab_pending()
> > +	 *                                                   del_timer()
> > +	 *                                                     debug_assert_init()
> > +	 *       INIT_DELAYED_WORK()
> > +	 *       schedule_delayed_work()
> > +	 *
> > +	 *
> > +	 * Also, mutex alone would not be enough to avoid scheduling of
> > +	 * del_wk work after it get flush from a call to devcd_free()
> > +	 * mentioned as below.
> > +	 *
> > +	 *	disabled_store()
> > +	 *        devcd_free()
> > +	 *          mutex_lock()             devcd_data_write()
> > +	 *          flush_delayed_work()
> > +	 *          mutex_unlock()
> > +	 *                                   mutex_lock()
> > +	 *                                   mod_delayed_work()
> > +	 *                                   mutex_unlock()
> > +	 * So, delete_work flag is required.
> > +	 */
> > +	struct mutex mutex;
> > +	bool delete_work;
> >   	struct module *owner;
> >   	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
> >   			void *data, size_t datalen);
> > @@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
> >   	struct device *dev = kobj_to_dev(kobj);
> >   	struct devcd_entry *devcd = dev_to_devcd(dev);
> > -	mod_delayed_work(system_wq, &devcd->del_wk, 0);
> > +	mutex_lock(&devcd->mutex);
> > +	if (!devcd->delete_work) {
> > +		devcd->delete_work = true;
> > +		mod_delayed_work(system_wq, &devcd->del_wk, 0);
> > +	}
> > +	mutex_unlock(&devcd->mutex);
> >   	return count;
> >   }
> > @@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
> >   {
> >   	struct devcd_entry *devcd = dev_to_devcd(dev);
> > +	mutex_lock(&devcd->mutex);
> > +	if (!devcd->delete_work)
> > +		devcd->delete_work = true;
> > +
> >   	flush_delayed_work(&devcd->del_wk);
> > +	mutex_unlock(&devcd->mutex);
> >   	return 0;
> >   }
> > @@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
> >   	return sysfs_emit(buf, "%d\n", devcd_disabled);
> >   }
> > +/*
> > + *
> > + *	disabled_store()                                	worker()
> > + *	 class_for_each_device(&devcd_class,
> > + *		NULL, NULL, devcd_free)
> > + *         ...
> > + *         ...
> > + *	   while ((dev = class_dev_iter_next(&iter))
> > + *                                                             devcd_del()
> > + *                                                               device_del()
> > + *                                                                 put_device() <- last reference
> > + *             error = fn(dev, data)                           devcd_dev_release()
> > + *             devcd_free(dev, data)                           kfree(devcd)
> > + *             mutex_lock(&devcd->mutex);
> > + *
> > + *
> > + * In the above diagram, It looks like disabled_store() would be racing with parallely
> > + * running devcd_del() and result in memory abort while acquiring devcd->mutex which
> > + * is called after kfree of devcd memory  after dropping its last reference with
> > + * put_device(). However, this will not happens as fn(dev, data) runs
> > + * with its own reference to device via klist_node so it is not its last reference.
> > + * so, above situation would not occur.
> > + */
> > +
> >   static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
> >   			      const char *buf, size_t count)
> >   {
> > @@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
> >   	devcd->read = read;
> >   	devcd->free = free;
> >   	devcd->failing_dev = get_device(dev);
> > +	devcd->delete_work = false;
> > +	mutex_init(&devcd->mutex);
> >   	device_initialize(&devcd->devcd_dev);
> >   	dev_set_name(&devcd->devcd_dev, "devcd%d",
> >   		     atomic_inc_return(&devcd_count));
> >   	devcd->devcd_dev.class = &devcd_class;
> > +	mutex_lock(&devcd->mutex);
> >   	if (device_add(&devcd->devcd_dev))
> >   		goto put_device;
> > @@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
> >   	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
> >   	schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
> > -
> > +	mutex_unlock(&devcd->mutex);
> >   	return;
> >    put_device:
> >   	put_device(&devcd->devcd_dev);
> > +	mutex_unlock(&devcd->mutex);
> >    put_module:
> >   	module_put(owner);
> >    free:

-- 
Kees Cook
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 8 months ago
On 8/17/2022 1:53 AM, Kees Cook wrote:
> On Thu, Aug 11, 2022 at 09:43:48PM +0530, Mukesh Ojha wrote:
>> Hi Johannes/Kees,
> 
> Hi!
> 
>>
>> Sorry for reminding on it again.
>> Any hope of this one to get into devcoredump ?
> 
> I don't know this code well enough to comment on the solution, but it
> seems designed and justified correctly, at least. :)

Thanks Kees for the reply.

Hi @Johannes,

This patch is running in our internal build from a very long time and it 
did not show any regression.
Would like to get your ack on it ?

-Mukesh
> 
> I'll leave it to Johannes for review.
> 
> -Kees
> 
>>
>> -Mukesh
>>
>>
>> On 5/27/2022 7:33 PM, Mukesh Ojha wrote:
>>> In following scenario(diagram), when one thread X running dev_coredumpm()
>>> adds devcd device to the framework which sends uevent notification to
>>> userspace and another thread Y reads this uevent and call to
>>> devcd_data_write() which eventually try to delete the queued timer that
>>> is not initialized/queued yet.
>>>
>>> So, debug object reports some warning and in the meantime, timer is
>>> initialized and queued from X path. and from Y path, it gets reinitialized
>>> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
>>>
>>> To fix this, introduce mutex and a boolean flag to serialize the behaviour.
>>>
>>>    	cpu0(X)			                cpu1(Y)
>>>
>>>       dev_coredump() uevent sent to user space
>>>       device_add()  ======================> user space process Y reads the
>>>                                             uevents writes to devcd fd
>>>                                             which results into writes to
>>>
>>>                                            devcd_data_write()
>>>                                              mod_delayed_work()
>>>                                                try_to_grab_pending()
>>>                                                  del_timer()
>>>                                                    debug_assert_init()
>>>      INIT_DELAYED_WORK()
>>>      schedule_delayed_work()
>>>                                                      debug_object_fixup()
>>>                                                        timer_fixup_assert_init()
>>>                                                          timer_setup()
>>>                                                            do_init_timer()
>>>                                                          /*
>>>                                                           Above call reinitializes
>>>                                                           the timer to
>>>                                                           timer->entry.pprev=NULL
>>>                                                           and this will be checked
>>>                                                           later in timer_pending() call.
>>>                                                          */
>>>                                                    timer_pending()
>>>                                                     !hlist_unhashed_lockless(&timer->entry)
>>>                                                       !h->pprev
>>>                                                   /*
>>>                                                     del_timer() checks h->pprev and finds
>>>                                                     it to be NULL due to which
>>>                                                     try_to_grab_pending() stucks.
>>>                                                   */
>>>
>>> Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
>>> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
>>> ---
>>> v4->v5:
>>>    - Rebased it.
>>>
>>> v3->v4:
>>>    - flg variable renamed to delete_work.
>>>
>>> v2->v3:
>>>    Addressed comments from gregkh
>>>    - Wrapped the commit text and corrected the alignment.
>>>    - Described the reason to introduce new variables.
>>>    - Restored the blank line.
>>>    - rename the del_wk_queued to flg.
>>>    Addressed comments from tglx
>>>    - Added a comment which explains the race which looks obvious however
>>>      would not occur between disabled_store and devcd_del work.
>>>
>>>
>>> v1->v2:
>>>    - Added del_wk_queued flag to serialize the race between devcd_data_write()
>>>      and disabled_store() => devcd_free().
>>>    drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
>>>    1 file changed, 81 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
>>> index f4d794d..1c06781 100644
>>> --- a/drivers/base/devcoredump.c
>>> +++ b/drivers/base/devcoredump.c
>>> @@ -25,6 +25,47 @@ struct devcd_entry {
>>>    	struct device devcd_dev;
>>>    	void *data;
>>>    	size_t datalen;
>>> +	/*
>>> +	 * Here, mutex is required to serialize the calls to del_wk work between
>>> +	 * user/kernel space which happens when devcd is added with device_add()
>>> +	 * and that sends uevent to user space. User space reads the uevents,
>>> +	 * and calls to devcd_data_write() which try to modify the work which is
>>> +	 * not even initialized/queued from devcoredump.
>>> +	 *
>>> +	 *
>>> +	 *
>>> +	 *        cpu0(X)                                 cpu1(Y)
>>> +	 *
>>> +	 *        dev_coredump() uevent sent to user space
>>> +	 *        device_add()  ======================> user space process Y reads the
>>> +	 *                                              uevents writes to devcd fd
>>> +	 *                                              which results into writes to
>>> +	 *
>>> +	 *                                             devcd_data_write()
>>> +	 *                                               mod_delayed_work()
>>> +	 *                                                 try_to_grab_pending()
>>> +	 *                                                   del_timer()
>>> +	 *                                                     debug_assert_init()
>>> +	 *       INIT_DELAYED_WORK()
>>> +	 *       schedule_delayed_work()
>>> +	 *
>>> +	 *
>>> +	 * Also, mutex alone would not be enough to avoid scheduling of
>>> +	 * del_wk work after it get flush from a call to devcd_free()
>>> +	 * mentioned as below.
>>> +	 *
>>> +	 *	disabled_store()
>>> +	 *        devcd_free()
>>> +	 *          mutex_lock()             devcd_data_write()
>>> +	 *          flush_delayed_work()
>>> +	 *          mutex_unlock()
>>> +	 *                                   mutex_lock()
>>> +	 *                                   mod_delayed_work()
>>> +	 *                                   mutex_unlock()
>>> +	 * So, delete_work flag is required.
>>> +	 */
>>> +	struct mutex mutex;
>>> +	bool delete_work;
>>>    	struct module *owner;
>>>    	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
>>>    			void *data, size_t datalen);
>>> @@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
>>>    	struct device *dev = kobj_to_dev(kobj);
>>>    	struct devcd_entry *devcd = dev_to_devcd(dev);
>>> -	mod_delayed_work(system_wq, &devcd->del_wk, 0);
>>> +	mutex_lock(&devcd->mutex);
>>> +	if (!devcd->delete_work) {
>>> +		devcd->delete_work = true;
>>> +		mod_delayed_work(system_wq, &devcd->del_wk, 0);
>>> +	}
>>> +	mutex_unlock(&devcd->mutex);
>>>    	return count;
>>>    }
>>> @@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
>>>    {
>>>    	struct devcd_entry *devcd = dev_to_devcd(dev);
>>> +	mutex_lock(&devcd->mutex);
>>> +	if (!devcd->delete_work)
>>> +		devcd->delete_work = true;
>>> +
>>>    	flush_delayed_work(&devcd->del_wk);
>>> +	mutex_unlock(&devcd->mutex);
>>>    	return 0;
>>>    }
>>> @@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
>>>    	return sysfs_emit(buf, "%d\n", devcd_disabled);
>>>    }
>>> +/*
>>> + *
>>> + *	disabled_store()                                	worker()
>>> + *	 class_for_each_device(&devcd_class,
>>> + *		NULL, NULL, devcd_free)
>>> + *         ...
>>> + *         ...
>>> + *	   while ((dev = class_dev_iter_next(&iter))
>>> + *                                                             devcd_del()
>>> + *                                                               device_del()
>>> + *                                                                 put_device() <- last reference
>>> + *             error = fn(dev, data)                           devcd_dev_release()
>>> + *             devcd_free(dev, data)                           kfree(devcd)
>>> + *             mutex_lock(&devcd->mutex);
>>> + *
>>> + *
>>> + * In the above diagram, It looks like disabled_store() would be racing with parallely
>>> + * running devcd_del() and result in memory abort while acquiring devcd->mutex which
>>> + * is called after kfree of devcd memory  after dropping its last reference with
>>> + * put_device(). However, this will not happens as fn(dev, data) runs
>>> + * with its own reference to device via klist_node so it is not its last reference.
>>> + * so, above situation would not occur.
>>> + */
>>> +
>>>    static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
>>>    			      const char *buf, size_t count)
>>>    {
>>> @@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>>>    	devcd->read = read;
>>>    	devcd->free = free;
>>>    	devcd->failing_dev = get_device(dev);
>>> +	devcd->delete_work = false;
>>> +	mutex_init(&devcd->mutex);
>>>    	device_initialize(&devcd->devcd_dev);
>>>    	dev_set_name(&devcd->devcd_dev, "devcd%d",
>>>    		     atomic_inc_return(&devcd_count));
>>>    	devcd->devcd_dev.class = &devcd_class;
>>> +	mutex_lock(&devcd->mutex);
>>>    	if (device_add(&devcd->devcd_dev))
>>>    		goto put_device;
>>> @@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>>>    	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
>>>    	schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
>>> -
>>> +	mutex_unlock(&devcd->mutex);
>>>    	return;
>>>     put_device:
>>>    	put_device(&devcd->devcd_dev);
>>> +	mutex_unlock(&devcd->mutex);
>>>     put_module:
>>>    	module_put(owner);
>>>     free:
>
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Greg KH 3 years, 10 months ago
On Fri, May 27, 2022 at 07:33:40PM +0530, Mukesh Ojha wrote:
> In following scenario(diagram), when one thread X running dev_coredumpm()
> adds devcd device to the framework which sends uevent notification to
> userspace and another thread Y reads this uevent and call to
> devcd_data_write() which eventually try to delete the queued timer that
> is not initialized/queued yet.
> 
> So, debug object reports some warning and in the meantime, timer is
> initialized and queued from X path. and from Y path, it gets reinitialized
> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
> 
> To fix this, introduce mutex and a boolean flag to serialize the behaviour.
> 
>  	cpu0(X)			                cpu1(Y)
> 
>     dev_coredump() uevent sent to user space
>     device_add()  ======================> user space process Y reads the
>                                           uevents writes to devcd fd
>                                           which results into writes to
> 
>                                          devcd_data_write()
>                                            mod_delayed_work()
>                                              try_to_grab_pending()
>                                                del_timer()
>                                                  debug_assert_init()
>    INIT_DELAYED_WORK()
>    schedule_delayed_work()
>                                                    debug_object_fixup()
>                                                      timer_fixup_assert_init()
>                                                        timer_setup()
>                                                          do_init_timer()
>                                                        /*
>                                                         Above call reinitializes
>                                                         the timer to
>                                                         timer->entry.pprev=NULL
>                                                         and this will be checked
>                                                         later in timer_pending() call.
>                                                        */
>                                                  timer_pending()
>                                                   !hlist_unhashed_lockless(&timer->entry)
>                                                     !h->pprev
>                                                 /*
>                                                   del_timer() checks h->pprev and finds
>                                                   it to be NULL due to which
>                                                   try_to_grab_pending() stucks.
>                                                 */
> 
> Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
> ---

I need an ack from the devcoredump maintainer before I can take this...

thanks,

greg k-h
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 10 months ago
Thanks @greg.

Hi @johannes,

Could you review this patch?

-Mukesh

On 6/27/2022 6:41 PM, Greg KH wrote:
> On Fri, May 27, 2022 at 07:33:40PM +0530, Mukesh Ojha wrote:
>> In following scenario(diagram), when one thread X running dev_coredumpm()
>> adds devcd device to the framework which sends uevent notification to
>> userspace and another thread Y reads this uevent and call to
>> devcd_data_write() which eventually try to delete the queued timer that
>> is not initialized/queued yet.
>>
>> So, debug object reports some warning and in the meantime, timer is
>> initialized and queued from X path. and from Y path, it gets reinitialized
>> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
>>
>> To fix this, introduce mutex and a boolean flag to serialize the behaviour.
>>
>>   	cpu0(X)			                cpu1(Y)
>>
>>      dev_coredump() uevent sent to user space
>>      device_add()  ======================> user space process Y reads the
>>                                            uevents writes to devcd fd
>>                                            which results into writes to
>>
>>                                           devcd_data_write()
>>                                             mod_delayed_work()
>>                                               try_to_grab_pending()
>>                                                 del_timer()
>>                                                   debug_assert_init()
>>     INIT_DELAYED_WORK()
>>     schedule_delayed_work()
>>                                                     debug_object_fixup()
>>                                                       timer_fixup_assert_init()
>>                                                         timer_setup()
>>                                                           do_init_timer()
>>                                                         /*
>>                                                          Above call reinitializes
>>                                                          the timer to
>>                                                          timer->entry.pprev=NULL
>>                                                          and this will be checked
>>                                                          later in timer_pending() call.
>>                                                         */
>>                                                   timer_pending()
>>                                                    !hlist_unhashed_lockless(&timer->entry)
>>                                                      !h->pprev
>>                                                  /*
>>                                                    del_timer() checks h->pprev and finds
>>                                                    it to be NULL due to which
>>                                                    try_to_grab_pending() stucks.
>>                                                  */
>>
>> Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
>> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
>> ---
> 
> I need an ack from the devcoredump maintainer before I can take this...
> 
> thanks,
> 
> greg k-h
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 9 months ago
Gentle reminder for review.

-Mukesh

On 7/1/2022 8:23 PM, Mukesh Ojha wrote:
> Thanks @greg.
> 
> Hi @johannes,
> 
> Could you review this patch?
> 
> -Mukesh
> 
> On 6/27/2022 6:41 PM, Greg KH wrote:
>> On Fri, May 27, 2022 at 07:33:40PM +0530, Mukesh Ojha wrote:
>>> In following scenario(diagram), when one thread X running 
>>> dev_coredumpm()
>>> adds devcd device to the framework which sends uevent notification to
>>> userspace and another thread Y reads this uevent and call to
>>> devcd_data_write() which eventually try to delete the queued timer that
>>> is not initialized/queued yet.
>>>
>>> So, debug object reports some warning and in the meantime, timer is
>>> initialized and queued from X path. and from Y path, it gets 
>>> reinitialized
>>> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
>>>
>>> To fix this, introduce mutex and a boolean flag to serialize the 
>>> behaviour.
>>>
>>>       cpu0(X)                            cpu1(Y)
>>>
>>>      dev_coredump() uevent sent to user space
>>>      device_add()  ======================> user space process Y reads 
>>> the
>>>                                            uevents writes to devcd fd
>>>                                            which results into writes to
>>>
>>>                                           devcd_data_write()
>>>                                             mod_delayed_work()
>>>                                               try_to_grab_pending()
>>>                                                 del_timer()
>>>                                                   debug_assert_init()
>>>     INIT_DELAYED_WORK()
>>>     schedule_delayed_work()
>>>                                                     debug_object_fixup()
>>>                                                       
>>> timer_fixup_assert_init()
>>>                                                         timer_setup()
>>>                                                           
>>> do_init_timer()
>>>                                                         /*
>>>                                                          Above call 
>>> reinitializes
>>>                                                          the timer to
>>>                                                          
>>> timer->entry.pprev=NULL
>>>                                                          and this 
>>> will be checked
>>>                                                          later in 
>>> timer_pending() call.
>>>                                                         */
>>>                                                   timer_pending()
>>>                                                    
>>> !hlist_unhashed_lockless(&timer->entry)
>>>                                                      !h->pprev
>>>                                                  /*
>>>                                                    del_timer() checks 
>>> h->pprev and finds
>>>                                                    it to be NULL due 
>>> to which
>>>                                                    
>>> try_to_grab_pending() stucks.
>>>                                                  */
>>>
>>> Link: 
>>> https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/ 
>>>
>>> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
>>> ---
>>
>> I need an ack from the devcoredump maintainer before I can take this...
>>
>> thanks,
>>
>> greg k-h
Re: [PATCH v5] devcoredump : Serialize devcd_del work
Posted by Mukesh Ojha 3 years, 10 months ago
Friendly reminder !!

-Mukesh
On 5/27/2022 7:33 PM, Mukesh Ojha wrote:
> In following scenario(diagram), when one thread X running dev_coredumpm()
> adds devcd device to the framework which sends uevent notification to
> userspace and another thread Y reads this uevent and call to
> devcd_data_write() which eventually try to delete the queued timer that
> is not initialized/queued yet.
> 
> So, debug object reports some warning and in the meantime, timer is
> initialized and queued from X path. and from Y path, it gets reinitialized
> again and timer->entry.pprev=NULL and try_to_grab_pending() stucks.
> 
> To fix this, introduce mutex and a boolean flag to serialize the behaviour.
> 
>   	cpu0(X)			                cpu1(Y)
> 
>      dev_coredump() uevent sent to user space
>      device_add()  ======================> user space process Y reads the
>                                            uevents writes to devcd fd
>                                            which results into writes to
> 
>                                           devcd_data_write()
>                                             mod_delayed_work()
>                                               try_to_grab_pending()
>                                                 del_timer()
>                                                   debug_assert_init()
>     INIT_DELAYED_WORK()
>     schedule_delayed_work()
>                                                     debug_object_fixup()
>                                                       timer_fixup_assert_init()
>                                                         timer_setup()
>                                                           do_init_timer()
>                                                         /*
>                                                          Above call reinitializes
>                                                          the timer to
>                                                          timer->entry.pprev=NULL
>                                                          and this will be checked
>                                                          later in timer_pending() call.
>                                                         */
>                                                   timer_pending()
>                                                    !hlist_unhashed_lockless(&timer->entry)
>                                                      !h->pprev
>                                                  /*
>                                                    del_timer() checks h->pprev and finds
>                                                    it to be NULL due to which
>                                                    try_to_grab_pending() stucks.
>                                                  */
> 
> Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
> Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
> ---
> v4->v5:
>   - Rebased it.
> 
> v3->v4:
>   - flg variable renamed to delete_work.
> 
> v2->v3:
>   Addressed comments from gregkh
>   - Wrapped the commit text and corrected the alignment.
>   - Described the reason to introduce new variables.
>   - Restored the blank line.
>   - rename the del_wk_queued to flg.
>   Addressed comments from tglx
>   - Added a comment which explains the race which looks obvious however
>     would not occur between disabled_store and devcd_del work.
> 
> 
> v1->v2:
>   - Added del_wk_queued flag to serialize the race between devcd_data_write()
>     and disabled_store() => devcd_free().
>   drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 81 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
> index f4d794d..1c06781 100644
> --- a/drivers/base/devcoredump.c
> +++ b/drivers/base/devcoredump.c
> @@ -25,6 +25,47 @@ struct devcd_entry {
>   	struct device devcd_dev;
>   	void *data;
>   	size_t datalen;
> +	/*
> +	 * Here, mutex is required to serialize the calls to del_wk work between
> +	 * user/kernel space which happens when devcd is added with device_add()
> +	 * and that sends uevent to user space. User space reads the uevents,
> +	 * and calls to devcd_data_write() which try to modify the work which is
> +	 * not even initialized/queued from devcoredump.
> +	 *
> +	 *
> +	 *
> +	 *        cpu0(X)                                 cpu1(Y)
> +	 *
> +	 *        dev_coredump() uevent sent to user space
> +	 *        device_add()  ======================> user space process Y reads the
> +	 *                                              uevents writes to devcd fd
> +	 *                                              which results into writes to
> +	 *
> +	 *                                             devcd_data_write()
> +	 *                                               mod_delayed_work()
> +	 *                                                 try_to_grab_pending()
> +	 *                                                   del_timer()
> +	 *                                                     debug_assert_init()
> +	 *       INIT_DELAYED_WORK()
> +	 *       schedule_delayed_work()
> +	 *
> +	 *
> +	 * Also, mutex alone would not be enough to avoid scheduling of
> +	 * del_wk work after it get flush from a call to devcd_free()
> +	 * mentioned as below.
> +	 *
> +	 *	disabled_store()
> +	 *        devcd_free()
> +	 *          mutex_lock()             devcd_data_write()
> +	 *          flush_delayed_work()
> +	 *          mutex_unlock()
> +	 *                                   mutex_lock()
> +	 *                                   mod_delayed_work()
> +	 *                                   mutex_unlock()
> +	 * So, delete_work flag is required.
> +	 */
> +	struct mutex mutex;
> +	bool delete_work;
>   	struct module *owner;
>   	ssize_t (*read)(char *buffer, loff_t offset, size_t count,
>   			void *data, size_t datalen);
> @@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
>   	struct device *dev = kobj_to_dev(kobj);
>   	struct devcd_entry *devcd = dev_to_devcd(dev);
>   
> -	mod_delayed_work(system_wq, &devcd->del_wk, 0);
> +	mutex_lock(&devcd->mutex);
> +	if (!devcd->delete_work) {
> +		devcd->delete_work = true;
> +		mod_delayed_work(system_wq, &devcd->del_wk, 0);
> +	}
> +	mutex_unlock(&devcd->mutex);
>   
>   	return count;
>   }
> @@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
>   {
>   	struct devcd_entry *devcd = dev_to_devcd(dev);
>   
> +	mutex_lock(&devcd->mutex);
> +	if (!devcd->delete_work)
> +		devcd->delete_work = true;
> +
>   	flush_delayed_work(&devcd->del_wk);
> +	mutex_unlock(&devcd->mutex);
>   	return 0;
>   }
>   
> @@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
>   	return sysfs_emit(buf, "%d\n", devcd_disabled);
>   }
>   
> +/*
> + *
> + *	disabled_store()                                	worker()
> + *	 class_for_each_device(&devcd_class,
> + *		NULL, NULL, devcd_free)
> + *         ...
> + *         ...
> + *	   while ((dev = class_dev_iter_next(&iter))
> + *                                                             devcd_del()
> + *                                                               device_del()
> + *                                                                 put_device() <- last reference
> + *             error = fn(dev, data)                           devcd_dev_release()
> + *             devcd_free(dev, data)                           kfree(devcd)
> + *             mutex_lock(&devcd->mutex);
> + *
> + *
> + * In the above diagram, It looks like disabled_store() would be racing with parallely
> + * running devcd_del() and result in memory abort while acquiring devcd->mutex which
> + * is called after kfree of devcd memory  after dropping its last reference with
> + * put_device(). However, this will not happens as fn(dev, data) runs
> + * with its own reference to device via klist_node so it is not its last reference.
> + * so, above situation would not occur.
> + */
> +
>   static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
>   			      const char *buf, size_t count)
>   {
> @@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>   	devcd->read = read;
>   	devcd->free = free;
>   	devcd->failing_dev = get_device(dev);
> +	devcd->delete_work = false;
>   
> +	mutex_init(&devcd->mutex);
>   	device_initialize(&devcd->devcd_dev);
>   
>   	dev_set_name(&devcd->devcd_dev, "devcd%d",
>   		     atomic_inc_return(&devcd_count));
>   	devcd->devcd_dev.class = &devcd_class;
>   
> +	mutex_lock(&devcd->mutex);
>   	if (device_add(&devcd->devcd_dev))
>   		goto put_device;
>   
> @@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
>   
>   	INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
>   	schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
> -
> +	mutex_unlock(&devcd->mutex);
>   	return;
>    put_device:
>   	put_device(&devcd->devcd_dev);
> +	mutex_unlock(&devcd->mutex);
>    put_module:
>   	module_put(owner);
>    free: