In the following scenario (see diagram), one thread X running dev_coredumpm()
adds the devcd device to the framework, which sends a uevent notification to
userspace. Another thread Y reads this uevent and calls devcd_data_write(),
which eventually tries to delete a timer that has not been initialized or
queued yet.

So, debugobjects reports a warning, and in the meantime the timer is
initialized and queued from the X path. On the Y path it then gets
reinitialized again, timer->entry.pprev becomes NULL, and
try_to_grab_pending() gets stuck.

To fix this, introduce a mutex and a boolean flag to serialize the two paths
(a distilled sketch of the resulting pattern follows the diagram below).
    cpu0(X)                               cpu1(Y)

dev_coredump()              uevent sent to user space
device_add()  ======================> user space process Y reads the
                                      uevent and writes to the devcd fd,
                                      which results in a call to

                                      devcd_data_write()
                                        mod_delayed_work()
                                          try_to_grab_pending()
                                            del_timer()
                                              debug_assert_init()
INIT_DELAYED_WORK()
schedule_delayed_work()
                                                debug_object_fixup()
                                                  timer_fixup_assert_init()
                                                    timer_setup()
                                                      do_init_timer()
                                                      /*
                                                       The above call reinitializes
                                                       the timer so that
                                                       timer->entry.pprev = NULL,
                                                       and this will be checked
                                                       later in the timer_pending() call.
                                                      */
                                        timer_pending()
                                          !hlist_unhashed_lockless(&timer->entry)
                                            !h->pprev
                                      /*
                                       del_timer() checks h->pprev and finds
                                       it to be NULL, due to which
                                       try_to_grab_pending() gets stuck.
                                      */
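
For readers less familiar with the workqueue API, here is a distilled,
self-contained sketch of the pattern this fix applies. It is an illustration
only, not the driver code itself; the names my_entry, my_create,
my_request_delete, my_del_fn and MY_TIMEOUT are made up for the example:

    /* Hypothetical, simplified illustration of the mutex + flag pattern. */
    #include <linux/jiffies.h>
    #include <linux/mutex.h>
    #include <linux/types.h>
    #include <linux/workqueue.h>

    #define MY_TIMEOUT (5 * HZ)            /* example timeout */

    struct my_entry {
            struct mutex mutex;            /* serializes init/queue vs. delete */
            bool delete_work;              /* true once a delete has been requested */
            struct delayed_work del_wk;
    };

    static void my_del_fn(struct work_struct *work)
    {
            /* tear-down of the entry would go here */
    }

    /* "Y path": reached from user space; may run very soon after the uevent. */
    static void my_request_delete(struct my_entry *e)
    {
            mutex_lock(&e->mutex);
            if (!e->delete_work) {
                    e->delete_work = true;
                    mod_delayed_work(system_wq, &e->del_wk, 0);
            }
            mutex_unlock(&e->mutex);
    }

    /* "X path": creation; the work is initialized and queued only under the
     * mutex, so the Y path can never operate on a half-initialized work, and
     * the flag keeps a flushed work from being queued again later. */
    static void my_create(struct my_entry *e)
    {
            mutex_init(&e->mutex);
            e->delete_work = false;

            mutex_lock(&e->mutex);
            /* ... device_add() and the uevent to user space happen here ... */
            INIT_DELAYED_WORK(&e->del_wk, my_del_fn);
            schedule_delayed_work(&e->del_wk, MY_TIMEOUT);
            mutex_unlock(&e->mutex);
    }

The key point is that the delayed work is only ever touched while holding the
mutex, and the flag makes the delete request idempotent, so a work that has
already been flushed cannot be queued again by a later writer.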
Link: https://lore.kernel.org/lkml/2e1f81e2-428c-f11f-ce92-eb11048cb271@quicinc.com/
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
---
v4->v5:
- Rebased it.
v3->v4:
- flg variable renamed to delete_work.
v2->v3:
Addressed comments from gregkh
- Wrapped the commit text and corrected the alignment.
- Described the reason to introduce new variables.
- Restored the blank line.
- Renamed del_wk_queued to flg.
Addressed comments from tglx
- Added a comment explaining the race between disabled_store() and the
devcd_del work, which looks possible at first glance but cannot actually occur.
v1->v2:
- Added del_wk_queued flag to serialize the race between devcd_data_write()
and disabled_store() => devcd_free().
drivers/base/devcoredump.c | 83 ++++++++++++++++++++++++++++++++++++++++++++--
1 file changed, 81 insertions(+), 2 deletions(-)
diff --git a/drivers/base/devcoredump.c b/drivers/base/devcoredump.c
index f4d794d..1c06781 100644
--- a/drivers/base/devcoredump.c
+++ b/drivers/base/devcoredump.c
@@ -25,6 +25,47 @@ struct devcd_entry {
struct device devcd_dev;
void *data;
size_t datalen;
+ /*
+ * The mutex is required to serialize the handling of the del_wk work
+ * between kernel and user space: devcd is added with device_add(), which
+ * sends a uevent to user space; user space reads the uevent and writes to
+ * devcd_data_write(), which tries to modify a work that devcoredump has
+ * not even initialized/queued yet.
+ *
+ *
+ *        cpu0(X)                               cpu1(Y)
+ *
+ * dev_coredump()              uevent sent to user space
+ * device_add()  ======================> user space process Y reads the
+ *                                       uevent and writes to the devcd fd,
+ *                                       which results in a call to
+ *
+ *                                       devcd_data_write()
+ *                                         mod_delayed_work()
+ *                                           try_to_grab_pending()
+ *                                             del_timer()
+ *                                               debug_assert_init()
+ * INIT_DELAYED_WORK()
+ * schedule_delayed_work()
+ *
+ *
+ * The mutex alone is not enough to prevent the del_wk work from being
+ * scheduled again after it has already been flushed by a call to
+ * devcd_free(), as shown below.
+ *
+ * disabled_store()
+ *   devcd_free()
+ *     mutex_lock()             devcd_data_write()
+ *     flush_delayed_work()
+ *     mutex_unlock()
+ *                              mutex_lock()
+ *                              mod_delayed_work()
+ *                              mutex_unlock()
+ *
+ * So the delete_work flag is required as well.
+ */
+ struct mutex mutex;
+ bool delete_work;
struct module *owner;
ssize_t (*read)(char *buffer, loff_t offset, size_t count,
void *data, size_t datalen);
@@ -84,7 +125,12 @@ static ssize_t devcd_data_write(struct file *filp, struct kobject *kobj,
struct device *dev = kobj_to_dev(kobj);
struct devcd_entry *devcd = dev_to_devcd(dev);
- mod_delayed_work(system_wq, &devcd->del_wk, 0);
+ mutex_lock(&devcd->mutex);
+ if (!devcd->delete_work) {
+ devcd->delete_work = true;
+ mod_delayed_work(system_wq, &devcd->del_wk, 0);
+ }
+ mutex_unlock(&devcd->mutex);
return count;
}
@@ -112,7 +158,12 @@ static int devcd_free(struct device *dev, void *data)
{
struct devcd_entry *devcd = dev_to_devcd(dev);
+ mutex_lock(&devcd->mutex);
+ if (!devcd->delete_work)
+ devcd->delete_work = true;
+
flush_delayed_work(&devcd->del_wk);
+ mutex_unlock(&devcd->mutex);
return 0;
}
@@ -122,6 +173,30 @@ static ssize_t disabled_show(struct class *class, struct class_attribute *attr,
return sysfs_emit(buf, "%d\n", devcd_disabled);
}
+/*
+ *
+ *    disabled_store()                                    worker()
+ *      class_for_each_device(&devcd_class,
+ *                            NULL, NULL, devcd_free)
+ *        ...
+ *        ...
+ *        while ((dev = class_dev_iter_next(&iter))
+ *                                                          devcd_del()
+ *                                                            device_del()
+ *                                                              put_device() <- last reference
+ *          error = fn(dev, data)                          devcd_dev_release()
+ *            devcd_free(dev, data)                          kfree(devcd)
+ *              mutex_lock(&devcd->mutex);
+ *
+ *
+ * In the above diagram, it looks like disabled_store() could race with a
+ * concurrently running devcd_del() and take a memory abort while acquiring
+ * devcd->mutex, since that lock is taken after the last reference has been
+ * dropped with put_device() and devcd has been freed. However, this cannot
+ * happen: fn(dev, data) runs with its own reference to the device, taken via
+ * the klist_node iterator, so the put_device() above is not the last
+ * reference and the situation described above does not occur.
+ */
+
static ssize_t disabled_store(struct class *class, struct class_attribute *attr,
const char *buf, size_t count)
{
@@ -278,13 +353,16 @@ void dev_coredumpm(struct device *dev, struct module *owner,
devcd->read = read;
devcd->free = free;
devcd->failing_dev = get_device(dev);
+ devcd->delete_work = false;
+ mutex_init(&devcd->mutex);
device_initialize(&devcd->devcd_dev);
dev_set_name(&devcd->devcd_dev, "devcd%d",
atomic_inc_return(&devcd_count));
devcd->devcd_dev.class = &devcd_class;
+ mutex_lock(&devcd->mutex);
if (device_add(&devcd->devcd_dev))
goto put_device;
@@ -301,10 +379,11 @@ void dev_coredumpm(struct device *dev, struct module *owner,
INIT_DELAYED_WORK(&devcd->del_wk, devcd_del);
schedule_delayed_work(&devcd->del_wk, DEVCD_TIMEOUT);
-
+ mutex_unlock(&devcd->mutex);
return;
put_device:
put_device(&devcd->devcd_dev);
+ mutex_unlock(&devcd->mutex);
put_module:
module_put(owner);
free:
--
2.7.4
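
For completeness, the user-space half of the "Y path" in the diagram above is
simply any agent that reacts to the devcoredump uevent and writes to the new
device's data attribute, which is what lands in devcd_data_write(). Below is a
minimal hedged sketch; the node name devcd0 is only an example, a real agent
would pick the name up from the uevent:

    /* Minimal user-space sketch: writing anything to the devcd data attribute
     * is interpreted by the kernel as "discard this coredump" and ends up in
     * devcd_data_write(). Error handling is kept short on purpose. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            const char *path = "/sys/class/devcoredump/devcd0/data"; /* example node */
            int fd = open(path, O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "1", 1) < 0)
                    perror("write");
            close(fd);
            return 0;
    }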
Hi Johannes/Kees,
Sorry for the reminder again.
Any hope of getting this one into devcoredump?
-Mukesh
On Thu, Aug 11, 2022 at 09:43:48PM +0530, Mukesh Ojha wrote:
> Hi Johannes/Kees,
Hi!
>
> Sorry for the reminder again.
> Any hope of getting this one into devcoredump?
I don't know this code well enough to comment on the solution, but it
seems designed and justified correctly, at least. :)
I'll leave it to Johannes for review.
-Kees
--
Kees Cook
On 8/17/2022 1:53 AM, Kees Cook wrote:
> On Thu, Aug 11, 2022 at 09:43:48PM +0530, Mukesh Ojha wrote:
>> Hi Johannes/Kees,
>
> Hi!
>
>>
>> Sorry for the reminder again.
>> Any hope of getting this one into devcoredump?
>
> I don't know this code well enough to comment on the solution, but it
> seems designed and justified correctly, at least. :)
Thanks Kees for the reply.
Hi @Johannes,
This patch has been running in our internal builds for a long time and
has not shown any regression.
Could we get your ack on it?
-Mukesh
>
> I'll leave it to Johannes for review.
>
> -Kees
>
On Fri, May 27, 2022 at 07:33:40PM +0530, Mukesh Ojha wrote:

I need an ack from the devcoredump maintainer before I can take this...

thanks,

greg k-h
Thanks @greg.

Hi @johannes,

Could you review this patch?

-Mukesh

On 6/27/2022 6:41 PM, Greg KH wrote:
> I need an ack from the devcoredump maintainer before I can take this...
>
> thanks,
>
> greg k-h
Gentle reminder for review.

-Mukesh

On 7/1/2022 8:23 PM, Mukesh Ojha wrote:
> Thanks @greg.
>
> Hi @johannes,
>
> Could you review this patch?
>
> -Mukesh
Friendly reminder !!
-Mukesh