bcache: fix oops bug in cache_set_flush

liequan che posted 1 patch 1 week, 2 days ago
drivers/md/bcache/super.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
bcache: fix oops bug in cache_set_flush
Posted by liequan che 1 week, 2 days ago
Signed-off-by: cheliequan <cheliequan@inspur.com>

   If the bcache cache disk contains damaged btree data,
when the bcache cache disk partition is operated on directly,
the systemd-udevd service is triggered to call the bcache-register
program to register the bcache device, resulting in a kernel oops.

crash> bt
PID: 7773     TASK: ffff49cc44d69340  CPU: 57   COMMAND: "kworker/57:2"
 #0 [ffff800046373800] machine_kexec at ffffbe5039eb54a8
 #1 [ffff8000463739b0] __crash_kexec at ffffbe503a052824
 #2 [ffff8000463739e0] crash_kexec at ffffbe503a0529cc
 #3 [ffff800046373a60] die at ffffbe5039e9445c
 #4 [ffff800046373ac0] die_kernel_fault at ffffbe5039ec698c
 #5 [ffff800046373af0] __do_kernel_fault at ffffbe5039ec6a38
 #6 [ffff800046373b20] do_page_fault at ffffbe503ac76ba4
 #7 [ffff800046373b70] do_translation_fault at ffffbe503ac76ebc
 #8 [ffff800046373b90] do_mem_abort at ffffbe5039ec68ac
 #9 [ffff800046373bc0] el1_abort at ffffbe503ac669bc
#10 [ffff800046373bf0] el1_sync_handler at ffffbe503ac671d4
#11 [ffff800046373d30] el1_sync at ffffbe5039e82230
#12 [ffff800046373d50] cache_set_flush at ffffbe50121fa4c4 [bcache]
#13 [ffff800046373da0] process_one_work at ffffbe5039f5af68
#14 [ffff800046373e00] worker_thread at ffffbe5039f5b3c4
#15 [ffff800046373e50] kthread at ffffbe5039f634b8
crash> dis cache_set_flush+0x94
0xffffbe50121fa4c8 <cache_set_flush+148>:       str     x23, [x20, #512]

---
drivers/md/bcache/super.c | 16 ++++++++++------
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index fd97730479d8..8a41dfcf9fb6 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -1741,8 +1741,10 @@ static void cache_set_flush(struct closure *cl)
       if (!IS_ERR_OR_NULL(c->gc_thread))
               kthread_stop(c->gc_thread);

-       if (!IS_ERR(c->root))
-               list_add(&c->root->list, &c->btree_cache);
+       if (!IS_ERR_OR_NULL(c->root)) {
+               if (!list_empty(&c->root->list))
+                       list_add(&c->root->list, &c->btree_cache);
+       }

       /*
        * Avoid flushing cached nodes if cache set is retiring
@@ -1750,10 +1752,12 @@ static void cache_set_flush(struct closure *cl)
        */
       if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
               list_for_each_entry(b, &c->btree_cache, list) {
-                       mutex_lock(&b->write_lock);
-                       if (btree_node_dirty(b))
-                               __bch_btree_node_write(b, NULL);
-                       mutex_unlock(&b->write_lock);
+                       if (!IS_ERR_OR_NULL(b)) {
+                               mutex_lock(&b->write_lock);
+                               if (btree_node_dirty(b))
+                                       __bch_btree_node_write(b, NULL);
+                               mutex_unlock(&b->write_lock);
+                       }
               }

       if (ca->alloc_thread)
--
2.33.0
Re: bcache: fix oops bug in cache_set_flush
Posted by Coly Li 1 week, 2 days ago
Hi Liequan,

> 2024年11月13日 14:25,liequan che <liequanche@gmail.com> 写道:
> 
> Signed-off-by: cheliequan <cheliequan@inspur.com>
> 
>   If the bcache cache disk contains damaged btree data,
> when the bcache cache disk partition is directly operated,
> the system-udevd service is triggered to call the bcache-register
> program to register the bcache device,resulting in kernel oops.
> 

What is the kernel version?

It is interesting that the btree node checking code during registration doesn’t catch the metadata error.



> crash> bt
> PID: 7773     TASK: ffff49cc44d69340  CPU: 57   COMMAND: "kworker/57:2"
> #0 [ffff800046373800] machine_kexec at ffffbe5039eb54a8
> #1 [ffff8000463739b0] __crash_kexec at ffffbe503a052824
> #2 [ffff8000463739e0] crash_kexec at ffffbe503a0529cc
> #3 [ffff800046373a60] die at ffffbe5039e9445c
> #4 [ffff800046373ac0] die_kernel_fault at ffffbe5039ec698c
> #5 [ffff800046373af0] __do_kernel_fault at ffffbe5039ec6a38
> #6 [ffff800046373b20] do_page_fault at ffffbe503ac76ba4
> #7 [ffff800046373b70] do_translation_fault at ffffbe503ac76ebc
> #8 [ffff800046373b90] do_mem_abort at ffffbe5039ec68ac
> #9 [ffff800046373bc0] el1_abort at ffffbe503ac669bc
> #10 [ffff800046373bf0] el1_sync_handler at ffffbe503ac671d4
> #11 [ffff800046373d30] el1_sync at ffffbe5039e82230
> #12 [ffff800046373d50] cache_set_flush at ffffbe50121fa4c4 [bcache]
> #13 [ffff800046373da0] process_one_work at ffffbe5039f5af68
> #14 [ffff800046373e00] worker_thread at ffffbe5039f5b3c4
> #15 [ffff800046373e50] kthread at ffffbe5039f634b8
> crash> dis cache_set_flush+0x94
> 0xffffbe50121fa4c8 <cache_set_flush+148>:       str     x23, [x20, #512]
> 
> ---
> drivers/md/bcache/super.c | 16 ++++++++++------
> 1 file changed, 10 insertions(+), 6 deletions(-)
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index fd97730479d8..8a41dfcf9fb6 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -1741,8 +1741,10 @@ static void cache_set_flush(struct closure *cl)
>       if (!IS_ERR_OR_NULL(c->gc_thread))
>               kthread_stop(c->gc_thread);
> 
> -       if (!IS_ERR(c->root))
> -               list_add(&c->root->list, &c->btree_cache);
> +       if (!IS_ERR_OR_NULL(c->root)) {
> +               if (!list_empty(&c->root->list))
> +                       list_add(&c->root->list, &c->btree_cache);
> +       }
> 
>       /*
>        * Avoid flushing cached nodes if cache set is retiring
> @@ -1750,10 +1752,12 @@ static void cache_set_flush(struct closure *cl)
>        */
>       if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
>               list_for_each_entry(b, &c->btree_cache, list) {
> -                       mutex_lock(&b->write_lock);
> -                       if (btree_node_dirty(b))
> -                               __bch_btree_node_write(b, NULL);
> -                       mutex_unlock(&b->write_lock);
> +                       if (!IS_ERR_OR_NULL(b)) {

The above check is not safe. 



> +                               mutex_lock(&b->write_lock);
> +                               if (btree_node_dirty(b))
> +                                       __bch_btree_node_write(b, NULL);
> +                               mutex_unlock(&b->write_lock);
> +                       }
>               }
> 
>       if (ca->alloc_thread)
> --
> 2.33.0


Thanks.

Coly Li
Re: bcache: fix oops bug in cache_set_flush
Posted by liequan che 1 week, 2 days ago
I reproduced this bug on kernel versions 5.10.0-231.0.0.133 and 5.10.0-202.0.0.115.
You can get detailed information through the link below.

https://gitee.com/openeuler/kernel/issues/IB3YQZ
https://gitee.com/openeuler/kernel/pulls/13205
Best Regards!
cheleiquan

Coly Li <colyli@suse.de> 于2024年11月13日周三 15:04写道:
>
> Hi Liequan,
>
> > 2024年11月13日 14:25,liequan che <liequanche@gmail.com> 写道:
> >
> > Signed-off-by: cheliequan <cheliequan@inspur.com>
> >
> >   If the bcache cache disk contains damaged btree data,
> > when the bcache cache disk partition is directly operated,
> > the system-udevd service is triggered to call the bcache-register
> > program to register the bcache device,resulting in kernel oops.
> >
>
> What is the kernel version ?
>
> Interesting that why the btree node checking code during registration doesn’t cache the meta data error.
>
>
>
> > crash> bt
> > PID: 7773     TASK: ffff49cc44d69340  CPU: 57   COMMAND: "kworker/57:2"
> > #0 [ffff800046373800] machine_kexec at ffffbe5039eb54a8
> > #1 [ffff8000463739b0] __crash_kexec at ffffbe503a052824
> > #2 [ffff8000463739e0] crash_kexec at ffffbe503a0529cc
> > #3 [ffff800046373a60] die at ffffbe5039e9445c
> > #4 [ffff800046373ac0] die_kernel_fault at ffffbe5039ec698c
> > #5 [ffff800046373af0] __do_kernel_fault at ffffbe5039ec6a38
> > #6 [ffff800046373b20] do_page_fault at ffffbe503ac76ba4
> > #7 [ffff800046373b70] do_translation_fault at ffffbe503ac76ebc
> > #8 [ffff800046373b90] do_mem_abort at ffffbe5039ec68ac
> > #9 [ffff800046373bc0] el1_abort at ffffbe503ac669bc
> > #10 [ffff800046373bf0] el1_sync_handler at ffffbe503ac671d4
> > #11 [ffff800046373d30] el1_sync at ffffbe5039e82230
> > #12 [ffff800046373d50] cache_set_flush at ffffbe50121fa4c4 [bcache]
> > #13 [ffff800046373da0] process_one_work at ffffbe5039f5af68
> > #14 [ffff800046373e00] worker_thread at ffffbe5039f5b3c4
> > #15 [ffff800046373e50] kthread at ffffbe5039f634b8
> > crash> dis cache_set_flush+0x94
> > 0xffffbe50121fa4c8 <cache_set_flush+148>:       str     x23, [x20, #512]
> >
> > ---
> > drivers/md/bcache/super.c | 16 ++++++++++------
> > 1 file changed, 10 insertions(+), 6 deletions(-)
> > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> > index fd97730479d8..8a41dfcf9fb6 100644
> > --- a/drivers/md/bcache/super.c
> > +++ b/drivers/md/bcache/super.c
> > @@ -1741,8 +1741,10 @@ static void cache_set_flush(struct closure *cl)
> >       if (!IS_ERR_OR_NULL(c->gc_thread))
> >               kthread_stop(c->gc_thread);
> >
> > -       if (!IS_ERR(c->root))
> > -               list_add(&c->root->list, &c->btree_cache);
> > +       if (!IS_ERR_OR_NULL(c->root)) {
> > +               if (!list_empty(&c->root->list))
> > +                       list_add(&c->root->list, &c->btree_cache);
> > +       }
> >
> >       /*
> >        * Avoid flushing cached nodes if cache set is retiring
> > @@ -1750,10 +1752,12 @@ static void cache_set_flush(struct closure *cl)
> >        */
> >       if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
> >               list_for_each_entry(b, &c->btree_cache, list) {
> > -                       mutex_lock(&b->write_lock);
> > -                       if (btree_node_dirty(b))
> > -                               __bch_btree_node_write(b, NULL);
> > -                       mutex_unlock(&b->write_lock);
> > +                       if (!IS_ERR_OR_NULL(b)) {
>
> The above check is not safe.
>
>
>
> > +                               mutex_lock(&b->write_lock);
> > +                               if (btree_node_dirty(b))
> > +                                       __bch_btree_node_write(b, NULL);
> > +                               mutex_unlock(&b->write_lock);
> > +                       }
> >               }
> >
> >       if (ca->alloc_thread)
> > --
> > 2.33.0
>
>
> Thanks.
>
> Coly Li
Re: bcache: fix oops bug in cache_set_flush
Posted by liequan che 1 week, 2 days ago
Hi Coly:
  In addition, the following steps caused the kernel oops. After
creating the bcache device, the metadata was not cleared.
The NVMe disk was then hot-plugged into another server and repartitioned
with a different partition size in order to recreate the bcache device.
As soon as the partitioning was executed, the kernel oops was triggered.
  After I applied the new patch, the kernel panic was no longer triggered.
  Can you suggest a better way to fix the above problem?
  Thanks!
  cheliequan

liequan che <liequanche@gmail.com> 于2024年11月13日周三 15:40写道:
>
> I tested this bug on kernel versions 5.10.0-231.0.0.133 and 5.10.0-202.0.0.115。
> You can get detailed information through the link below.
>
> https://gitee.com/openeuler/kernel/issues/IB3YQZ
> https://gitee.com/openeuler/kernel/pulls/13205
> Best Regards!
> cheleiquan
>
> Coly Li <colyli@suse.de> 于2024年11月13日周三 15:04写道:
> >
> > Hi Liequan,
> >
> > > 2024年11月13日 14:25,liequan che <liequanche@gmail.com> 写道:
> > >
> > > Signed-off-by: cheliequan <cheliequan@inspur.com>
> > >
> > >   If the bcache cache disk contains damaged btree data,
> > > when the bcache cache disk partition is directly operated,
> > > the system-udevd service is triggered to call the bcache-register
> > > program to register the bcache device,resulting in kernel oops.
> > >
> >
> > What is the kernel version ?
> >
> > Interesting that why the btree node checking code during registration doesn’t cache the meta data error.
> >
> >
> >
> > > crash> bt
> > > PID: 7773     TASK: ffff49cc44d69340  CPU: 57   COMMAND: "kworker/57:2"
> > > #0 [ffff800046373800] machine_kexec at ffffbe5039eb54a8
> > > #1 [ffff8000463739b0] __crash_kexec at ffffbe503a052824
> > > #2 [ffff8000463739e0] crash_kexec at ffffbe503a0529cc
> > > #3 [ffff800046373a60] die at ffffbe5039e9445c
> > > #4 [ffff800046373ac0] die_kernel_fault at ffffbe5039ec698c
> > > #5 [ffff800046373af0] __do_kernel_fault at ffffbe5039ec6a38
> > > #6 [ffff800046373b20] do_page_fault at ffffbe503ac76ba4
> > > #7 [ffff800046373b70] do_translation_fault at ffffbe503ac76ebc
> > > #8 [ffff800046373b90] do_mem_abort at ffffbe5039ec68ac
> > > #9 [ffff800046373bc0] el1_abort at ffffbe503ac669bc
> > > #10 [ffff800046373bf0] el1_sync_handler at ffffbe503ac671d4
> > > #11 [ffff800046373d30] el1_sync at ffffbe5039e82230
> > > #12 [ffff800046373d50] cache_set_flush at ffffbe50121fa4c4 [bcache]
> > > #13 [ffff800046373da0] process_one_work at ffffbe5039f5af68
> > > #14 [ffff800046373e00] worker_thread at ffffbe5039f5b3c4
> > > #15 [ffff800046373e50] kthread at ffffbe5039f634b8
> > > crash> dis cache_set_flush+0x94
> > > 0xffffbe50121fa4c8 <cache_set_flush+148>:       str     x23, [x20, #512]
> > >
> > > ---
> > > drivers/md/bcache/super.c | 16 ++++++++++------
> > > 1 file changed, 10 insertions(+), 6 deletions(-)
> > > diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> > > index fd97730479d8..8a41dfcf9fb6 100644
> > > --- a/drivers/md/bcache/super.c
> > > +++ b/drivers/md/bcache/super.c
> > > @@ -1741,8 +1741,10 @@ static void cache_set_flush(struct closure *cl)
> > >       if (!IS_ERR_OR_NULL(c->gc_thread))
> > >               kthread_stop(c->gc_thread);
> > >
> > > -       if (!IS_ERR(c->root))
> > > -               list_add(&c->root->list, &c->btree_cache);
> > > +       if (!IS_ERR_OR_NULL(c->root)) {
> > > +               if (!list_empty(&c->root->list))
> > > +                       list_add(&c->root->list, &c->btree_cache);
> > > +       }
> > >
> > >       /*
> > >        * Avoid flushing cached nodes if cache set is retiring
> > > @@ -1750,10 +1752,12 @@ static void cache_set_flush(struct closure *cl)
> > >        */
> > >       if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
> > >               list_for_each_entry(b, &c->btree_cache, list) {
> > > -                       mutex_lock(&b->write_lock);
> > > -                       if (btree_node_dirty(b))
> > > -                               __bch_btree_node_write(b, NULL);
> > > -                       mutex_unlock(&b->write_lock);
> > > +                       if (!IS_ERR_OR_NULL(b)) {
> >
> > The above check is not safe.
> >
> >
> >
> > > +                               mutex_lock(&b->write_lock);
> > > +                               if (btree_node_dirty(b))
> > > +                                       __bch_btree_node_write(b, NULL);
> > > +                               mutex_unlock(&b->write_lock);
> > > +                       }
> > >               }
> > >
> > >       if (ca->alloc_thread)
> > > --
> > > 2.33.0
> >
> >
> > Thanks.
> >
> > Coly Li