If there is heavy lock contention between nodes in a cluster, the
following sequence can occur at fs umount time:
node 1                                  node 2
   |                                       |
   |                                       |
iopen glock lock            -->    iopen glock go_callback
   |                                       |
   |                                       |
EAGAIN                               try evict failure
   |                                       |
   |                                       |
DLM_ECANCEL                                |
   |                                       |
   |                                       |
glock complete                             |
   |                                       |
   |                                       |
umount(clear_glock)                        |
   |                                       |
   |                                       |
cannot free iopen glock                    |
   |                                       |
   |                                       |
umount timeout (*)                         |
                                           |
                                           |
umount complete                            |
                                           |
                                           |
                                    umount succeed
Signed-off-by: Chunjie Zhu <chunjie.zhu@cloud.com>
---
fs/gfs2/glock.c | 20 +++++++++++++++++++-
1 file changed, 19 insertions(+), 1 deletion(-)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4a280be229a6..bf2445f0afa9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2120,6 +2120,23 @@ static void thaw_glock(struct gfs2_glock *gl)
 	gfs2_glock_queue_work(gl, 0);
 }
 
+/*
+ * An iopen glock might be left behind as a "zombie" due to lock contention
+ * between nodes in the cluster during fs umount, which then causes the
+ * umount to time out.
+ */
+
+static int is_zombie_glock(struct gfs2_glock *gl)
+{
+	if (test_bit(GLF_LOCK, &gl->gl_flags) &&
+	    test_bit(GLF_DEMOTE, &gl->gl_flags) &&
+	    test_bit(GLF_BLOCKING, &gl->gl_flags) &&
+	    (gl->gl_name.ln_type == LM_TYPE_IOPEN) &&
+	    list_empty(&gl->gl_holders))
+		return 1;
+	return 0;
+}
+
 /**
  * clear_glock - look at a glock and see if we can free it from glock cache
  * @gl: the glock to look at
@@ -2132,7 +2149,8 @@ static void clear_glock(struct gfs2_glock *gl)
 
 	spin_lock(&gl->gl_lockref.lock);
 	if (!__lockref_is_dead(&gl->gl_lockref)) {
-		gl->gl_lockref.count++;
+		if (!is_zombie_glock(gl))
+			gl->gl_lockref.count++;
 		if (gl->gl_state != LM_ST_UNLOCKED)
 			handle_callback(gl, LM_ST_UNLOCKED, 0, false);
 		__gfs2_glock_queue_work(gl, 0);
--
2.34.1
Hello,
On Mon, Feb 17, 2025 at 05:10, Chunjie Zhu <chunjie.zhu@cloud.com> wrote:
Thank you for your bug report. I'm having a hard time following what
you are trying to say, and the patch itself doesn't look right to me.
If there was a reference counting problem like the patch suggests, we
would probably see regular left-over glocks at unmount time, but I'm
not aware of any such problems. Could you please explain in a bit
more detail what you think the problem is? Do you get any messages in
the syslog? The file checksum in the patch refers to commit
bb25b97562e5 ("gfs2: remove dead code in add_to_queue") from 2023.
What exact kernel version are you running?
Thanks,
Andreas
The rapid response is much appreciated. Our kernel version is 6.6.22.
Our IO tests run on a two-node GFS2 cluster. The tests perform lots of
file deletion operations, which causes heavy exclusive lock contention
inside the GFS2 cluster. At umount time, we often see the message below
reported by the kernel hung task detector.
After investigation, I found that one or two iopen-type glock instances
are left behind at umount time.
See the time sequence graph in my commit message: due to heavy lock
contention, the DLM on node 1 returns EAGAIN and then ECANCEL for the
lock request on GFS2 glock A. Node 1 then instructs all GFS2 IO
applications to exit, and umount follows on node 1. During the umount
process, glock A stays in memory; we can see that gfs2_glock_dq cannot
free glock A because the glock state machine cannot bring glock A's
refcount down to 0.
INFO: task umount:75342 blocked for more than 483 seconds.
Not tainted 6.6.22+0 #1
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
task:umount state:D stack:0 pid:75342 ppid:75335 flags:0x00004002
Call Trace:
<TASK>
__schedule+0x3a0/0x1330
? srso_alias_return_thunk+0x5/0x7f
? srso_alias_return_thunk+0x5/0x7f
schedule+0x53/0xc0
schedule_timeout+0x76/0xf0
? __pfx_process_timeout+0x10/0x10
gfs2_gl_hash_clear+0x135/0x140 [gfs2]
? __pfx_autoremove_wake_function+0x10/0x10
gfs2_put_super+0x175/0x220 [gfs2]
generic_shutdown_super+0x7e/0x170
kill_block_super+0x16/0x40
deactivate_locked_super+0x2f/0xa0
cleanup_mnt+0xbd/0x150
task_work_run+0x60/0xa0
exit_to_user_mode_prepare+0x117/0x120
syscall_exit_to_user_mode+0x22/0x40
? srso_alias_return_thunk+0x5/0x7f
do_syscall_64+0x67/0x80
? srso_alias_return_thunk+0x5/0x7f
? syscall_exit_to_user_mode+0x27/0x40
? srso_alias_return_thunk+0x5/0x7f
? do_syscall_64+0x67/0x80
? srso_alias_return_thunk+0x5/0x7f
? syscall_exit_to_user_mode+0x27/0x40
? srso_alias_return_thunk+0x5/0x7f
? do_syscall_64+0x67/0x80
? syscall_exit_to_user_mode+0x27/0x40
? srso_alias_return_thunk+0x5/0x7f
? do_syscall_64+0x67/0x80
? do_syscall_64+0x67/0x80
? do_syscall_64+0x67/0x80
? exc_page_fault+0x72/0x130
entry_SYSCALL_64_after_hwframe+0x6e/0xd8
RIP: 0033:0x7fb0823ebeab
RSP: 002b:00007ffcd0d45b68 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6
RAX: 0000000000000000 RBX: 00007fb081257000 RCX: 00007fb0823ebeab
RDX: 0000000000000000 RSI: 0000000000000000 RDI: 00007fb081219980
RBP: 00007fb081257118 R08: 0000000000000073 R09: 0000000000000001
R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
R13: 00007fb081219980 R14: 0000000000000000 R15: 00007fb081257000
</TASK>
The time-ordered events:

1. IO app -> do_xmote -> gdlm_lock -> gfs2_glock_complete (ret is CANCEL) ->
   __gfs2_glock_queue_work

2. kworker A -> glock_work_func -> finish_xmote -> gfs2_holder_wake ->
   retry do_xmote (sets the GLF_LOCK flag) -> gdlm_lock (DLM does not
   invoke the GFS2 callbacks) -> run_queue (does nothing as the glock
   has GLF_LOCK set)

   glock refcount is 1

3. umount -> clear_glock (refcount +1) -> glock_work_func -> run_queue
   (does nothing as the glock has GLF_LOCK set) -> refcount -1

   glock refcount is still 1, so the glock stays in memory
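To make the refcount bookkeeping above easier to follow, here is a small
userspace toy model (my own simplified sketch, not kernel code; toy_glock,
toy_run_queue and toy_clear_glock are made-up stand-ins for the real
gl_lockref.count / GLF_LOCK handling in clear_glock, glock_work_func and
run_queue):

/* Toy model of the accounting above: with GLF_LOCK stuck, the work
 * handler only undoes the reference clear_glock took, so the count
 * never reaches zero and the glock is never freed. */
#include <stdbool.h>
#include <stdio.h>

struct toy_glock {
	int  count;     /* stands in for gl->gl_lockref.count */
	bool glf_lock;  /* stands in for the GLF_LOCK bit     */
	bool demoted;   /* set once the demote work actually runs */
};

/* stands in for run_queue(): bails out while GLF_LOCK is held */
static void toy_run_queue(struct toy_glock *gl)
{
	if (gl->glf_lock)
		return;           /* the pending demote is never processed */
	gl->demoted = true;
	gl->count--;              /* the demote would drop the last reference */
}

/* stands in for clear_glock() plus the queued glock_work_func() */
static void toy_clear_glock(struct toy_glock *gl)
{
	gl->count++;              /* reference taken for the queued work */
	toy_run_queue(gl);        /* glock_work_func -> run_queue */
	gl->count--;              /* the work item drops its reference */
}

int main(void)
{
	/* state after the cancelled do_xmote: one reference, GLF_LOCK set */
	struct toy_glock gl = { .count = 1, .glf_lock = true };

	toy_clear_glock(&gl);
	printf("refcount after clear_glock: %d, demoted: %s\n",
	       gl.count, gl.demoted ? "yes" : "no");
	return 0;
}

This prints "refcount after clear_glock: 1, demoted: no", which matches
what we observe: gfs2_gl_hash_clear keeps waiting for a glock whose
refcount never drops to 0.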