[libvirt] [PATCH v3] qemu: fix deadlock if create qemuProcessReconnect thread failed

Wang Yechao posted 1 patch 5 years, 6 months ago
Failed in applying to current master (apply log)
src/qemu/qemu_domain.c  | 75 ++++++++++++++++++++++++++++++++++++++++---------
src/qemu/qemu_domain.h  |  9 ++++++
src/qemu/qemu_process.c |  2 +-
3 files changed, 71 insertions(+), 15 deletions(-)
[libvirt] [PATCH v3] qemu: fix deadlock if create qemuProcessReconnect thread failed
Posted by Wang Yechao 5 years, 6 months ago
qemuProcessReconnectHelper has hold lock on doms, if create
qemuProcessReconnect thread failed, calls qemuDomainRemoveInactiveJob
will lead to deadlock.

Add a qemuDomainRemoveInactiveJobLocked, modify qemuProcessReconnectHelper
to call it.

Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>

---
v2 patch:
https://www.redhat.com/archives/libvir-list/2018-September/msg00635.html

Changes in v3:
 - drop v2 patch, cause it is not good.
 - split qemuDomainRemoveInactive into qemuDomainRemoveInactiveCommon
and virDomainObjListRemove.
 - create a qemuDomainRemoveInactiveLocked, consists of
qemuDomainRemoveInactiveCommon and virDomainObjListRemoveLocked.
 - create a qemuDomainRemoveInactiveJobLocked,  which copies
qemuDomainRemoveInactiveJob except calling qemuDomainRemoveInactiveLocked
 - Modify qemuProcessReconnectHelper to call qemuDomainRemoveInactiveJobLocked
---
 src/qemu/qemu_domain.c  | 75 ++++++++++++++++++++++++++++++++++++++++---------
 src/qemu/qemu_domain.h  |  9 ++++++
 src/qemu/qemu_process.c |  2 +-
 3 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 2fd8a2a..ef0d91d 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -8343,24 +8343,13 @@ qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
     return rem.err;
 }
 
-
-/**
- * qemuDomainRemoveInactive:
- *
- * The caller must hold a lock to the vm.
- */
 void
-qemuDomainRemoveInactive(virQEMUDriverPtr driver,
-                         virDomainObjPtr vm)
+qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
+                               virDomainObjPtr vm)
 {
     char *snapDir;
     virQEMUDriverConfigPtr cfg;
 
-    if (vm->persistent) {
-        /* Short-circuit, we don't want to remove a persistent domain */
-        return;
-    }
-
     cfg = virQEMUDriverGetConfig(driver);
 
     /* Remove any snapshot metadata prior to removing the domain */
@@ -8379,9 +8368,47 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver,
     }
     qemuExtDevicesCleanupHost(driver, vm->def);
 
+    virObjectUnref(cfg);
+}
+
+/**
+ * qemuDomainRemoveInactive:
+ *
+ * The caller must hold a lock to the vm.
+ */
+void
+qemuDomainRemoveInactive(virQEMUDriverPtr driver,
+                         virDomainObjPtr vm)
+{
+    if (vm->persistent) {
+        /* Short-circuit, we don't want to remove a persistent domain */
+        return;
+    }
+
+    qemuDomainRemoveInactiveCommon(driver, vm);
     virDomainObjListRemove(driver->domains, vm);
 
-    virObjectUnref(cfg);
+}
+
+/**
+ * qemuDomainRemoveInactiveLocked:
+ *
+ * The caller must hold a lock to the vm ,
+ * and hold a lock to the doms.
+ */
+
+void
+qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
+                               virDomainObjPtr vm)
+{
+    if (vm->persistent) {
+        /* Short-circuit, we don't want to remove a persistent domain */
+        return;
+    }
+
+    qemuDomainRemoveInactiveCommon(driver, vm);
+    virDomainObjListRemoveLocked(driver->domains, vm);
+
 }
 
 
@@ -8407,6 +8434,26 @@ qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
         qemuDomainObjEndJob(driver, vm);
 }
 
+/**
+ * qemuDomainRemoveInactiveJobLocked:
+ *
+ * Just like qemuDomainRemoveInactiveJob but it hold lock
+ * on 'doms'.
+ */
+
+void
+qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
+                                  virDomainObjPtr vm)
+{
+    bool haveJob;
+
+    haveJob = qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) >= 0;
+
+    qemuDomainRemoveInactiveLocked(driver, vm);
+
+    if (haveJob)
+        qemuDomainObjEndJob(driver, vm);
+}
 
 void
 qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 914c9a6..1d80621 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -698,12 +698,21 @@ int qemuDomainSnapshotDiscardAll(void *payload,
 int qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
                                          virDomainObjPtr vm);
 
+void qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
+                                    virDomainObjPtr vm);
+
 void qemuDomainRemoveInactive(virQEMUDriverPtr driver,
                               virDomainObjPtr vm);
 
+void qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
+                                    virDomainObjPtr vm);
+
 void qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
                                  virDomainObjPtr vm);
 
+void qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
+                                       virDomainObjPtr vm);
+
 void qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
                              virDomainObjPtr vm,
                              bool value);
diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 72a59de..ed24447 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -8025,7 +8025,7 @@ qemuProcessReconnectHelper(virDomainObjPtr obj,
          */
         qemuProcessStop(src->driver, obj, VIR_DOMAIN_SHUTOFF_FAILED,
                         QEMU_ASYNC_JOB_NONE, 0);
-        qemuDomainRemoveInactiveJob(src->driver, obj);
+        qemuDomainRemoveInactiveJobLocked(src->driver, obj);
 
         virDomainObjEndAPI(&obj);
         virNWFilterUnlockFilterUpdates();
-- 
1.8.3.1

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list
Re: [libvirt] [PATCH v3] qemu: fix deadlock if create qemuProcessReconnect thread failed
Posted by John Ferlan 5 years, 6 months ago

On 09/19/2018 10:27 PM, Wang Yechao wrote:
> qemuProcessReconnectHelper has hold lock on doms, if create
> qemuProcessReconnect thread failed, calls qemuDomainRemoveInactiveJob
> will lead to deadlock.
> 
> Add a qemuDomainRemoveInactiveJobLocked, modify qemuProcessReconnectHelper
> to call it.
> 
> Signed-off-by: Wang Yechao <wang.yechao255@zte.com.cn>
> 
> ---
> v2 patch:
> https://www.redhat.com/archives/libvir-list/2018-September/msg00635.html
> 
> Changes in v3:
>  - drop v2 patch, cause it is not good.
>  - split qemuDomainRemoveInactive into qemuDomainRemoveInactiveCommon
> and virDomainObjListRemove.
>  - create a qemuDomainRemoveInactiveLocked, consists of
> qemuDomainRemoveInactiveCommon and virDomainObjListRemoveLocked.
>  - create a qemuDomainRemoveInactiveJobLocked,  which copies
> qemuDomainRemoveInactiveJob except calling qemuDomainRemoveInactiveLocked
>  - Modify qemuProcessReconnectHelper to call qemuDomainRemoveInactiveJobLocked
> ---
>  src/qemu/qemu_domain.c  | 75 ++++++++++++++++++++++++++++++++++++++++---------
>  src/qemu/qemu_domain.h  |  9 ++++++
>  src/qemu/qemu_process.c |  2 +-
>  3 files changed, 71 insertions(+), 15 deletions(-)
> 

Sigh, I thought I was clear - multiple patches would have been
preferred. It's easier to review and shows the progression to the fix.
Hopefully the next series will split them up.  Each will have their own
commit message to describe what's happening.

> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 2fd8a2a..ef0d91d 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -8343,24 +8343,13 @@ qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
>      return rem.err;
>  }
>  
> -
> -/**
> - * qemuDomainRemoveInactive:
> - *
> - * The caller must hold a lock to the vm.
> - */
>  void

This will be a "static void"

> -qemuDomainRemoveInactive(virQEMUDriverPtr driver,
> -                         virDomainObjPtr vm)
> +qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
> +                               virDomainObjPtr vm)
>  {
>      char *snapDir;
>      virQEMUDriverConfigPtr cfg;
>  
> -    if (vm->persistent) {
> -        /* Short-circuit, we don't want to remove a persistent domain */
> -        return;
> -    }
> -
>      cfg = virQEMUDriverGetConfig(driver);
>  
>      /* Remove any snapshot metadata prior to removing the domain */
> @@ -8379,9 +8368,47 @@ qemuDomainRemoveInactive(virQEMUDriverPtr driver,
>      }
>      qemuExtDevicesCleanupHost(driver, vm->def);
>  
> +    virObjectUnref(cfg);
> +}
> +

Two blank lines between functions

> +/**
> + * qemuDomainRemoveInactive:
> + *
> + * The caller must hold a lock to the vm.
> + */
> +void
> +qemuDomainRemoveInactive(virQEMUDriverPtr driver,
> +                         virDomainObjPtr vm)
> +{
> +    if (vm->persistent) {
> +        /* Short-circuit, we don't want to remove a persistent domain */
> +        return;
> +    }
> +
> +    qemuDomainRemoveInactiveCommon(driver, vm);
>      virDomainObjListRemove(driver->domains, vm);
>  

If you're going to have a blank line, then have it between the two
calls, but not after the last one.

> -    virObjectUnref(cfg);
> +}
> +

Two blank lines

> +/**
> + * qemuDomainRemoveInactiveLocked:
> + *
> + * The caller must hold a lock to the vm ,
> + * and hold a lock to the doms.

Change to:

 * The caller must hold a lock to the vm and must hold the
 * lock on driver->domains in order to call the remove obj
 * from locked list method.

> + */
> +
> +void

static void

> +qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
> +                               virDomainObjPtr vm)
> +{
> +    if (vm->persistent) {
> +        /* Short-circuit, we don't want to remove a persistent domain */
> +        return;
> +    }
> +
> +    qemuDomainRemoveInactiveCommon(driver, vm);
> +    virDomainObjListRemoveLocked(driver->domains, vm);
> +

Similar with respect to the extra blank line here.

>  }
>  
>  
> @@ -8407,6 +8434,26 @@ qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
>          qemuDomainObjEndJob(driver, vm);
>  }
>  
> +/**
> + * qemuDomainRemoveInactiveJobLocked:
> + *
> + * Just like qemuDomainRemoveInactiveJob but it hold lock
> + * on 'doms'.

I would say:

 * Similar to qemuDomainRemoveInactiveJob, except that the caller must
 * also hold the lock @driver->domains

> + */
> +

Remove the blank line above

> +void
> +qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
> +                                  virDomainObjPtr vm)
> +{
> +    bool haveJob;
> +
> +    haveJob = qemuDomainObjBeginJob(driver, vm, QEMU_JOB_MODIFY) >= 0;
> +
> +    qemuDomainRemoveInactiveLocked(driver, vm);
> +
> +    if (haveJob)
> +        qemuDomainObjEndJob(driver, vm);
> +}
>  

Two blank lines

>  void
>  qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
> index 914c9a6..1d80621 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -698,12 +698,21 @@ int qemuDomainSnapshotDiscardAll(void *payload,
>  int qemuDomainSnapshotDiscardAllMetadata(virQEMUDriverPtr driver,
>                                           virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveCommon(virQEMUDriverPtr driver,
> +                                    virDomainObjPtr vm);
> +

Nope - this is not external

>  void qemuDomainRemoveInactive(virQEMUDriverPtr driver,
>                                virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveLocked(virQEMUDriverPtr driver,
> +                                    virDomainObjPtr vm);
> +

and neither is this one

>  void qemuDomainRemoveInactiveJob(virQEMUDriverPtr driver,
>                                   virDomainObjPtr vm);
>  
> +void qemuDomainRemoveInactiveJobLocked(virQEMUDriverPtr driver,
> +                                       virDomainObjPtr vm);
> +
>  void qemuDomainSetFakeReboot(virQEMUDriverPtr driver,
>                               virDomainObjPtr vm,
>                               bool value);
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 72a59de..ed24447 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -8025,7 +8025,7 @@ qemuProcessReconnectHelper(virDomainObjPtr obj,
>           */
>          qemuProcessStop(src->driver, obj, VIR_DOMAIN_SHUTOFF_FAILED,
>                          QEMU_ASYNC_JOB_NONE, 0);
> -        qemuDomainRemoveInactiveJob(src->driver, obj);
> +        qemuDomainRemoveInactiveJobLocked(src->driver, obj);
>  
>          virDomainObjEndAPI(&obj);
>          virNWFilterUnlockFilterUpdates();
> 

In the long run the above hung will be the "only" thing for patch3.

John

--
libvir-list mailing list
libvir-list@redhat.com
https://www.redhat.com/mailman/listinfo/libvir-list