[PATCH] libxl: Fix domain shutdown

Jim Fehlig posted 1 patch 3 years, 2 months ago
Test syntax-check failed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/20210220001906.12778-1-jfehlig@suse.com
src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
1 file changed, 54 insertions(+), 61 deletions(-)
[PATCH] libxl: Fix domain shutdown
Posted by Jim Fehlig 3 years, 2 months ago
Commit fa30ee04a2 caused a regression in normal domain shutown.
Initiating a shutdown from within the domain or via 'virsh shutdown'
does cause the guest OS running in the domain to shutdown, but libvirt
never reaps the domain so it is always shown in a running state until
calling 'virsh destroy'.

The shutdown thread is also an internal user of the driver shutdown
machinery and eventually calls libxlDomainDestroyInternal where
the ignoreDeathEvent inhibitor is set, but running in a thread
introduces the possibility of racing with the death event from
libxl. This can be prevented by setting ignoreDeathEvent before
running the shutdown thread.

An additional improvement is to handle the destroy event synchronously
instead of spawning a thread. The time consuming aspects of destroying
a domain have been completed when the destroy event is delivered.

Signed-off-by: Jim Fehlig <jfehlig@suse.com>
---
 src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
 1 file changed, 54 insertions(+), 61 deletions(-)

diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
index d59153fffa..32dc503089 100644
--- a/src/libxl/libxl_domain.c
+++ b/src/libxl/libxl_domain.c
@@ -476,6 +476,7 @@ libxlDomainShutdownHandleRestart(libxlDriverPrivatePtr driver,
 struct libxlShutdownThreadInfo
 {
     libxlDriverPrivatePtr driver;
+    virDomainObjPtr vm;
     libxl_event *event;
 };
 
@@ -484,7 +485,7 @@ static void
 libxlDomainShutdownThread(void *opaque)
 {
     struct libxlShutdownThreadInfo *shutdown_info = opaque;
-    virDomainObjPtr vm = NULL;
+    virDomainObjPtr vm = shutdown_info->vm;
     libxl_event *ev = shutdown_info->event;
     libxlDriverPrivatePtr driver = shutdown_info->driver;
     virObjectEventPtr dom_event = NULL;
@@ -494,12 +495,6 @@ libxlDomainShutdownThread(void *opaque)
 
     libxl_domain_config_init(&d_config);
 
-    vm = virDomainObjListFindByID(driver->domains, ev->domid);
-    if (!vm) {
-        VIR_INFO("Received event for unknown domain ID %d", ev->domid);
-        goto cleanup;
-    }
-
     if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0)
         goto cleanup;
 
@@ -616,32 +611,18 @@ libxlDomainShutdownThread(void *opaque)
 }
 
 static void
-libxlDomainDeathThread(void *opaque)
+libxlDomainHandleDeath(libxlDriverPrivatePtr driver, virDomainObjPtr vm)
 {
-    struct libxlShutdownThreadInfo *shutdown_info = opaque;
-    virDomainObjPtr vm = NULL;
-    libxl_event *ev = shutdown_info->event;
-    libxlDriverPrivatePtr driver = shutdown_info->driver;
     virObjectEventPtr dom_event = NULL;
-    g_autoptr(libxlDriverConfig) cfg = libxlDriverConfigGet(driver);
-    libxlDomainObjPrivatePtr priv;
-
-    vm = virDomainObjListFindByID(driver->domains, ev->domid);
-    if (!vm) {
-        /* vm->def->id already cleared, means the death was handled by the
-         * driver already */
-        goto cleanup;
-    }
-
-    priv = vm->privateData;
+    libxlDomainObjPrivatePtr priv = vm->privateData;
 
     if (priv->ignoreDeathEvent) {
         priv->ignoreDeathEvent = false;
-        goto cleanup;
+        return;
     }
 
     if (libxlDomainObjBeginJob(driver, vm, LIBXL_JOB_MODIFY) < 0)
-        goto cleanup;
+        return;
 
     virDomainObjSetState(vm, VIR_DOMAIN_SHUTOFF, VIR_DOMAIN_SHUTOFF_DESTROYED);
     dom_event = virDomainEventLifecycleNewFromObj(vm,
@@ -651,12 +632,7 @@ libxlDomainDeathThread(void *opaque)
     if (!vm->persistent)
         virDomainObjListRemove(driver->domains, vm);
     libxlDomainObjEndJob(driver, vm);
-
- cleanup:
-    virDomainObjEndAPI(&vm);
     virObjectEventStateQueue(driver->domainEventState, dom_event);
-    libxl_event_free(cfg->ctx, ev);
-    VIR_FREE(shutdown_info);
 }
 
 
@@ -668,16 +644,13 @@ libxlDomainEventHandler(void *data, VIR_LIBXL_EVENT_CONST libxl_event *event)
 {
     libxlDriverPrivatePtr driver = data;
     libxl_shutdown_reason xl_reason = event->u.domain_shutdown.shutdown_reason;
-    struct libxlShutdownThreadInfo *shutdown_info = NULL;
-    virThread thread;
+    virDomainObjPtr vm = NULL;
     g_autoptr(libxlDriverConfig) cfg = NULL;
-    int ret = -1;
-    g_autofree char *name = NULL;
 
     if (event->type != LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN &&
             event->type != LIBXL_EVENT_TYPE_DOMAIN_DEATH) {
         VIR_INFO("Unhandled event type %d", event->type);
-        goto error;
+        goto cleanup;
     }
 
     /*
@@ -685,42 +658,62 @@ libxlDomainEventHandler(void *data, VIR_LIBXL_EVENT_CONST libxl_event *event)
      * after calling libxl_domain_suspend() are handled by its callers.
      */
     if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND)
-        goto error;
+        goto cleanup;
 
-    /*
-     * Start a thread to handle shutdown.  We don't want to be tying up
-     * libxl's event machinery by doing a potentially lengthy shutdown.
-     */
-    shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
+    vm = virDomainObjListFindByID(driver->domains, event->domid);
+    if (!vm) {
+        /* Nothing to do if we can't find the virDomainObj */
+        goto cleanup;
+    }
 
-    shutdown_info->driver = driver;
-    shutdown_info->event = (libxl_event *)event;
-    name = g_strdup_printf("ev-%d", event->domid);
-    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN)
-        ret = virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
-                                  name, false, shutdown_info);
-    else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH)
-        ret = virThreadCreateFull(&thread, false, libxlDomainDeathThread,
-                                  name, false, shutdown_info);
+    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
+        libxlDomainObjPrivatePtr priv = vm->privateData;
+        struct libxlShutdownThreadInfo *shutdown_info = NULL;
+        virThread thread;
+        g_autofree char *name = NULL;
 
-    if (ret < 0) {
         /*
-         * Not much we can do on error here except log it.
+         * Start a thread to handle shutdown.  We don't want to be tying up
+         * libxl's event machinery by doing a potentially lengthy shutdown.
          */
-        VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
-        goto error;
-    }
+        shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
 
-    /*
-     * libxlShutdownThreadInfo and libxl_event are freed in shutdown thread
-     */
-    return;
+        shutdown_info->driver = driver;
+        shutdown_info->vm = vm;
+        shutdown_info->event = (libxl_event *)event;
+        name = g_strdup_printf("ev-%d", event->domid);
+        /*
+         * Cleanup will be handled by the shutdown thread.
+         * Ignore the forthcoming death event from libxl
+         */
+        priv->ignoreDeathEvent = true;
+        if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
+                                name, false, shutdown_info) < 0) {
+            /*
+             * Not much we can do on error here except log it.
+             */
+            VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
+            VIR_FREE(shutdown_info);
+            goto cleanup;
+        }
+        /*
+         * virDomainObjEndAPI is called in the shutdown thread, where
+         * libxlShutdownThreadInfo and libxl_event are also freed.
+         */
+        return;
+    } else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH) {
+        /*
+         * On death the domain is cleaned up from Xen's perspective.
+         * Cleanup on the libvirt side can be done synchronously.
+         */
+        libxlDomainHandleDeath(driver, vm);
+    }
 
- error:
+ cleanup:
+    virDomainObjEndAPI(&vm);
     cfg = libxlDriverConfigGet(driver);
     /* Cast away any const */
     libxl_event_free(cfg->ctx, (libxl_event *)event);
-    VIR_FREE(shutdown_info);
 }
 
 char *
-- 
2.29.2


Re: [PATCH] libxl: Fix domain shutdown
Posted by Michal Privoznik 3 years, 2 months ago
On 2/20/21 1:19 AM, Jim Fehlig wrote:
> Commit fa30ee04a2 caused a regression in normal domain shutown.
> Initiating a shutdown from within the domain or via 'virsh shutdown'
> does cause the guest OS running in the domain to shutdown, but libvirt
> never reaps the domain so it is always shown in a running state until
> calling 'virsh destroy'.
> 
> The shutdown thread is also an internal user of the driver shutdown
> machinery and eventually calls libxlDomainDestroyInternal where
> the ignoreDeathEvent inhibitor is set, but running in a thread
> introduces the possibility of racing with the death event from
> libxl. This can be prevented by setting ignoreDeathEvent before
> running the shutdown thread.
> 
> An additional improvement is to handle the destroy event synchronously
> instead of spawning a thread. The time consuming aspects of destroying
> a domain have been completed when the destroy event is delivered.
> 
> Signed-off-by: Jim Fehlig <jfehlig@suse.com>
> ---
>   src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
>   1 file changed, 54 insertions(+), 61 deletions(-)
> 
> diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
> index d59153fffa..32dc503089 100644
> --- a/src/libxl/libxl_domain.c
> +++ b/src/libxl/libxl_domain.c

> @@ -685,42 +658,62 @@ libxlDomainEventHandler(void *data, VIR_LIBXL_EVENT_CONST libxl_event *event)
>        * after calling libxl_domain_suspend() are handled by its callers.
>        */
>       if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND)
> -        goto error;
> +        goto cleanup;
>   
> -    /*
> -     * Start a thread to handle shutdown.  We don't want to be tying up
> -     * libxl's event machinery by doing a potentially lengthy shutdown.
> -     */
> -    shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
> +    vm = virDomainObjListFindByID(driver->domains, event->domid);
> +    if (!vm) {
> +        /* Nothing to do if we can't find the virDomainObj */
> +        goto cleanup;
> +    }
>   
> -    shutdown_info->driver = driver;
> -    shutdown_info->event = (libxl_event *)event;
> -    name = g_strdup_printf("ev-%d", event->domid);
> -    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN)
> -        ret = virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
> -                                  name, false, shutdown_info);
> -    else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH)
> -        ret = virThreadCreateFull(&thread, false, libxlDomainDeathThread,
> -                                  name, false, shutdown_info);
> +    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
> +        libxlDomainObjPrivatePtr priv = vm->privateData;
> +        struct libxlShutdownThreadInfo *shutdown_info = NULL;
> +        virThread thread;
> +        g_autofree char *name = NULL;
>   
> -    if (ret < 0) {
>           /*
> -         * Not much we can do on error here except log it.
> +         * Start a thread to handle shutdown.  We don't want to be tying up
> +         * libxl's event machinery by doing a potentially lengthy shutdown.
>            */
> -        VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
> -        goto error;
> -    }
> +        shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>   
> -    /*
> -     * libxlShutdownThreadInfo and libxl_event are freed in shutdown thread
> -     */
> -    return;
> +        shutdown_info->driver = driver;
> +        shutdown_info->vm = vm;

Aaah, wanted to suggest that you increment @vm's refcounter here, because ..

> +        shutdown_info->event = (libxl_event *)event;
> +        name = g_strdup_printf("ev-%d", event->domid);
> +        /*
> +         * Cleanup will be handled by the shutdown thread.
> +         * Ignore the forthcoming death event from libxl
> +         */
> +        priv->ignoreDeathEvent = true;
> +        if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
> +                                name, false, shutdown_info) < 0) {

.. what if this thread is scheduled after [1]? But then noticed this 
subtle ..

> +            /*
> +             * Not much we can do on error here except log it.
> +             */
> +            VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
> +            VIR_FREE(shutdown_info);
> +            goto cleanup;
> +        }
> +        /*
> +         * virDomainObjEndAPI is called in the shutdown thread, where
> +         * libxlShutdownThreadInfo and libxl_event are also freed.
> +         */
> +        return;

.. return. So the refcount is okay :-)

> +    } else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH) {
> +        /*
> +         * On death the domain is cleaned up from Xen's perspective.
> +         * Cleanup on the libvirt side can be done synchronously.
> +         */
> +        libxlDomainHandleDeath(driver, vm);
> +    }
>   
> - error:
> + cleanup:
> +    virDomainObjEndAPI(&vm);

1: this ^^^

>       cfg = libxlDriverConfigGet(driver);
>       /* Cast away any const */
>       libxl_event_free(cfg->ctx, (libxl_event *)event);
> -    VIR_FREE(shutdown_info);
>   }
>   
>   char *
> 

Reviewed-by: Michal Privoznik <mprivozn@redhat.com>

Michal

Re: [PATCH] libxl: Fix domain shutdown
Posted by Jim Fehlig 3 years, 2 months ago
On 2/22/21 5:46 AM, Michal Privoznik wrote:
> On 2/20/21 1:19 AM, Jim Fehlig wrote:
>> Commit fa30ee04a2 caused a regression in normal domain shutown.
>> Initiating a shutdown from within the domain or via 'virsh shutdown'
>> does cause the guest OS running in the domain to shutdown, but libvirt
>> never reaps the domain so it is always shown in a running state until
>> calling 'virsh destroy'.
>>
>> The shutdown thread is also an internal user of the driver shutdown
>> machinery and eventually calls libxlDomainDestroyInternal where
>> the ignoreDeathEvent inhibitor is set, but running in a thread
>> introduces the possibility of racing with the death event from
>> libxl. This can be prevented by setting ignoreDeathEvent before
>> running the shutdown thread.
>>
>> An additional improvement is to handle the destroy event synchronously
>> instead of spawning a thread. The time consuming aspects of destroying
>> a domain have been completed when the destroy event is delivered.
>>
>> Signed-off-by: Jim Fehlig <jfehlig@suse.com>
>> ---
>>   src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
>>   1 file changed, 54 insertions(+), 61 deletions(-)
>>
>> diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
>> index d59153fffa..32dc503089 100644
>> --- a/src/libxl/libxl_domain.c
>> +++ b/src/libxl/libxl_domain.c
> 
>> @@ -685,42 +658,62 @@ libxlDomainEventHandler(void *data, 
>> VIR_LIBXL_EVENT_CONST libxl_event *event)
>>        * after calling libxl_domain_suspend() are handled by its callers.
>>        */
>>       if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND)
>> -        goto error;
>> +        goto cleanup;
>> -    /*
>> -     * Start a thread to handle shutdown.  We don't want to be tying up
>> -     * libxl's event machinery by doing a potentially lengthy shutdown.
>> -     */
>> -    shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>> +    vm = virDomainObjListFindByID(driver->domains, event->domid);
>> +    if (!vm) {
>> +        /* Nothing to do if we can't find the virDomainObj */
>> +        goto cleanup;
>> +    }
>> -    shutdown_info->driver = driver;
>> -    shutdown_info->event = (libxl_event *)event;
>> -    name = g_strdup_printf("ev-%d", event->domid);
>> -    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN)
>> -        ret = virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
>> -                                  name, false, shutdown_info);
>> -    else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH)
>> -        ret = virThreadCreateFull(&thread, false, libxlDomainDeathThread,
>> -                                  name, false, shutdown_info);
>> +    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
>> +        libxlDomainObjPrivatePtr priv = vm->privateData;
>> +        struct libxlShutdownThreadInfo *shutdown_info = NULL;
>> +        virThread thread;
>> +        g_autofree char *name = NULL;
>> -    if (ret < 0) {
>>           /*
>> -         * Not much we can do on error here except log it.
>> +         * Start a thread to handle shutdown.  We don't want to be tying up
>> +         * libxl's event machinery by doing a potentially lengthy shutdown.
>>            */
>> -        VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
>> -        goto error;
>> -    }
>> +        shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>> -    /*
>> -     * libxlShutdownThreadInfo and libxl_event are freed in shutdown thread
>> -     */
>> -    return;
>> +        shutdown_info->driver = driver;
>> +        shutdown_info->vm = vm;
> 
> Aaah, wanted to suggest that you increment @vm's refcounter here, because ..
> 
>> +        shutdown_info->event = (libxl_event *)event;
>> +        name = g_strdup_printf("ev-%d", event->domid);
>> +        /*
>> +         * Cleanup will be handled by the shutdown thread.
>> +         * Ignore the forthcoming death event from libxl
>> +         */
>> +        priv->ignoreDeathEvent = true;
>> +        if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
>> +                                name, false, shutdown_info) < 0) {
> 
> .. what if this thread is scheduled after [1]? But then noticed this subtle ..
> 
>> +            /*
>> +             * Not much we can do on error here except log it.
>> +             */
>> +            VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
>> +            VIR_FREE(shutdown_info);
>> +            goto cleanup;
>> +        }
>> +        /*
>> +         * virDomainObjEndAPI is called in the shutdown thread, where
>> +         * libxlShutdownThreadInfo and libxl_event are also freed.
>> +         */
>> +        return;
> 
> .. return. So the refcount is okay :-)

The subtle-ness definitely crossed my mind :-). Freeing the libxl event and 
shutdown info was already handled in the same way. The comment was a helpful 
reminder about the subtle behavior. I did consider options before settling on 
this approach, e.g. using g_autoptr to avoid the cleanup label altogether. For 
the shutdown info I would only have to create a free function, but the 
libxl_event would require defining a new struct to hold the event and libxl_ctx, 
along with a free func. It seemed a bit overkill, but I can pursue that if you'd 
like.

BTW, while reading your reply I noticed that ignoreDeathEvent should be set to 
false if the thread creation fails, although it probably doesn't matter much at 
that point. If the approach here is still acceptable, I'll squash in the below 
diff before pushing. Another BTW: is it fine to push this bug/regression fix 
even though we've now hit 7.1.0 freeze?

Regards,
Jim

diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
index 32dc503089..604b94a006 100644
--- a/src/libxl/libxl_domain.c
+++ b/src/libxl/libxl_domain.c
@@ -689,6 +689,7 @@ libxlDomainEventHandler(void *data, VIR_LIBXL_EVENT_CONST 
libxl_event *event)
          priv->ignoreDeathEvent = true;
          if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
                                  name, false, shutdown_info) < 0) {
+             priv->ignoreDeathEvent = false;
              /*
               * Not much we can do on error here except log it.
               */


Re: [PATCH] libxl: Fix domain shutdown
Posted by Michal Privoznik 3 years, 2 months ago
On 2/22/21 6:02 PM, Jim Fehlig wrote:
> On 2/22/21 5:46 AM, Michal Privoznik wrote:
>> On 2/20/21 1:19 AM, Jim Fehlig wrote:
>>> Commit fa30ee04a2 caused a regression in normal domain shutown.
>>> Initiating a shutdown from within the domain or via 'virsh shutdown'
>>> does cause the guest OS running in the domain to shutdown, but libvirt
>>> never reaps the domain so it is always shown in a running state until
>>> calling 'virsh destroy'.
>>>
>>> The shutdown thread is also an internal user of the driver shutdown
>>> machinery and eventually calls libxlDomainDestroyInternal where
>>> the ignoreDeathEvent inhibitor is set, but running in a thread
>>> introduces the possibility of racing with the death event from
>>> libxl. This can be prevented by setting ignoreDeathEvent before
>>> running the shutdown thread.
>>>
>>> An additional improvement is to handle the destroy event synchronously
>>> instead of spawning a thread. The time consuming aspects of destroying
>>> a domain have been completed when the destroy event is delivered.
>>>
>>> Signed-off-by: Jim Fehlig <jfehlig@suse.com>
>>> ---
>>>   src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
>>>   1 file changed, 54 insertions(+), 61 deletions(-)
>>>
>>> diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
>>> index d59153fffa..32dc503089 100644
>>> --- a/src/libxl/libxl_domain.c
>>> +++ b/src/libxl/libxl_domain.c
>>
>>> @@ -685,42 +658,62 @@ libxlDomainEventHandler(void *data, 
>>> VIR_LIBXL_EVENT_CONST libxl_event *event)
>>>        * after calling libxl_domain_suspend() are handled by its 
>>> callers.
>>>        */
>>>       if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND)
>>> -        goto error;
>>> +        goto cleanup;
>>> -    /*
>>> -     * Start a thread to handle shutdown.  We don't want to be tying up
>>> -     * libxl's event machinery by doing a potentially lengthy shutdown.
>>> -     */
>>> -    shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>>> +    vm = virDomainObjListFindByID(driver->domains, event->domid);
>>> +    if (!vm) {
>>> +        /* Nothing to do if we can't find the virDomainObj */
>>> +        goto cleanup;
>>> +    }
>>> -    shutdown_info->driver = driver;
>>> -    shutdown_info->event = (libxl_event *)event;
>>> -    name = g_strdup_printf("ev-%d", event->domid);
>>> -    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN)
>>> -        ret = virThreadCreateFull(&thread, false, 
>>> libxlDomainShutdownThread,
>>> -                                  name, false, shutdown_info);
>>> -    else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH)
>>> -        ret = virThreadCreateFull(&thread, false, 
>>> libxlDomainDeathThread,
>>> -                                  name, false, shutdown_info);
>>> +    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
>>> +        libxlDomainObjPrivatePtr priv = vm->privateData;
>>> +        struct libxlShutdownThreadInfo *shutdown_info = NULL;
>>> +        virThread thread;
>>> +        g_autofree char *name = NULL;
>>> -    if (ret < 0) {
>>>           /*
>>> -         * Not much we can do on error here except log it.
>>> +         * Start a thread to handle shutdown.  We don't want to be 
>>> tying up
>>> +         * libxl's event machinery by doing a potentially lengthy 
>>> shutdown.
>>>            */
>>> -        VIR_ERROR(_("Failed to create thread to handle domain 
>>> shutdown"));
>>> -        goto error;
>>> -    }
>>> +        shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>>> -    /*
>>> -     * libxlShutdownThreadInfo and libxl_event are freed in shutdown 
>>> thread
>>> -     */
>>> -    return;
>>> +        shutdown_info->driver = driver;
>>> +        shutdown_info->vm = vm;
>>
>> Aaah, wanted to suggest that you increment @vm's refcounter here, 
>> because ..
>>
>>> +        shutdown_info->event = (libxl_event *)event;
>>> +        name = g_strdup_printf("ev-%d", event->domid);
>>> +        /*
>>> +         * Cleanup will be handled by the shutdown thread.
>>> +         * Ignore the forthcoming death event from libxl
>>> +         */
>>> +        priv->ignoreDeathEvent = true;
>>> +        if (virThreadCreateFull(&thread, false, 
>>> libxlDomainShutdownThread,
>>> +                                name, false, shutdown_info) < 0) {
>>
>> .. what if this thread is scheduled after [1]? But then noticed this 
>> subtle ..
>>
>>> +            /*
>>> +             * Not much we can do on error here except log it.
>>> +             */
>>> +            VIR_ERROR(_("Failed to create thread to handle domain 
>>> shutdown"));
>>> +            VIR_FREE(shutdown_info);
>>> +            goto cleanup;
>>> +        }
>>> +        /*
>>> +         * virDomainObjEndAPI is called in the shutdown thread, where
>>> +         * libxlShutdownThreadInfo and libxl_event are also freed.
>>> +         */
>>> +        return;
>>
>> .. return. So the refcount is okay :-)
> 
> The subtle-ness definitely crossed my mind :-). Freeing the libxl event 
> and shutdown info was already handled in the same way. The comment was a 
> helpful reminder about the subtle behavior. I did consider options 
> before settling on this approach, e.g. using g_autoptr to avoid the 
> cleanup label altogether. For the shutdown info I would only have to 
> create a free function, but the libxl_event would require defining a new 
> struct to hold the event and libxl_ctx, along with a free func. It 
> seemed a bit overkill, but I can pursue that if you'd like.

Not sure if it's worth it. But if you write the patch I'm more than 
happy to review it.

> 
> BTW, while reading your reply I noticed that ignoreDeathEvent should be 
> set to false if the thread creation fails, although it probably doesn't 
> matter much at that point. If the approach here is still acceptable, 
> I'll squash in the below diff before pushing. Another BTW: is it fine to 
> push this bug/regression fix even though we've now hit 7.1.0 freeze?

That's exactly what freeze is for! I mean, if we weren't allowed to push 
fixes then that's the point in having freeze? We shouldn't push new 
features because they are likely to introduce bugs. But bugfixes should 
be okay. Some tend to even state that obviously "Reviewed-by and safe 
for freeze" and sometimes I do that too sometimes I don't. But IMO it's 
always safe to push a bug fix.

Michal

Re: [PATCH] libxl: Fix domain shutdown
Posted by Jim Fehlig 3 years, 2 months ago
On 2/22/21 10:34 AM, Michal Privoznik wrote:
> On 2/22/21 6:02 PM, Jim Fehlig wrote:
>> On 2/22/21 5:46 AM, Michal Privoznik wrote:
>>> On 2/20/21 1:19 AM, Jim Fehlig wrote:
>>>> Commit fa30ee04a2 caused a regression in normal domain shutown.
>>>> Initiating a shutdown from within the domain or via 'virsh shutdown'
>>>> does cause the guest OS running in the domain to shutdown, but libvirt
>>>> never reaps the domain so it is always shown in a running state until
>>>> calling 'virsh destroy'.
>>>>
>>>> The shutdown thread is also an internal user of the driver shutdown
>>>> machinery and eventually calls libxlDomainDestroyInternal where
>>>> the ignoreDeathEvent inhibitor is set, but running in a thread
>>>> introduces the possibility of racing with the death event from
>>>> libxl. This can be prevented by setting ignoreDeathEvent before
>>>> running the shutdown thread.
>>>>
>>>> An additional improvement is to handle the destroy event synchronously
>>>> instead of spawning a thread. The time consuming aspects of destroying
>>>> a domain have been completed when the destroy event is delivered.
>>>>
>>>> Signed-off-by: Jim Fehlig <jfehlig@suse.com>
>>>> ---
>>>>   src/libxl/libxl_domain.c | 115 ++++++++++++++++++---------------------
>>>>   1 file changed, 54 insertions(+), 61 deletions(-)
>>>>
>>>> diff --git a/src/libxl/libxl_domain.c b/src/libxl/libxl_domain.c
>>>> index d59153fffa..32dc503089 100644
>>>> --- a/src/libxl/libxl_domain.c
>>>> +++ b/src/libxl/libxl_domain.c
>>>
>>>> @@ -685,42 +658,62 @@ libxlDomainEventHandler(void *data, 
>>>> VIR_LIBXL_EVENT_CONST libxl_event *event)
>>>>        * after calling libxl_domain_suspend() are handled by its callers.
>>>>        */
>>>>       if (xl_reason == LIBXL_SHUTDOWN_REASON_SUSPEND)
>>>> -        goto error;
>>>> +        goto cleanup;
>>>> -    /*
>>>> -     * Start a thread to handle shutdown.  We don't want to be tying up
>>>> -     * libxl's event machinery by doing a potentially lengthy shutdown.
>>>> -     */
>>>> -    shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>>>> +    vm = virDomainObjListFindByID(driver->domains, event->domid);
>>>> +    if (!vm) {
>>>> +        /* Nothing to do if we can't find the virDomainObj */
>>>> +        goto cleanup;
>>>> +    }
>>>> -    shutdown_info->driver = driver;
>>>> -    shutdown_info->event = (libxl_event *)event;
>>>> -    name = g_strdup_printf("ev-%d", event->domid);
>>>> -    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN)
>>>> -        ret = virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
>>>> -                                  name, false, shutdown_info);
>>>> -    else if (event->type == LIBXL_EVENT_TYPE_DOMAIN_DEATH)
>>>> -        ret = virThreadCreateFull(&thread, false, libxlDomainDeathThread,
>>>> -                                  name, false, shutdown_info);
>>>> +    if (event->type == LIBXL_EVENT_TYPE_DOMAIN_SHUTDOWN) {
>>>> +        libxlDomainObjPrivatePtr priv = vm->privateData;
>>>> +        struct libxlShutdownThreadInfo *shutdown_info = NULL;
>>>> +        virThread thread;
>>>> +        g_autofree char *name = NULL;
>>>> -    if (ret < 0) {
>>>>           /*
>>>> -         * Not much we can do on error here except log it.
>>>> +         * Start a thread to handle shutdown.  We don't want to be tying up
>>>> +         * libxl's event machinery by doing a potentially lengthy shutdown.
>>>>            */
>>>> -        VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
>>>> -        goto error;
>>>> -    }
>>>> +        shutdown_info = g_new0(struct libxlShutdownThreadInfo, 1);
>>>> -    /*
>>>> -     * libxlShutdownThreadInfo and libxl_event are freed in shutdown thread
>>>> -     */
>>>> -    return;
>>>> +        shutdown_info->driver = driver;
>>>> +        shutdown_info->vm = vm;
>>>
>>> Aaah, wanted to suggest that you increment @vm's refcounter here, because ..
>>>
>>>> +        shutdown_info->event = (libxl_event *)event;
>>>> +        name = g_strdup_printf("ev-%d", event->domid);
>>>> +        /*
>>>> +         * Cleanup will be handled by the shutdown thread.
>>>> +         * Ignore the forthcoming death event from libxl
>>>> +         */
>>>> +        priv->ignoreDeathEvent = true;
>>>> +        if (virThreadCreateFull(&thread, false, libxlDomainShutdownThread,
>>>> +                                name, false, shutdown_info) < 0) {
>>>
>>> .. what if this thread is scheduled after [1]? But then noticed this subtle ..
>>>
>>>> +            /*
>>>> +             * Not much we can do on error here except log it.
>>>> +             */
>>>> +            VIR_ERROR(_("Failed to create thread to handle domain shutdown"));
>>>> +            VIR_FREE(shutdown_info);
>>>> +            goto cleanup;
>>>> +        }
>>>> +        /*
>>>> +         * virDomainObjEndAPI is called in the shutdown thread, where
>>>> +         * libxlShutdownThreadInfo and libxl_event are also freed.
>>>> +         */
>>>> +        return;
>>>
>>> .. return. So the refcount is okay :-)
>>
>> The subtle-ness definitely crossed my mind :-). Freeing the libxl event and 
>> shutdown info was already handled in the same way. The comment was a helpful 
>> reminder about the subtle behavior. I did consider options before settling on 
>> this approach, e.g. using g_autoptr to avoid the cleanup label altogether. For 
>> the shutdown info I would only have to create a free function, but the 
>> libxl_event would require defining a new struct to hold the event and 
>> libxl_ctx, along with a free func. It seemed a bit overkill, but I can pursue 
>> that if you'd like.
> 
> Not sure if it's worth it. But if you write the patch I'm more than happy to 
> review it.

I'll look at it in conjunction with other cleanups and improvements I'm working 
on. In the meantime I've pushed this fix.

>> BTW, while reading your reply I noticed that ignoreDeathEvent should be set to 
>> false if the thread creation fails, although it probably doesn't matter much 
>> at that point. If the approach here is still acceptable, I'll squash in the 
>> below diff before pushing. Another BTW: is it fine to push this bug/regression 
>> fix even though we've now hit 7.1.0 freeze?
> 
> That's exactly what freeze is for! I mean, if we weren't allowed to push fixes 
> then that's the point in having freeze?

Yes, thanks for point out the obvious. Another cup of coffee would have also 
done that :-).

Regards,
Jim