[PATCH] qemu: fix vcpu clearing when multiple vcpu hotunplugs timeout

Shaleen Bathla posted 1 patch 1 year, 5 months ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/Y2UEAnctgtmo6kvW@shbathla-vm-ol8.appad3iad.osdevelopmeniad.oraclevcn.com
src/qemu/qemu_domain.c  |  6 ++++++
src/qemu/qemu_domain.h  |  1 +
src/qemu/qemu_hotplug.c | 41 ++++++++++++++++++++++++++++++++++++++++-
3 files changed, 47 insertions(+), 1 deletion(-)
[PATCH] qemu: fix vcpu clearing when multiple vcpu hotunplugs timeout
Posted by Shaleen Bathla 1 year, 5 months ago
Problem:
libvirt has a 5 second timeout (generally) for hotplug/unplug
operations which can time out due to heavy load in guest.

vcpu hotunplug occurs one vcpu at a time.
But, if we perform hotplug-unplug repeatedly, there is a situation
where qemu has multiple timedout vcpu unplug operations pending.

libvirt waits for an async event notification from qemu regarding
successful vcpu delete.

qemu deletes the vcpu-vcpuinfo and sends this notification to libvirt.

libvirt handles vcpu delete notification, and refreshes vcpuinfo
by querying vcpuinfo from qemu in qemuDomainRefreshVcpuInfo().

qemu's vcpuinfo will not contain vcpu that was deleted and sent.
qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending
timedout vcpu(s) which qemu has deleted and notified libvirt.
The overwrite resets other timedout vcpu's alias to NULL and tid to 0.

The error is then seen when validating tid of vcpus.
Example error log:
"internal error: qemu didn't report thread id for vcpu 'XX'"

Solution:
Create a per vcpu hp_timeout_alias which will be set as vcpu alias when a timeout
occurs else it is set as NULL.
This data does not get reset when vcpuinfo refresh occurs.
And we can remove a vcpu if it's alias was NULL by checking this alias.
During vcpu validation, we can skip to nextvcpu if this hotplug-timeout-alias
is set indicating that this vcpu is actually valid.

Co-authored-by: Partha Satapathy <partha.satapathy@oracle.com>
Signed-off-by: Shaleen Bathla <shaleen.bathla@oracle.com>
---
 src/qemu/qemu_domain.c  |  6 ++++++
 src/qemu/qemu_domain.h  |  1 +
 src/qemu/qemu_hotplug.c | 41 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index c3afc6c9d3ba..0f6e4dc78b93 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -9568,6 +9568,8 @@ qemuDomainValidateVcpuInfo(virDomainObj *vm)
         vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpu);
 
         if (vcpu->online && vcpupriv->tid == 0) {
+            if (vcpupriv->hp_timeout_alias)
+                continue;
             virReportError(VIR_ERR_INTERNAL_ERROR,
                            _("qemu didn't report thread id for vcpu '%zu'"), i);
             return -1;
@@ -9696,6 +9698,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
         if (validTIDs)
             vcpupriv->tid = info[i].tid;
 
+        if (vcpupriv->hp_timeout_alias && !info[i].alias)
+            VIR_DEBUG("no alias for timed out vcpu%zu domain %s", i,
+                      vm->def->name);
+
         vcpupriv->socket_id = info[i].socket_id;
         vcpupriv->core_id = info[i].core_id;
         vcpupriv->thread_id = info[i].thread_id;
diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
index 2bbd492d62da..68f6c645ef79 100644
--- a/src/qemu/qemu_domain.h
+++ b/src/qemu/qemu_domain.h
@@ -324,6 +324,7 @@ struct _qemuDomainVcpuPrivate {
     int thread_id;
     int node_id;
     int vcpus;
+    char *hp_timeout_alias; /* alias after hotplug timeout */
 
     char *qomPath;
 };
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index da92ced2f444..5d54e9c67c3f 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -6138,9 +6138,21 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
 
         if (STREQ_NULLABLE(alias, vcpupriv->alias)) {
             qemuDomainRemoveVcpu(vm, i);
+            if (vcpupriv->hp_timeout_alias)
+                g_free(vcpupriv->hp_timeout_alias);
+            return;
+        }
+        if (STREQ_NULLABLE(alias, vcpupriv->hp_timeout_alias)) {
+            qemuDomainRemoveVcpu(vm, i);
+            VIR_DEBUG("%s found in timed out list of domain %s",
+                       alias, vm->def->name);
+            g_free(vcpupriv->hp_timeout_alias);
             return;
         }
     }
+
+    VIR_DEBUG("%s not found in vcpulist of domain %s ",
+              alias, vm->def->name);
 }
 
 
@@ -6172,11 +6184,19 @@ qemuDomainHotplugDelVcpu(virQEMUDriver *driver,
     }
 
     if ((rc = qemuDomainWaitForDeviceRemoval(vm)) <= 0) {
-        if (rc == 0)
+        if (rc == 0) {
             virReportError(VIR_ERR_OPERATION_TIMEOUT, "%s",
                            _("vcpu unplug request timed out. Unplug result "
                              "must be manually inspected in the domain"));
 
+            if (vcpupriv) {
+                VIR_INFO("unplug timeout set for vcpu '%u' of domain %s",
+                         vcpu, vm->def->name);
+
+                vcpupriv->hp_timeout_alias = g_strdup(vcpupriv->alias);
+            }
+        }
+
         goto cleanup;
     }
 
@@ -6362,6 +6382,8 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
 {
     qemuDomainObjPrivate *priv = vm->privateData;
     virCgroupEmulatorAllNodesData *emulatorCgroup = NULL;
+    qemuDomainVcpuPrivate *vcpupriv = NULL;
+    virDomainVcpuDef *vcpuinfo = NULL;
     ssize_t nextvcpu = -1;
     int ret = -1;
 
@@ -6370,6 +6392,14 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
 
     if (enable) {
         while ((nextvcpu = virBitmapNextSetBit(vcpumap, nextvcpu)) != -1) {
+            vcpuinfo = virDomainDefGetVcpu(vm->def, nextvcpu);
+            vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
+
+            if (vcpupriv && vcpupriv->hp_timeout_alias) {
+                VIR_INFO("reject hotplug of timed out vcpu '%zd' from domain %s",
+                         nextvcpu, vm->def->name);
+                goto cleanup;
+            }
             if (qemuDomainHotplugAddVcpu(driver, cfg, vm, nextvcpu) < 0)
                 goto cleanup;
         }
@@ -6378,6 +6408,15 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
             if (!virBitmapIsBitSet(vcpumap, nextvcpu))
                 continue;
 
+            vcpuinfo = virDomainDefGetVcpu(vm->def, nextvcpu);
+            vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
+
+            if (vcpupriv && vcpupriv->hp_timeout_alias) {
+                VIR_INFO("already serving unplug for vcpu '%zd' in domain %s",
+                         nextvcpu, vm->def->name);
+                goto cleanup;
+            }
+
             if (qemuDomainHotplugDelVcpu(driver, cfg, vm, nextvcpu) < 0)
                 goto cleanup;
         }
-- 
2.31.1
Re: [PATCH] qemu: fix vcpu clearing when multiple vcpu hotunplugs timeout
Posted by Peter Krempa 1 year, 5 months ago
On Fri, Nov 04, 2022 at 17:52:26 +0530, Shaleen Bathla wrote:
> Problem:
> libvirt has a 5 second timeout (generally) for hotplug/unplug
> operations which can time out due to heavy load in guest.
> 
> vcpu hotunplug occurs one vcpu at a time.
> But, if we perform hotplug-unplug repeatedly, there is a situation
> where qemu has multiple timedout vcpu unplug operations pending.
> 
> libvirt waits for an async event notification from qemu regarding
> successful vcpu delete.
> 
> qemu deletes the vcpu-vcpuinfo and sends this notification to libvirt.
> 
> libvirt handles vcpu delete notification, and refreshes vcpuinfo
> by querying vcpuinfo from qemu in qemuDomainRefreshVcpuInfo().
> 
> qemu's vcpuinfo will not contain vcpu that was deleted and sent.
> qemu's vcpuinfo will overwrite libvirt's vcpuinfo of the other pending
> timedout vcpu(s) which qemu has deleted and notified libvirt.
> The overwrite resets other timedout vcpu's alias to NULL and tid to 0.
> 
> The error is then seen when validating tid of vcpus.
> Example error log:
> "internal error: qemu didn't report thread id for vcpu 'XX'"
> 
> Solution:
> Create a per vcpu hp_timeout_alias which will be set as vcpu alias when a timeout
> occurs else it is set as NULL.
> This data does not get reset when vcpuinfo refresh occurs.
> And we can remove a vcpu if it's alias was NULL by checking this alias.
> During vcpu validation, we can skip to nextvcpu if this hotplug-timeout-alias
> is set indicating that this vcpu is actually valid.

So the problem you are describing happens when qemu either:

1) bunches up two unplug confirmations before libvirt processed the first one

In this case the call to 'qemuDomainRemoveVcpu' to remove the first cpu
already gets updated CPU data where both cpus are removed.

Thus qemuDomainValidateVcpuInfo fails.

2) when attempting a hotplug at which point qemu finishes unplug of another cpu

After the new cpu is successfully plugged in the updated data doesn't
contain the unplugged cpu's data.

To correct the hotplug/unplug code paths
(qemuDomainHotplugAddVcpu/qemuDomainRemoveVcpu) I think a better
solution will be to validate only the single CPU entity which is handled
by the corresponding hotplug API.

This means replacing the qemuDomainValidateVcpuInfo by modifying the
following block (qemuDomainHotplugAddVcpu) to do the validation:

    /* validation requires us to set the expected state prior to calling it */
    for (i = vcpu; i < vcpu + nvcpus; i++) {
        vcpuinfo = virDomainDefGetVcpu(vm->def, i);
        vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);

        vcpuinfo->online = true;

        if (vcpupriv->tid > 0 &&
            qemuProcessSetupVcpu(vm, i, true) < 0)
            return -1;
    }

and modify it such that it does the validation internally:

    bool hasVcpuPids = qemuDomainHasVcpuPids(vm));

    [...]

    /* Validate that the new vcpus have thread IDs if needed */
    for (i = vcpu; i < vcpu + nvcpus; i++) {
        vcpuinfo = virDomainDefGetVcpu(vm->def, i);
        vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);

        vcpuinfo->online = true;

        if (vcpupriv->tid > 0) {
            if (qemuProcessSetupVcpu(vm, i, true) < 0)
                return -1;
        } else {
            if (hasVcpuPids) {
                virReportError(VIR_ERR_INTERNAL_ERROR,
                               _("qemu didn't report thread id for vcpu '%zu'"), i);
                return -1;

            }
        }
    }

Similarly for the unplug code path.

The code in qemuDomainRefreshVcpuInfo which refreshes the alias should
also do so only when the data reported from qemu do report it:

        if (info[i].alias) {
            VIR_FREE(vcpupriv->alias);
            vcpupriv->alias = g_steal_pointer(&info[i].alias);
        }

That way you will not need the additional variable. The alias will then
be cleared in qemuDomainRemoveVcpu after handling the unplug finalizing
steps.

Note that the sub-sequent review comments were written before the above
paragraph.

> Co-authored-by: Partha Satapathy <partha.satapathy@oracle.com>
> Signed-off-by: Shaleen Bathla <shaleen.bathla@oracle.com>
> ---
>  src/qemu/qemu_domain.c  |  6 ++++++
>  src/qemu/qemu_domain.h  |  1 +
>  src/qemu/qemu_hotplug.c | 41 ++++++++++++++++++++++++++++++++++++++++-
>  3 files changed, 47 insertions(+), 1 deletion(-)
> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index c3afc6c9d3ba..0f6e4dc78b93 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -9568,6 +9568,8 @@ qemuDomainValidateVcpuInfo(virDomainObj *vm)
>          vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpu);
>  
>          if (vcpu->online && vcpupriv->tid == 0) {
> +            if (vcpupriv->hp_timeout_alias)
> +                continue;

Add extra newline here to separate the blocks. Ideally also add a
comment explaining that if 'hp_timeout_alias' is set there's a pending
device removal so the thread ID might not be present any more.

>              virReportError(VIR_ERR_INTERNAL_ERROR,
>                             _("qemu didn't report thread id for vcpu '%zu'"), i);
>              return -1;
> @@ -9696,6 +9698,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
>          if (validTIDs)
>              vcpupriv->tid = info[i].tid;
>  
> +        if (vcpupriv->hp_timeout_alias && !info[i].alias)
> +            VIR_DEBUG("no alias for timed out vcpu%zu domain %s", i,
> +                      vm->def->name);

This debug statement doesn't seem to make too much sense here.

> +
>          vcpupriv->socket_id = info[i].socket_id;
>          vcpupriv->core_id = info[i].core_id;
>          vcpupriv->thread_id = info[i].thread_id;
> diff --git a/src/qemu/qemu_domain.h b/src/qemu/qemu_domain.h
> index 2bbd492d62da..68f6c645ef79 100644
> --- a/src/qemu/qemu_domain.h
> +++ b/src/qemu/qemu_domain.h
> @@ -324,6 +324,7 @@ struct _qemuDomainVcpuPrivate {
>      int thread_id;
>      int node_id;
>      int vcpus;
> +    char *hp_timeout_alias; /* alias after hotplug timeout */

Missing corresponding freeing of this new field in
qemuDomainVcpuPrivateDispose.

That is required regardless of if you clear it in other ways.

>  
>      char *qomPath;
>  };
> diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
> index da92ced2f444..5d54e9c67c3f 100644
> --- a/src/qemu/qemu_hotplug.c
> +++ b/src/qemu/qemu_hotplug.c
> @@ -6138,9 +6138,21 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
>  
>          if (STREQ_NULLABLE(alias, vcpupriv->alias)) {
>              qemuDomainRemoveVcpu(vm, i);
> +            if (vcpupriv->hp_timeout_alias)

Checking the pointer before a free is not needed. g_free can handle NULL
pointers.

> +                g_free(vcpupriv->hp_timeout_alias);

This can be deferred to qemuDomainVcpuPrivateDispose.

> +            return;
> +        }
> +        if (STREQ_NULLABLE(alias, vcpupriv->hp_timeout_alias)) {
> +            qemuDomainRemoveVcpu(vm, i);
> +            VIR_DEBUG("%s found in timed out list of domain %s",
> +                       alias, vm->def->name);

Drop this debug statement. It doesn't matter how we found it.

> +            g_free(vcpupriv->hp_timeout_alias);
Same as above.
>              return;
>          }
>      }
> +
> +    VIR_DEBUG("%s not found in vcpulist of domain %s ",
> +              alias, vm->def->name);
>  }
>  
>  
> @@ -6172,11 +6184,19 @@ qemuDomainHotplugDelVcpu(virQEMUDriver *driver,
>      }
>  
>      if ((rc = qemuDomainWaitForDeviceRemoval(vm)) <= 0) {
> -        if (rc == 0)
> +        if (rc == 0) {
>              virReportError(VIR_ERR_OPERATION_TIMEOUT, "%s",
>                             _("vcpu unplug request timed out. Unplug result "
>                               "must be manually inspected in the domain"));
>  
> +            if (vcpupriv) {
> +                VIR_INFO("unplug timeout set for vcpu '%u' of domain %s",
> +                         vcpu, vm->def->name);

This VIR_INFO call is not needed as the above ee

> +
> +                vcpupriv->hp_timeout_alias = g_strdup(vcpupriv->alias);
> +            }
> +        }
> +
>          goto cleanup;
>      }
>  
> @@ -6362,6 +6382,8 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
>  {
>      qemuDomainObjPrivate *priv = vm->privateData;
>      virCgroupEmulatorAllNodesData *emulatorCgroup = NULL;
> +    qemuDomainVcpuPrivate *vcpupriv = NULL;
> +    virDomainVcpuDef *vcpuinfo = NULL;

Declare these helper variables in the appropriate loop block/scope.

>      ssize_t nextvcpu = -1;
>      int ret = -1;
>  
> @@ -6370,6 +6392,14 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
>  
>      if (enable) {
>          while ((nextvcpu = virBitmapNextSetBit(vcpumap, nextvcpu)) != -1) {
> +            vcpuinfo = virDomainDefGetVcpu(vm->def, nextvcpu);
> +            vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
> +
> +            if (vcpupriv && vcpupriv->hp_timeout_alias) {
> +                VIR_INFO("reject hotplug of timed out vcpu '%zd' from domain %s",
> +                         nextvcpu, vm->def->name);

The VIR_INFO call purely logs the message but does not set the
'virError' object which is returned to the caller. Since this function
returns failure (it's called directly from qemuDomainSetVcpu, and no
other error is reported) in this case the error must be set properly.

If this is an error always use 'virReportError'. If it is something that
should be ignored, in most cases VIR_DEBUG is the proper logging
statement.

Additionally this code makes no sense here at all looking back at how
live hotplug is handled. In cases when the CPU is still considered
online by libvirt (your code specifically handles this case), then the
control flow never reaches this point, but rather the caller
'qemuDomainSetVcpuInternal' calls 'qemuDomainFilterHotplugVcpuEntities'
which doesn't allow onlining a cpu which is already online.

> +                goto cleanup;
> +            }
>              if (qemuDomainHotplugAddVcpu(driver, cfg, vm, nextvcpu) < 0)
>                  goto cleanup;
>          }
> @@ -6378,6 +6408,15 @@ qemuDomainSetVcpusLive(virQEMUDriver *driver,
>              if (!virBitmapIsBitSet(vcpumap, nextvcpu))
>                  continue;
>  
> +            vcpuinfo = virDomainDefGetVcpu(vm->def, nextvcpu);
> +            vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
> +
> +            if (vcpupriv && vcpupriv->hp_timeout_alias) {
> +                VIR_INFO("already serving unplug for vcpu '%zd' in domain %s",
> +                         nextvcpu, vm->def->name);
> +                goto cleanup;
> +            }

Same as above. Similarly to the additional note, this code is here not
wanted either, as it prevents re-sending of the device deletion request
which might be needed in some cases.

> +
>              if (qemuDomainHotplugDelVcpu(driver, cfg, vm, nextvcpu) < 0)
>                  goto cleanup;
>          }
> -- 
> 2.31.1
>