[PATCH v3] qemu: Don't report spurious errors from vCPU tid validation on hotunplug timeout

Peter Krempa posted 1 patch 1 week, 5 days ago
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/82ff159672c83e737b044d28398a12019fffbf9d.1669126039.git.pkrempa@redhat.com
src/qemu/qemu_domain.c  |  6 +++--
src/qemu/qemu_hotplug.c | 53 ++++++++++++++++++++++++-----------------
2 files changed, 35 insertions(+), 24 deletions(-)
[PATCH v3] qemu: Don't report spurious errors from vCPU tid validation on hotunplug timeout
Posted by Peter Krempa 1 week, 5 days ago
From: Shaleen Bathla <shaleen.bathla@oracle.com>

Use of qemuDomainValidateVcpuInfo in the helpers for hotplug and unplug
of vCPUs can lead to spurious errors reported such as:

  internal error: qemu didn't report thread id for vcpu 'XX'"

The reason for this is that qemuDomainValidateVcpuInfo validates the
state of all vCPUs against the expected state of vCPUs. If an unplug
operation completed before libvirt was unable to process it yet the
expected state could not reflect the current state.

To avoid spurious errors the qemuDomainHotplugAddVcpu and
qemuDomainRemoveVcpu functions are modified to do localized validation
only for the vCPUs they actually modify.

We also now ensure that the cgroups are modified before bailing out on
error for any vCPUs which passed validation.

Additionally in order for qemuDomainRemoveVcpuAlias to be able to find
the unplugged vCPU we must ensure that qemuDomainRefreshVcpuInfo does
not clear out the alias in case when the vCPU is no longer reported by
qemu.

Co-authored-by: Partha Satapathy <partha.satapathy@oracle.com>
Signed-off-by: Shaleen Bathla <shaleen.bathla@oracle.com>
Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---

v3 addresses my review feedback of the original patch, as well as
rewrites the commit message for more clarity.

 src/qemu/qemu_domain.c  |  6 +++--
 src/qemu/qemu_hotplug.c | 53 ++++++++++++++++++++++++-----------------
 2 files changed, 35 insertions(+), 24 deletions(-)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index ef1a9c8c74..64ebec626c 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -9795,8 +9795,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
         vcpupriv->vcpus = info[i].vcpus;
         VIR_FREE(vcpupriv->type);
         vcpupriv->type = g_steal_pointer(&info[i].type);
-        VIR_FREE(vcpupriv->alias);
-        vcpupriv->alias = g_steal_pointer(&info[i].alias);
+        if (info[i].alias) {
+            VIR_FREE(vcpupriv->alias);
+            vcpupriv->alias = g_steal_pointer(&info[i].alias);
+        }
         virJSONValueFree(vcpupriv->props);
         vcpupriv->props = g_steal_pointer(&info[i].props);
         vcpupriv->enable_id = info[i].id;
diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
index da92ced2f4..6e300f547c 100644
--- a/src/qemu/qemu_hotplug.c
+++ b/src/qemu/qemu_hotplug.c
@@ -6088,38 +6088,37 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
     qemuDomainVcpuPrivate *vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
     int oldvcpus = virDomainDefGetVcpus(vm->def);
     unsigned int nvcpus = vcpupriv->vcpus;
-    virErrorPtr save_error = NULL;
     size_t i;
+    ssize_t offlineVcpuWithTid = -1;

     if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
         return -1;

-    /* validation requires us to set the expected state prior to calling it */
     for (i = vcpu; i < vcpu + nvcpus; i++) {
         vcpuinfo = virDomainDefGetVcpu(vm->def, i);
-        vcpuinfo->online = false;
+        vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
+
+        if (vcpupriv->tid == 0) {
+            vcpuinfo->online = false;
+            /* Clear the alias as VCPU is now unplugged */
+            VIR_FREE(vcpupriv->alias);
+            ignore_value(virCgroupDelThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i));
+        } else {
+            if (offlineVcpuWithTid == -1)
+                offlineVcpuWithTid = i;
+        }
     }

-    if (qemuDomainValidateVcpuInfo(vm) < 0) {
-        /* rollback vcpu count if the setting has failed */
+    if (offlineVcpuWithTid != -1) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("qemu reported thread id for inactive vcpu '%zu'"),
+                       offlineVcpuWithTid);
         virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false);
-
-        for (i = vcpu; i < vcpu + nvcpus; i++) {
-            vcpuinfo = virDomainDefGetVcpu(vm->def, i);
-            vcpuinfo->online = true;
-        }
         return -1;
     }

     virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", true);

-    virErrorPreserveLast(&save_error);
-
-    for (i = vcpu; i < vcpu + nvcpus; i++)
-        ignore_value(virCgroupDelThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i));
-
-    virErrorRestore(&save_error);
-
     return 0;
 }

@@ -6141,6 +6140,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
             return;
         }
     }
+
+    VIR_DEBUG("vcpu '%s' not found in vcpulist of domain '%s'",
+              alias, vm->def->name);
 }


@@ -6209,6 +6211,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
     int rc;
     int oldvcpus = virDomainDefGetVcpus(vm->def);
     size_t i;
+    bool vcpuTidMissing = false;

     if (!qemuDomainSupportsNewVcpuHotplug(vm)) {
         virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
@@ -6238,20 +6241,26 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
     if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
         return -1;

-    /* validation requires us to set the expected state prior to calling it */
     for (i = vcpu; i < vcpu + nvcpus; i++) {
         vcpuinfo = virDomainDefGetVcpu(vm->def, i);
         vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);

         vcpuinfo->online = true;

-        if (vcpupriv->tid > 0 &&
-            qemuProcessSetupVcpu(vm, i, true) < 0)
-            return -1;
+        if (vcpupriv->tid > 0) {
+            if (qemuProcessSetupVcpu(vm, i, true) < 0) {
+                return -1;
+            }
+        } else {
+            vcpuTidMissing = true;
+        }
     }

-    if (qemuDomainValidateVcpuInfo(vm) < 0)
+    if (vcpuTidMissing && qemuDomainHasVcpuPids(vm)) {
+        virReportError(VIR_ERR_INTERNAL_ERROR,
+                       _("qemu didn't report thread id for vcpu '%zu'"), i);
         return -1;
+    }

     qemuDomainVcpuPersistOrder(vm->def);

-- 
2.37.3
Re: [External] : [PATCH v3] qemu: Don't report spurious errors from vCPU tid validation on hotunplug timeout
Posted by Shaleen Bathla 1 week, 4 days ago
Tested internally and v3 also works well.

Thanks and Regards,
Shaleen Bathla

On Tue, Nov 22, 2022 at 03:08:43PM +0100, Peter Krempa wrote:
> From: Shaleen Bathla <shaleen.bathla@oracle.com>
> 
> Use of qemuDomainValidateVcpuInfo in the helpers for hotplug and unplug
> of vCPUs can lead to spurious errors reported such as:
> 
>   internal error: qemu didn't report thread id for vcpu 'XX'"
> 
> The reason for this is that qemuDomainValidateVcpuInfo validates the
> state of all vCPUs against the expected state of vCPUs. If an unplug
> operation completed before libvirt was unable to process it yet the
> expected state could not reflect the current state.
> 
> To avoid spurious errors the qemuDomainHotplugAddVcpu and
> qemuDomainRemoveVcpu functions are modified to do localized validation
> only for the vCPUs they actually modify.
> 
> We also now ensure that the cgroups are modified before bailing out on
> error for any vCPUs which passed validation.
> 
> Additionally in order for qemuDomainRemoveVcpuAlias to be able to find
> the unplugged vCPU we must ensure that qemuDomainRefreshVcpuInfo does
> not clear out the alias in case when the vCPU is no longer reported by
> qemu.
> 
> Co-authored-by: Partha Satapathy <partha.satapathy@oracle.com>
> Signed-off-by: Shaleen Bathla <shaleen.bathla@oracle.com>
> Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> ---
> 
> v3 addresses my review feedback of the original patch, as well as
> rewrites the commit message for more clarity.
> 
>  src/qemu/qemu_domain.c  |  6 +++--
>  src/qemu/qemu_hotplug.c | 53 ++++++++++++++++++++++++-----------------
>  2 files changed, 35 insertions(+), 24 deletions(-)
> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index ef1a9c8c74..64ebec626c 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -9795,8 +9795,10 @@ qemuDomainRefreshVcpuInfo(virDomainObj *vm,
>          vcpupriv->vcpus = info[i].vcpus;
>          VIR_FREE(vcpupriv->type);
>          vcpupriv->type = g_steal_pointer(&info[i].type);
> -        VIR_FREE(vcpupriv->alias);
> -        vcpupriv->alias = g_steal_pointer(&info[i].alias);
> +        if (info[i].alias) {
> +            VIR_FREE(vcpupriv->alias);
> +            vcpupriv->alias = g_steal_pointer(&info[i].alias);
> +        }
>          virJSONValueFree(vcpupriv->props);
>          vcpupriv->props = g_steal_pointer(&info[i].props);
>          vcpupriv->enable_id = info[i].id;
> diff --git a/src/qemu/qemu_hotplug.c b/src/qemu/qemu_hotplug.c
> index da92ced2f4..6e300f547c 100644
> --- a/src/qemu/qemu_hotplug.c
> +++ b/src/qemu/qemu_hotplug.c
> @@ -6088,38 +6088,37 @@ qemuDomainRemoveVcpu(virDomainObj *vm,
>      qemuDomainVcpuPrivate *vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
>      int oldvcpus = virDomainDefGetVcpus(vm->def);
>      unsigned int nvcpus = vcpupriv->vcpus;
> -    virErrorPtr save_error = NULL;
>      size_t i;
> +    ssize_t offlineVcpuWithTid = -1;
> 
>      if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
>          return -1;
> 
> -    /* validation requires us to set the expected state prior to calling it */
>      for (i = vcpu; i < vcpu + nvcpus; i++) {
>          vcpuinfo = virDomainDefGetVcpu(vm->def, i);
> -        vcpuinfo->online = false;
> +        vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
> +
> +        if (vcpupriv->tid == 0) {
> +            vcpuinfo->online = false;
> +            /* Clear the alias as VCPU is now unplugged */
> +            VIR_FREE(vcpupriv->alias);
> +            ignore_value(virCgroupDelThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i));
> +        } else {
> +            if (offlineVcpuWithTid == -1)
> +                offlineVcpuWithTid = i;
> +        }
>      }
> 
> -    if (qemuDomainValidateVcpuInfo(vm) < 0) {
> -        /* rollback vcpu count if the setting has failed */
> +    if (offlineVcpuWithTid != -1) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR,
> +                       _("qemu reported thread id for inactive vcpu '%zu'"),
> +                       offlineVcpuWithTid);
>          virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", false);
> -
> -        for (i = vcpu; i < vcpu + nvcpus; i++) {
> -            vcpuinfo = virDomainDefGetVcpu(vm->def, i);
> -            vcpuinfo->online = true;
> -        }
>          return -1;
>      }
> 
>      virDomainAuditVcpu(vm, oldvcpus, oldvcpus - nvcpus, "update", true);
> 
> -    virErrorPreserveLast(&save_error);
> -
> -    for (i = vcpu; i < vcpu + nvcpus; i++)
> -        ignore_value(virCgroupDelThread(priv->cgroup, VIR_CGROUP_THREAD_VCPU, i));
> -
> -    virErrorRestore(&save_error);
> -
>      return 0;
>  }
> 
> @@ -6141,6 +6140,9 @@ qemuDomainRemoveVcpuAlias(virDomainObj *vm,
>              return;
>          }
>      }
> +
> +    VIR_DEBUG("vcpu '%s' not found in vcpulist of domain '%s'",
> +              alias, vm->def->name);
>  }
> 
> 
> @@ -6209,6 +6211,7 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
>      int rc;
>      int oldvcpus = virDomainDefGetVcpus(vm->def);
>      size_t i;
> +    bool vcpuTidMissing = false;
> 
>      if (!qemuDomainSupportsNewVcpuHotplug(vm)) {
>          virReportError(VIR_ERR_OPERATION_UNSUPPORTED, "%s",
> @@ -6238,20 +6241,26 @@ qemuDomainHotplugAddVcpu(virQEMUDriver *driver,
>      if (qemuDomainRefreshVcpuInfo(vm, VIR_ASYNC_JOB_NONE, false) < 0)
>          return -1;
> 
> -    /* validation requires us to set the expected state prior to calling it */
>      for (i = vcpu; i < vcpu + nvcpus; i++) {
>          vcpuinfo = virDomainDefGetVcpu(vm->def, i);
>          vcpupriv = QEMU_DOMAIN_VCPU_PRIVATE(vcpuinfo);
> 
>          vcpuinfo->online = true;
> 
> -        if (vcpupriv->tid > 0 &&
> -            qemuProcessSetupVcpu(vm, i, true) < 0)
> -            return -1;
> +        if (vcpupriv->tid > 0) {
> +            if (qemuProcessSetupVcpu(vm, i, true) < 0) {
> +                return -1;
> +            }
> +        } else {
> +            vcpuTidMissing = true;
> +        }
>      }
> 
> -    if (qemuDomainValidateVcpuInfo(vm) < 0)
> +    if (vcpuTidMissing && qemuDomainHasVcpuPids(vm)) {
> +        virReportError(VIR_ERR_INTERNAL_ERROR,
> +                       _("qemu didn't report thread id for vcpu '%zu'"), i);
>          return -1;
> +    }
> 
>      qemuDomainVcpuPersistOrder(vm->def);
> 
> -- 
> 2.37.3
>