[PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions

Peter Krempa posted 2 patches 3 months ago
There is a newer version of this series
[PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Peter Krempa 3 months ago
The s390(x) machines never supported ACPI. That didn't stop users
enabling ACPI in their config. As of libvirt-9.2 (98c4e3d073) with new
enough qemu we reject configs which require ACPI, but qemu can't satisfy
it.

This breaks migration of existing VMs with the old wrong configs to new
libvirt installations.

To address this introduce a post-parse fixup removing the ACPI flag
specifically for s390 machines which do enable it in the definition.

The advantage of doing it in post-parse, rather than simply relaxing the
ABI stability check to allow users providing an fixed XML when migrating
(allowing change of the ACPI flag for s390 in ABI stability check, as it
 doesn't impact ABI), is that only the destination installation needs to
be patched in order to preserve migration.

The disadvantage is that users will not be notified that they are using
a broken configuration as it will be silently fixed. This is also the
reason why only specifically s390(x) machines are being fixed up despite
the problem also impacting other legacy architectures not supporting
ACPI as well.

Resolves: https://issues.redhat.com/browse/RHEL-49516
Signed-off-by: Peter Krempa <pkrempa@redhat.com>
---
 src/qemu/qemu_domain.c | 94 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
index 298f4bfb9e..4f77817215 100644
--- a/src/qemu/qemu_domain.c
+++ b/src/qemu/qemu_domain.c
@@ -5020,6 +5020,98 @@ qemuDomainDefPostParseBasic(virDomainDef *def,
 }


+/**
+ * qemuDomainDefACPIPostParse:
+ * @def: domain definition
+ * @qemuCaps: qemu capabilities object
+ *
+ * Fixup the use of ACPI flag on certain architectures that never supported it
+ * and users for some reason used it, which would break migration to newer
+ * libvirt versions which check whether given machine type supports ACPI.
+ */
+static void
+qemuDomainDefACPIPostParse(virDomainDef *def,
+                           virQEMUCaps *qemuCaps)
+{
+    bool stripACPI = false;
+
+    /* Only cases when ACPI is enabled need to be fixed up */
+    if (def->features[VIR_DOMAIN_FEATURE_ACPI] != VIR_TRISTATE_SWITCH_ON)
+        return;
+
+    switch (def->os.arch) {
+    /* x86(_64) and AARCH64 always supported ACPI */
+    case VIR_ARCH_I686:
+    case VIR_ARCH_X86_64:
+    case VIR_ARCH_AARCH64:
+        stripACPI = false;
+        break;
+
+    /* S390 never supported ACPI, but some users enabled it regardless in their
+     * configs for some reason.  In order to fix incoming migration we'll strip
+     * ACPI from those definitions, so that user configs are fixed by updating
+     * only the destination libvirt installation */
+    case VIR_ARCH_S390:
+    case VIR_ARCH_S390X:
+        stripACPI = true;
+        break;
+
+    /* Any other historic architectures could undergo the above treatment but
+     * since there were no user reports of invalid usage we avoid stripping ACPI */
+    case VIR_ARCH_ARMV6L:
+    case VIR_ARCH_ARMV7L:
+    case VIR_ARCH_ARMV7B:
+    case VIR_ARCH_PPC64:
+    case VIR_ARCH_PPC64LE:
+    case VIR_ARCH_ALPHA:
+    case VIR_ARCH_PPC:
+    case VIR_ARCH_PPCEMB:
+    case VIR_ARCH_SH4:
+    case VIR_ARCH_SH4EB:
+    case VIR_ARCH_RISCV32:
+    case VIR_ARCH_SPARC64:
+    case VIR_ARCH_MIPS:
+    case VIR_ARCH_MIPSEL:
+    case VIR_ARCH_MIPS64:
+    case VIR_ARCH_MIPS64EL:
+    case VIR_ARCH_CRIS:
+    case VIR_ARCH_ITANIUM:
+    case VIR_ARCH_LM32:
+    case VIR_ARCH_M68K:
+    case VIR_ARCH_MICROBLAZE:
+    case VIR_ARCH_MICROBLAZEEL:
+    case VIR_ARCH_OR32:
+    case VIR_ARCH_PARISC:
+    case VIR_ARCH_PARISC64:
+    case VIR_ARCH_PPCLE:
+    case VIR_ARCH_SPARC:
+    case VIR_ARCH_UNICORE32:
+    case VIR_ARCH_XTENSA:
+    case VIR_ARCH_XTENSAEB:
+        stripACPI = false;
+        break;
+
+    /* Any modern architecture must obey what qemu reports */
+    case VIR_ARCH_LOONGARCH64:
+    case VIR_ARCH_RISCV64:
+        stripACPI = false;
+        break;
+
+    case VIR_ARCH_NONE:
+    case VIR_ARCH_LAST:
+    default:
+        break;
+    }
+
+    /* To be sure, we only strip ACPI if given machine type doesn't support it
+     * or if the support state is unknown (which effectively means that it's
+     * unsupported, as we don't fixup x86(_64) and AARCH64) */
+    if (stripACPI &&
+        virQEMUCapsMachineSupportsACPI(qemuCaps, def->virtType, def->os.machine) != VIR_TRISTATE_BOOL_YES)
+        def->features[VIR_DOMAIN_FEATURE_ACPI] = VIR_TRISTATE_SWITCH_ABSENT;
+}
+
+
 static int
 qemuDomainDefPostParse(virDomainDef *def,
                        unsigned int parseFlags,
@@ -5040,6 +5132,8 @@ qemuDomainDefPostParse(virDomainDef *def,
     if (qemuDomainDefMachinePostParse(def, qemuCaps) < 0)
         return -1;

+    qemuDomainDefACPIPostParse(def, qemuCaps);
+
     if (qemuDomainDefBootPostParse(def, driver, parseFlags) < 0)
         return -1;

-- 
2.45.2
Re: [PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Andrea Bolognani 3 months ago
On Wed, Jul 31, 2024 at 01:02:26PM GMT, Peter Krempa wrote:
> The s390(x) machines never supported ACPI. That didn't stop users
> enabling ACPI in their config. As of libvirt-9.2 (98c4e3d073) with new
> enough qemu we reject configs which require ACPI, but qemu can't satisfy
> it.
>
> This breaks migration of existing VMs with the old wrong configs to new
> libvirt installations.
>
> To address this introduce a post-parse fixup removing the ACPI flag
> specifically for s390 machines which do enable it in the definition.
>
> The advantage of doing it in post-parse, rather than simply relaxing the
> ABI stability check to allow users providing an fixed XML when migrating
> (allowing change of the ACPI flag for s390 in ABI stability check, as it
>  doesn't impact ABI), is that only the destination installation needs to
> be patched in order to preserve migration.
>
> The disadvantage is that users will not be notified that they are using
> a broken configuration as it will be silently fixed.

More on this below.

> +/**
> + * qemuDomainDefACPIPostParse:
> + * @def: domain definition
> + * @qemuCaps: qemu capabilities object
> + *
> + * Fixup the use of ACPI flag on certain architectures that never supported it
> + * and users for some reason used it, which would break migration to newer
> + * libvirt versions which check whether given machine type supports ACPI.
> + */
> +static void
> +qemuDomainDefACPIPostParse(virDomainDef *def,
> +                           virQEMUCaps *qemuCaps)
> +{
> +    bool stripACPI = false;
> +
> +    /* Only cases when ACPI is enabled need to be fixed up */
> +    if (def->features[VIR_DOMAIN_FEATURE_ACPI] != VIR_TRISTATE_SWITCH_ON)
> +        return;
> +
> +    switch (def->os.arch) {
> +    /* x86(_64) and AARCH64 always supported ACPI */
> +    case VIR_ARCH_I686:
> +    case VIR_ARCH_X86_64:
> +    case VIR_ARCH_AARCH64:
> +        stripACPI = false;
> +        break;
> +
> +    /* S390 never supported ACPI, but some users enabled it regardless in their
> +     * configs for some reason.  In order to fix incoming migration we'll strip
> +     * ACPI from those definitions, so that user configs are fixed by updating
> +     * only the destination libvirt installation */
> +    case VIR_ARCH_S390:
> +    case VIR_ARCH_S390X:
> +        stripACPI = true;
> +        break;
> +
> +    /* Any other historic architectures could undergo the above treatment but
> +     * since there were no user reports of invalid usage we avoid stripping ACPI */
> +    case VIR_ARCH_ARMV6L:
> +    case VIR_ARCH_ARMV7L:
> +    case VIR_ARCH_ARMV7B:
> +    case VIR_ARCH_PPC64:
> +    case VIR_ARCH_PPC64LE:
> +    case VIR_ARCH_ALPHA:
> +    case VIR_ARCH_PPC:
> +    case VIR_ARCH_PPCEMB:
> +    case VIR_ARCH_SH4:
> +    case VIR_ARCH_SH4EB:
> +    case VIR_ARCH_RISCV32:
> +    case VIR_ARCH_SPARC64:
> +    case VIR_ARCH_MIPS:
> +    case VIR_ARCH_MIPSEL:
> +    case VIR_ARCH_MIPS64:
> +    case VIR_ARCH_MIPS64EL:
> +    case VIR_ARCH_CRIS:
> +    case VIR_ARCH_ITANIUM:
> +    case VIR_ARCH_LM32:
> +    case VIR_ARCH_M68K:
> +    case VIR_ARCH_MICROBLAZE:
> +    case VIR_ARCH_MICROBLAZEEL:
> +    case VIR_ARCH_OR32:
> +    case VIR_ARCH_PARISC:
> +    case VIR_ARCH_PARISC64:
> +    case VIR_ARCH_PPCLE:
> +    case VIR_ARCH_SPARC:
> +    case VIR_ARCH_UNICORE32:
> +    case VIR_ARCH_XTENSA:
> +    case VIR_ARCH_XTENSAEB:
> +        stripACPI = false;
> +        break;
> +
> +    /* Any modern architecture must obey what qemu reports */
> +    case VIR_ARCH_LOONGARCH64:
> +    case VIR_ARCH_RISCV64:
> +        stripACPI = false;
> +        break;
> +
> +    case VIR_ARCH_NONE:
> +    case VIR_ARCH_LAST:
> +    default:
> +        break;
> +    }

This looks unnecessarily verbose. I kinda get the idea, but
considering that we're unlikely to go back and do anything about most
of the architectures that we're currently leaving alone, does it
really make sense to have all this code instead of just

  bool stripACPI = false;

  /* S390 never supported ACPI, but some users enabled it regardless in their
   * configs for some reason.  In order to fix incoming migration we'll strip
   * ACPI from those definitions, so that user configs are fixed by updating
   * only the destination libvirt installation */
  if (def->os.arch == VIR_ARCH_S390 || def->os.arch == VIR_ARCH_S390X)
      stripACPI = true;

? Maybe there's some motivation that I'm not seeing.

> +    /* To be sure, we only strip ACPI if given machine type doesn't support it
> +     * or if the support state is unknown (which effectively means that it's
> +     * unsupported, as we don't fixup x86(_64) and AARCH64) */
> +    if (stripACPI &&
> +        virQEMUCapsMachineSupportsACPI(qemuCaps, def->virtType, def->os.machine) != VIR_TRISTATE_BOOL_YES)
> +        def->features[VIR_DOMAIN_FEATURE_ACPI] = VIR_TRISTATE_SWITCH_ABSENT;

Clever safety measure.


Regarding the disadvantage you mention in the commit message, how
about we implement a middle-ground solution instead? If we only
actually strip the feature when

  !(parseFlags & VIR_DOMAIN_DEF_PARSE_ABI_UPDATE)

then we still silently fix migration, but we also prevent new s390x
domains with ACPI (supposedly) enabled from being defined, and
generally limit the scope of the workaround as much as possible.

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Peter Krempa 3 months ago
On Thu, Aug 01, 2024 at 02:35:59 -0700, Andrea Bolognani wrote:
> On Wed, Jul 31, 2024 at 01:02:26PM GMT, Peter Krempa wrote:
> > The s390(x) machines never supported ACPI. That didn't stop users
> > enabling ACPI in their config. As of libvirt-9.2 (98c4e3d073) with new
> > enough qemu we reject configs which require ACPI, but qemu can't satisfy
> > it.
> >
> > This breaks migration of existing VMs with the old wrong configs to new
> > libvirt installations.
> >
> > To address this introduce a post-parse fixup removing the ACPI flag
> > specifically for s390 machines which do enable it in the definition.
> >
> > The advantage of doing it in post-parse, rather than simply relaxing the
> > ABI stability check to allow users providing an fixed XML when migrating
> > (allowing change of the ACPI flag for s390 in ABI stability check, as it
> >  doesn't impact ABI), is that only the destination installation needs to
> > be patched in order to preserve migration.
> >
> > The disadvantage is that users will not be notified that they are using
> > a broken configuration as it will be silently fixed.
> 
> More on this below.
> 
> > +/**
> > + * qemuDomainDefACPIPostParse:
> > + * @def: domain definition
> > + * @qemuCaps: qemu capabilities object
> > + *
> > + * Fixup the use of ACPI flag on certain architectures that never supported it
> > + * and users for some reason used it, which would break migration to newer
> > + * libvirt versions which check whether given machine type supports ACPI.
> > + */
> > +static void
> > +qemuDomainDefACPIPostParse(virDomainDef *def,
> > +                           virQEMUCaps *qemuCaps)
> > +{
> > +    bool stripACPI = false;
> > +
> > +    /* Only cases when ACPI is enabled need to be fixed up */
> > +    if (def->features[VIR_DOMAIN_FEATURE_ACPI] != VIR_TRISTATE_SWITCH_ON)
> > +        return;
> > +
> > +    switch (def->os.arch) {
> > +    /* x86(_64) and AARCH64 always supported ACPI */
> > +    case VIR_ARCH_I686:
> > +    case VIR_ARCH_X86_64:
> > +    case VIR_ARCH_AARCH64:
> > +        stripACPI = false;
> > +        break;
> > +
> > +    /* S390 never supported ACPI, but some users enabled it regardless in their
> > +     * configs for some reason.  In order to fix incoming migration we'll strip
> > +     * ACPI from those definitions, so that user configs are fixed by updating
> > +     * only the destination libvirt installation */
> > +    case VIR_ARCH_S390:
> > +    case VIR_ARCH_S390X:
> > +        stripACPI = true;
> > +        break;
> > +
> > +    /* Any other historic architectures could undergo the above treatment but
> > +     * since there were no user reports of invalid usage we avoid stripping ACPI */
> > +    case VIR_ARCH_ARMV6L:
> > +    case VIR_ARCH_ARMV7L:
> > +    case VIR_ARCH_ARMV7B:
> > +    case VIR_ARCH_PPC64:
> > +    case VIR_ARCH_PPC64LE:
> > +    case VIR_ARCH_ALPHA:
> > +    case VIR_ARCH_PPC:
> > +    case VIR_ARCH_PPCEMB:
> > +    case VIR_ARCH_SH4:
> > +    case VIR_ARCH_SH4EB:
> > +    case VIR_ARCH_RISCV32:
> > +    case VIR_ARCH_SPARC64:
> > +    case VIR_ARCH_MIPS:
> > +    case VIR_ARCH_MIPSEL:
> > +    case VIR_ARCH_MIPS64:
> > +    case VIR_ARCH_MIPS64EL:
> > +    case VIR_ARCH_CRIS:
> > +    case VIR_ARCH_ITANIUM:
> > +    case VIR_ARCH_LM32:
> > +    case VIR_ARCH_M68K:
> > +    case VIR_ARCH_MICROBLAZE:
> > +    case VIR_ARCH_MICROBLAZEEL:
> > +    case VIR_ARCH_OR32:
> > +    case VIR_ARCH_PARISC:
> > +    case VIR_ARCH_PARISC64:
> > +    case VIR_ARCH_PPCLE:
> > +    case VIR_ARCH_SPARC:
> > +    case VIR_ARCH_UNICORE32:
> > +    case VIR_ARCH_XTENSA:
> > +    case VIR_ARCH_XTENSAEB:
> > +        stripACPI = false;
> > +        break;
> > +
> > +    /* Any modern architecture must obey what qemu reports */
> > +    case VIR_ARCH_LOONGARCH64:
> > +    case VIR_ARCH_RISCV64:
> > +        stripACPI = false;
> > +        break;
> > +
> > +    case VIR_ARCH_NONE:
> > +    case VIR_ARCH_LAST:
> > +    default:
> > +        break;
> > +    }
> 
> This looks unnecessarily verbose. I kinda get the idea, but
> considering that we're unlikely to go back and do anything about most
> of the architectures that we're currently leaving alone, does it
> really make sense to have all this code instead of just
> 
>   bool stripACPI = false;
> 
>   /* S390 never supported ACPI, but some users enabled it regardless in their
>    * configs for some reason.  In order to fix incoming migration we'll strip
>    * ACPI from those definitions, so that user configs are fixed by updating
>    * only the destination libvirt installation */
>   if (def->os.arch == VIR_ARCH_S390 || def->os.arch == VIR_ARCH_S390X)
>       stripACPI = true;
> 
> ? Maybe there's some motivation that I'm not seeing.


Well, I tend to prefer the switch statement because it forces you to
consider every code path when adding another member into the enum. In
this case though, you're right that we won't ever want to add anything
to the case when stripping is allowed.

On the other hand it clearly differentiates the legacy arches where
theoretically the same bug could be considered to be fixed and the
modern arches where that stuff must never be done.

Also note that yes this is more lines, but I'd consider the "verbosity"
to be at the same level, because you then don't have to consider what
other options are around when reading the code. And yes, all of that
could be replaced by a comment.

> 
> > +    /* To be sure, we only strip ACPI if given machine type doesn't support it
> > +     * or if the support state is unknown (which effectively means that it's
> > +     * unsupported, as we don't fixup x86(_64) and AARCH64) */
> > +    if (stripACPI &&
> > +        virQEMUCapsMachineSupportsACPI(qemuCaps, def->virtType, def->os.machine) != VIR_TRISTATE_BOOL_YES)
> > +        def->features[VIR_DOMAIN_FEATURE_ACPI] = VIR_TRISTATE_SWITCH_ABSENT;
> 
> Clever safety measure.
> 
> 
> Regarding the disadvantage you mention in the commit message, how
> about we implement a middle-ground solution instead? If we only
> actually strip the feature when
> 
>   !(parseFlags & VIR_DOMAIN_DEF_PARSE_ABI_UPDATE)
> 
> then we still silently fix migration, but we also prevent new s390x
> domains with ACPI (supposedly) enabled from being defined, and
> generally limit the scope of the workaround as much as possible.

I've considered this, but in case when you migrate the persistent XML
along with the migration or use offline migration, you won't be able to
start the VM. Yes it will force you to fix the config but it feels wrong
to allow migration but then have a definition which no longer works.

If the s390 folks consider this important for their usage and want to
forgo the fact that we'll be stripping <acpi/> instead of them fixing
the configs in the first place, we should do that always to prevent
having weird intermediate states.

If we want to go the route of not allowing 'defining' a new config with
acpi enabled, we'll need another VIR_DOMAIN_DEF_PARSE_ flag asserted on
all migration related XML parsing where the modification will be
performed and on none of the normal code paths, but none of the existing
flags allow that granularity.
Re: [PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Andrea Bolognani 3 months ago
On Thu, Aug 01, 2024 at 12:03:52PM GMT, Peter Krempa wrote:
> On Thu, Aug 01, 2024 at 02:35:59 -0700, Andrea Bolognani wrote:
> > This looks unnecessarily verbose. I kinda get the idea, but
> > considering that we're unlikely to go back and do anything about most
> > of the architectures that we're currently leaving alone, does it
> > really make sense to have all this code instead of just
> >
> >   bool stripACPI = false;
> >
> >   /* S390 never supported ACPI, but some users enabled it regardless in their
> >    * configs for some reason.  In order to fix incoming migration we'll strip
> >    * ACPI from those definitions, so that user configs are fixed by updating
> >    * only the destination libvirt installation */
> >   if (def->os.arch == VIR_ARCH_S390 || def->os.arch == VIR_ARCH_S390X)
> >       stripACPI = true;
> >
> > ? Maybe there's some motivation that I'm not seeing.
>
>
> Well, I tend to prefer the switch statement because it forces you to
> consider every code path when adding another member into the enum. In
> this case though, you're right that we won't ever want to add anything
> to the case when stripping is allowed.
>
> On the other hand it clearly differentiates the legacy arches where
> theoretically the same bug could be considered to be fixed and the
> modern arches where that stuff must never be done.
>
> Also note that yes this is more lines, but I'd consider the "verbosity"
> to be at the same level, because you then don't have to consider what
> other options are around when reading the code. And yes, all of that
> could be replaced by a comment.

I generally prefer the switch approach as well, it just seems
overkill in this specific case. But it's perfectly valid, so feel
free to just ignore this bit of feedback.

> > Regarding the disadvantage you mention in the commit message, how
> > about we implement a middle-ground solution instead? If we only
> > actually strip the feature when
> >
> >   !(parseFlags & VIR_DOMAIN_DEF_PARSE_ABI_UPDATE)
> >
> > then we still silently fix migration, but we also prevent new s390x
> > domains with ACPI (supposedly) enabled from being defined, and
> > generally limit the scope of the workaround as much as possible.
>
> I've considered this, but in case when you migrate the persistent XML
> along with the migration or use offline migration, you won't be able to
> start the VM. Yes it will force you to fix the config but it feels wrong
> to allow migration but then have a definition which no longer works.
>
> If the s390 folks consider this important for their usage and want to
> forgo the fact that we'll be stripping <acpi/> instead of them fixing
> the configs in the first place, we should do that always to prevent
> having weird intermediate states.
>
> If we want to go the route of not allowing 'defining' a new config with
> acpi enabled, we'll need another VIR_DOMAIN_DEF_PARSE_ flag asserted on
> all migration related XML parsing where the modification will be
> performed and on none of the normal code paths, but none of the existing
> flags allow that granularity.

So offline migration sets ABI_UPDATE? I guess that makes sense to
some extent, as live migration obviously has the strictest possible
requirements, but still it feels weird that the same domain migrated
between the same two hosts could end up with slightly different ABI
depending on whether or not it's migrated while it's running.

I can tell you for certain that the presence of ABI_UPDATE is
(mis)interpreted as a witness for "this is a newly-defined domain" in
several places. I've introduced a few myself. Having a separate flag
that actually carries that meaning would probably be a very good
idea.

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Peter Krempa 3 months ago
On Thu, Aug 01, 2024 at 06:04:59 -0700, Andrea Bolognani wrote:
> On Thu, Aug 01, 2024 at 12:03:52PM GMT, Peter Krempa wrote:
> > On Thu, Aug 01, 2024 at 02:35:59 -0700, Andrea Bolognani wrote:
> > > This looks unnecessarily verbose. I kinda get the idea, but
> > > considering that we're unlikely to go back and do anything about most
> > > of the architectures that we're currently leaving alone, does it
> > > really make sense to have all this code instead of just
> > >
> > >   bool stripACPI = false;
> > >
> > >   /* S390 never supported ACPI, but some users enabled it regardless in their
> > >    * configs for some reason.  In order to fix incoming migration we'll strip
> > >    * ACPI from those definitions, so that user configs are fixed by updating
> > >    * only the destination libvirt installation */
> > >   if (def->os.arch == VIR_ARCH_S390 || def->os.arch == VIR_ARCH_S390X)
> > >       stripACPI = true;
> > >
> > > ? Maybe there's some motivation that I'm not seeing.
> >
> >
> > Well, I tend to prefer the switch statement because it forces you to
> > consider every code path when adding another member into the enum. In
> > this case though, you're right that we won't ever want to add anything
> > to the case when stripping is allowed.
> >
> > On the other hand it clearly differentiates the legacy arches where
> > theoretically the same bug could be considered to be fixed and the
> > modern arches where that stuff must never be done.
> >
> > Also note that yes this is more lines, but I'd consider the "verbosity"
> > to be at the same level, because you then don't have to consider what
> > other options are around when reading the code. And yes, all of that
> > could be replaced by a comment.
> 
> I generally prefer the switch approach as well, it just seems
> overkill in this specific case. But it's perfectly valid, so feel
> free to just ignore this bit of feedback.
> 
> > > Regarding the disadvantage you mention in the commit message, how
> > > about we implement a middle-ground solution instead? If we only
> > > actually strip the feature when
> > >
> > >   !(parseFlags & VIR_DOMAIN_DEF_PARSE_ABI_UPDATE)
> > >
> > > then we still silently fix migration, but we also prevent new s390x
> > > domains with ACPI (supposedly) enabled from being defined, and
> > > generally limit the scope of the workaround as much as possible.
> >
> > I've considered this, but in case when you migrate the persistent XML
> > along with the migration or use offline migration, you won't be able to
> > start the VM. Yes it will force you to fix the config but it feels wrong
> > to allow migration but then have a definition which no longer works.
> >
> > If the s390 folks consider this important for their usage and want to
> > forgo the fact that we'll be stripping <acpi/> instead of them fixing
> > the configs in the first place, we should do that always to prevent
> > having weird intermediate states.
> >
> > If we want to go the route of not allowing 'defining' a new config with
> > acpi enabled, we'll need another VIR_DOMAIN_DEF_PARSE_ flag asserted on
> > all migration related XML parsing where the modification will be
> > performed and on none of the normal code paths, but none of the existing
> > flags allow that granularity.
> 
> So offline migration sets ABI_UPDATE? I guess that makes sense to
> some extent, as live migration obviously has the strictest possible
> requirements, but still it feels weird that the same domain migrated
> between the same two hosts could end up with slightly different ABI
> depending on whether or not it's migrated while it's running.

Well, I thought it did but it doesn't seem to be the case:

src/qemu/qemu_driver.c=static virDomainPtr qemuDomainCreateXML(virConnectPtr conn,
src/qemu/qemu_driver.c:                               VIR_DOMAIN_DEF_PARSE_ABI_UPDATE;
src/qemu/qemu_driver.c=static char *qemuConnectDomainXMLToNative(virConnectPtr conn,
src/qemu/qemu_driver.c:                                            VIR_DOMAIN_DEF_PARSE_ABI_UPDATE)))
src/qemu/qemu_driver.c=qemuDomainDefineXMLFlags(virConnectPtr conn,
src/qemu/qemu_driver.c:                               VIR_DOMAIN_DEF_PARSE_ABI_UPDATE;
src/qemu/qemu_driver.c=qemuDomainAttachDeviceLiveAndConfig(virDomainObj *vm,
src/qemu/qemu_driver.c:                               VIR_DOMAIN_DEF_PARSE_ABI_UPDATE;

Which are the only relevant code paths asserting it.
> 
> I can tell you for certain that the presence of ABI_UPDATE is
> (mis)interpreted as a witness for "this is a newly-defined domain" in
> several places. I've introduced a few myself. Having a separate flag
> that actually carries that meaning would probably be a very good
> idea.

Well, I thought this flag would be asserted in more code paths, as of
the 4 above, 1 is irrelevant and 3 are safe I guess we can do this thing
as well.
Re: [PATCH 1/2] qemu_domain: Strip <acpi/> from s390(x) definitions
Posted by Boris Fiuczynski 3 months ago
Reviewed-by: Boris Fiuczynski <fiuczy@linux.ibm.com>

On 7/31/24 1:02 PM, Peter Krempa wrote:
> The s390(x) machines never supported ACPI. That didn't stop users
> enabling ACPI in their config. As of libvirt-9.2 (98c4e3d073) with new
> enough qemu we reject configs which require ACPI, but qemu can't satisfy
> it.
> 
> This breaks migration of existing VMs with the old wrong configs to new
> libvirt installations.
> 
> To address this introduce a post-parse fixup removing the ACPI flag
> specifically for s390 machines which do enable it in the definition.
> 
> The advantage of doing it in post-parse, rather than simply relaxing the
> ABI stability check to allow users providing an fixed XML when migrating
> (allowing change of the ACPI flag for s390 in ABI stability check, as it
>   doesn't impact ABI), is that only the destination installation needs to
> be patched in order to preserve migration.
> 
> The disadvantage is that users will not be notified that they are using
> a broken configuration as it will be silently fixed. This is also the
> reason why only specifically s390(x) machines are being fixed up despite
> the problem also impacting other legacy architectures not supporting
> ACPI as well.
> 
> Resolves: https://issues.redhat.com/browse/RHEL-49516
> Signed-off-by: Peter Krempa <pkrempa@redhat.com>
> ---
>   src/qemu/qemu_domain.c | 94 ++++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 94 insertions(+)
> 
> diff --git a/src/qemu/qemu_domain.c b/src/qemu/qemu_domain.c
> index 298f4bfb9e..4f77817215 100644
> --- a/src/qemu/qemu_domain.c
> +++ b/src/qemu/qemu_domain.c
> @@ -5020,6 +5020,98 @@ qemuDomainDefPostParseBasic(virDomainDef *def,
>   }
> 
> 
> +/**
> + * qemuDomainDefACPIPostParse:
> + * @def: domain definition
> + * @qemuCaps: qemu capabilities object
> + *
> + * Fixup the use of ACPI flag on certain architectures that never supported it
> + * and users for some reason used it, which would break migration to newer
> + * libvirt versions which check whether given machine type supports ACPI.
> + */
> +static void
> +qemuDomainDefACPIPostParse(virDomainDef *def,
> +                           virQEMUCaps *qemuCaps)
> +{
> +    bool stripACPI = false;
> +
> +    /* Only cases when ACPI is enabled need to be fixed up */
> +    if (def->features[VIR_DOMAIN_FEATURE_ACPI] != VIR_TRISTATE_SWITCH_ON)
> +        return;
> +
> +    switch (def->os.arch) {
> +    /* x86(_64) and AARCH64 always supported ACPI */
> +    case VIR_ARCH_I686:
> +    case VIR_ARCH_X86_64:
> +    case VIR_ARCH_AARCH64:
> +        stripACPI = false;
> +        break;
> +
> +    /* S390 never supported ACPI, but some users enabled it regardless in their
> +     * configs for some reason.  In order to fix incoming migration we'll strip
> +     * ACPI from those definitions, so that user configs are fixed by updating
> +     * only the destination libvirt installation */
> +    case VIR_ARCH_S390:
> +    case VIR_ARCH_S390X:
> +        stripACPI = true;
> +        break;
> +
> +    /* Any other historic architectures could undergo the above treatment but
> +     * since there were no user reports of invalid usage we avoid stripping ACPI */
> +    case VIR_ARCH_ARMV6L:
> +    case VIR_ARCH_ARMV7L:
> +    case VIR_ARCH_ARMV7B:
> +    case VIR_ARCH_PPC64:
> +    case VIR_ARCH_PPC64LE:
> +    case VIR_ARCH_ALPHA:
> +    case VIR_ARCH_PPC:
> +    case VIR_ARCH_PPCEMB:
> +    case VIR_ARCH_SH4:
> +    case VIR_ARCH_SH4EB:
> +    case VIR_ARCH_RISCV32:
> +    case VIR_ARCH_SPARC64:
> +    case VIR_ARCH_MIPS:
> +    case VIR_ARCH_MIPSEL:
> +    case VIR_ARCH_MIPS64:
> +    case VIR_ARCH_MIPS64EL:
> +    case VIR_ARCH_CRIS:
> +    case VIR_ARCH_ITANIUM:
> +    case VIR_ARCH_LM32:
> +    case VIR_ARCH_M68K:
> +    case VIR_ARCH_MICROBLAZE:
> +    case VIR_ARCH_MICROBLAZEEL:
> +    case VIR_ARCH_OR32:
> +    case VIR_ARCH_PARISC:
> +    case VIR_ARCH_PARISC64:
> +    case VIR_ARCH_PPCLE:
> +    case VIR_ARCH_SPARC:
> +    case VIR_ARCH_UNICORE32:
> +    case VIR_ARCH_XTENSA:
> +    case VIR_ARCH_XTENSAEB:
> +        stripACPI = false;
> +        break;
> +
> +    /* Any modern architecture must obey what qemu reports */
> +    case VIR_ARCH_LOONGARCH64:
> +    case VIR_ARCH_RISCV64:
> +        stripACPI = false;
> +        break;
> +
> +    case VIR_ARCH_NONE:
> +    case VIR_ARCH_LAST:
> +    default:
> +        break;
> +    }
> +
> +    /* To be sure, we only strip ACPI if given machine type doesn't support it
> +     * or if the support state is unknown (which effectively means that it's
> +     * unsupported, as we don't fixup x86(_64) and AARCH64) */
> +    if (stripACPI &&
> +        virQEMUCapsMachineSupportsACPI(qemuCaps, def->virtType, def->os.machine) != VIR_TRISTATE_BOOL_YES)
> +        def->features[VIR_DOMAIN_FEATURE_ACPI] = VIR_TRISTATE_SWITCH_ABSENT;
> +}
> +
> +
>   static int
>   qemuDomainDefPostParse(virDomainDef *def,
>                          unsigned int parseFlags,
> @@ -5040,6 +5132,8 @@ qemuDomainDefPostParse(virDomainDef *def,
>       if (qemuDomainDefMachinePostParse(def, qemuCaps) < 0)
>           return -1;
> 
> +    qemuDomainDefACPIPostParse(def, qemuCaps);
> +
>       if (qemuDomainDefBootPostParse(def, driver, parseFlags) < 0)
>           return -1;
> 

-- 
Mit freundlichen Grüßen/Kind regards
    Boris Fiuczynski

IBM Deutschland Research & Development GmbH
Vorsitzender des Aufsichtsrats: Wolfgang Wendt
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294