[PATCH] NVRAM: check NVRAM file size and recover from template

Hao Wang posted 1 patch 3 years, 8 months ago
Test syntax-check failed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/dcc1e41c-f32b-9560-c18b-657ec6c04821@huawei.com
src/qemu/qemu_process.c | 54 ++++++++++++++++++++++++++++++++++++++++-
1 file changed, 53 insertions(+), 1 deletion(-)
[PATCH] NVRAM: check NVRAM file size and recover from template
Posted by Hao Wang 3 years, 8 months ago
From: Hao Wang <wanghao232@huawei.com>
Subject: [PATCH] NVRAM: check NVRAM file size and recover from template

A corrupted nvram file (e.g. caused by last unsuccessful creation due to
insufficient memory) can lead to boot or migration failure.
Check the size of the existed nvram file when qemuPrepareNVRAM, and re-create
if the existed one is unhealthy.

Signed-off-by: Hao Wang <wanghao232@huawei.com>
---
 src/qemu/qemu_process.c | 54 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
index 126fabf5ef..42060bb36c 100644
--- a/src/qemu/qemu_process.c
+++ b/src/qemu/qemu_process.c
@@ -4376,6 +4376,48 @@ qemuProcessUpdateCPU(virQEMUDriverPtr driver,
 }


+static bool
+qemuIsNvramFileHealthy(virQEMUDriverConfigPtr cfg,
+                       virDomainLoaderDefPtr loader)
+{
+    const char *masterNvramPath;
+    off_t nvramSize;
+    off_t masterSize;
+
+    masterNvramPath = loader->templt;
+    if (!loader->templt) {
+        size_t i;
+        for (i = 0; i < cfg->nfirmwares; i++) {
+            if (STREQ(cfg->firmwares[i]->name, loader->path)) {
+                masterNvramPath = cfg->firmwares[i]->nvram;
+                break;
+            }
+        }
+    }
+
+    if (!masterNvramPath) {
+        VIR_WARN("no nvram template is found; assume the nvram file is healthy");
+        return true;
+    }
+
+    if ((nvramSize = virFileLength(loader->nvram, -1)) < 0 ||
+        (masterSize = virFileLength(masterNvramPath, -1)) < 0) {
+        virReportSystemError(errno,
+                             _("unable to get the size of '%s' or '%s'"),
+                             loader->nvram, masterNvramPath);
+        return false;
+    }
+
+    if (nvramSize != masterSize) {
+        VIR_WARN("the size(%zd) of the nvram file is not equal to that of the template %s",
+                 nvramSize, masterNvramPath);
+        return false;
+    }
+
+    return true;
+}
+
+
 static int
 qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
                  virDomainObjPtr vm)
@@ -4388,9 +4430,19 @@ qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
     const char *master_nvram_path;
     ssize_t r;

-    if (!loader || !loader->nvram || virFileExists(loader->nvram))
+    if (!loader || !loader->nvram)
         return 0;

+    if (virFileExists(loader->nvram)) {
+        if (qemuIsNvramFileHealthy(cfg, loader))
+            return 0;
+
+        ignore_value(virFileRemove(loader->nvram, -1, -1));
+        VIR_WARN("the nvram file %s exists but may be corrupted! "
+                 "Remove it and try to copy a new one from template.",
+                 loader->nvram);
+    }
+
     master_nvram_path = loader->templt;
     if (!loader->templt) {
         size_t i;
--
2.23.0

Re: [PATCH] NVRAM: check NVRAM file size and recover from template
Posted by Daniel Henrique Barboza 3 years, 7 months ago

On 8/8/20 7:01 AM, Hao Wang wrote:
> From: Hao Wang <wanghao232@huawei.com>
> Subject: [PATCH] NVRAM: check NVRAM file size and recover from template
> 
> A corrupted nvram file (e.g. caused by last unsuccessful creation due to
> insufficient memory) can lead to boot or migration failure.
> Check the size of the existed nvram file when qemuPrepareNVRAM, and re-create
> if the existed one is unhealthy.
> 
> Signed-off-by: Hao Wang <wanghao232@huawei.com>
> ---
>   src/qemu/qemu_process.c | 54 ++++++++++++++++++++++++++++++++++++++++-
>   1 file changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 126fabf5ef..42060bb36c 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -4376,6 +4376,48 @@ qemuProcessUpdateCPU(virQEMUDriverPtr driver,
>   }
> 
> 
> +static bool
> +qemuIsNvramFileHealthy(virQEMUDriverConfigPtr cfg,
> +                       virDomainLoaderDefPtr loader)
> +{
> +    const char *masterNvramPath;
> +    off_t nvramSize;
> +    off_t masterSize;
> +
> +    masterNvramPath = loader->templt;
> +    if (!loader->templt) {
> +        size_t i;
> +        for (i = 0; i < cfg->nfirmwares; i++) {
> +            if (STREQ(cfg->firmwares[i]->name, loader->path)) {
> +                masterNvramPath = cfg->firmwares[i]->nvram;
> +                break;
> +            }
> +        }
> +    }
> +
> +    if (!masterNvramPath) {
> +        VIR_WARN("no nvram template is found; assume the nvram file is healthy");
> +        return true;
> +    }

The issue I'm seeing here is that this code is duplicated in the body
of qemuPrepareNVRAM .....

> +
> +    if ((nvramSize = virFileLength(loader->nvram, -1)) < 0 ||
> +        (masterSize = virFileLength(masterNvramPath, -1)) < 0) {
> +        virReportSystemError(errno,
> +                             _("unable to get the size of '%s' or '%s'"),
> +                             loader->nvram, masterNvramPath);
> +        return false;
> +    }
> +
> +    if (nvramSize != masterSize) {
> +        VIR_WARN("the size(%zd) of the nvram file is not equal to that of the template %s",
> +                 nvramSize, masterNvramPath);
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +
>   static int
>   qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
>                    virDomainObjPtr vm)
> @@ -4388,9 +4430,19 @@ qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
>       const char *master_nvram_path;
>       ssize_t r;
> 
> -    if (!loader || !loader->nvram || virFileExists(loader->nvram))
> +    if (!loader || !loader->nvram)
>           return 0;
> 
> +    if (virFileExists(loader->nvram)) {
> +        if (qemuIsNvramFileHealthy(cfg, loader))
> +            return 0;
> +
> +        ignore_value(virFileRemove(loader->nvram, -1, -1));
> +        VIR_WARN("the nvram file %s exists but may be corrupted! "
> +                 "Remove it and try to copy a new one from template.",
> +                 loader->nvram);
> +    }
> +
>       master_nvram_path = loader->templt;
>       if (!loader->templt) {
>           size_t i;

^ right here. And not only that, the code is being duplicated and doing
two different things. The code you're adding in qemuIsNvramFileHealthy()
is checking for !master_nvram_path, throwing a VIR_WARN() and returning
'true'. The code in the body of qemuPrepareNVRAM is aborting and returning
-1 in the same condition:

     master_nvram_path = loader->templt;
     if (!loader->templt) {
         size_t i;
         for (i = 0; i < cfg->nfirmwares; i++) {
             if (STREQ(cfg->firmwares[i]->name, loader->path)) {
                 master_nvram_path = cfg->firmwares[i]->nvram;
                 break;
             }
         }
     }

     if (!master_nvram_path) {
         virReportError(VIR_ERR_OPERATION_FAILED,
                        _("unable to find any master var store for "
                          "loader: %s"), loader->path);
         goto cleanup;
     }

What will end up happening then, if master_nvram_path is NULL, is that
your code will give a VIR_WARN claiming it is fine, then the rest of
qemuProcessNVRAM will error out claiming this is not fine.

To preserve the already existing behavior and avoid repetition, my suggestion
here is to move this code block as is to qemuIsNvramFileHealthy(), same
error message and returning 'false'. The remaining of qemuNVRAMPrepare()
depends upon master_nvram_path not being NULL, so you're either embed this
assumption in your new function or you change the body of qemuNVRAMPrepare().
I believe the former is what we would want here.


Thanks,


DHB


> --
> 2.23.0
> 

Re: [PATCH] NVRAM: check NVRAM file size and recover from template
Posted by Daniel P. Berrangé 3 years, 7 months ago
On Sat, Aug 08, 2020 at 06:01:22PM +0800, Hao Wang wrote:
> From: Hao Wang <wanghao232@huawei.com>
> Subject: [PATCH] NVRAM: check NVRAM file size and recover from template
> 
> A corrupted nvram file (e.g. caused by last unsuccessful creation due to
> insufficient memory) can lead to boot or migration failure.

Or it can be caused by incorrect deployment of new NVRAM file by an
administrator or OS Distro...

Just last week we were debugging an issue on Ubuntu where a person
had deployed incorrect NVRAM files of a different size.

> Check the size of the existed nvram file when qemuPrepareNVRAM, and re-create
> if the existed one is unhealthy.

....this will delete valid NVRAM files, hiding the mistake by the
administrator.

Overall I'm not a fan of performing a destructive action like deleting
files which may or may not be correct.


> 
> Signed-off-by: Hao Wang <wanghao232@huawei.com>
> ---
>  src/qemu/qemu_process.c | 54 ++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 53 insertions(+), 1 deletion(-)
> 
> diff --git a/src/qemu/qemu_process.c b/src/qemu/qemu_process.c
> index 126fabf5ef..42060bb36c 100644
> --- a/src/qemu/qemu_process.c
> +++ b/src/qemu/qemu_process.c
> @@ -4376,6 +4376,48 @@ qemuProcessUpdateCPU(virQEMUDriverPtr driver,
>  }
> 
> 
> +static bool
> +qemuIsNvramFileHealthy(virQEMUDriverConfigPtr cfg,
> +                       virDomainLoaderDefPtr loader)
> +{
> +    const char *masterNvramPath;
> +    off_t nvramSize;
> +    off_t masterSize;
> +
> +    masterNvramPath = loader->templt;
> +    if (!loader->templt) {
> +        size_t i;
> +        for (i = 0; i < cfg->nfirmwares; i++) {
> +            if (STREQ(cfg->firmwares[i]->name, loader->path)) {
> +                masterNvramPath = cfg->firmwares[i]->nvram;
> +                break;
> +            }
> +        }
> +    }
> +
> +    if (!masterNvramPath) {
> +        VIR_WARN("no nvram template is found; assume the nvram file is healthy");
> +        return true;
> +    }
> +
> +    if ((nvramSize = virFileLength(loader->nvram, -1)) < 0 ||
> +        (masterSize = virFileLength(masterNvramPath, -1)) < 0) {
> +        virReportSystemError(errno,
> +                             _("unable to get the size of '%s' or '%s'"),
> +                             loader->nvram, masterNvramPath);
> +        return false;
> +    }
> +
> +    if (nvramSize != masterSize) {
> +        VIR_WARN("the size(%zd) of the nvram file is not equal to that of the template %s",
> +                 nvramSize, masterNvramPath);
> +        return false;
> +    }
> +
> +    return true;
> +}
> +
> +
>  static int
>  qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
>                   virDomainObjPtr vm)
> @@ -4388,9 +4430,19 @@ qemuPrepareNVRAM(virQEMUDriverConfigPtr cfg,
>      const char *master_nvram_path;
>      ssize_t r;
> 
> -    if (!loader || !loader->nvram || virFileExists(loader->nvram))
> +    if (!loader || !loader->nvram)
>          return 0;
> 
> +    if (virFileExists(loader->nvram)) {
> +        if (qemuIsNvramFileHealthy(cfg, loader))
> +            return 0;
> +
> +        ignore_value(virFileRemove(loader->nvram, -1, -1));
> +        VIR_WARN("the nvram file %s exists but may be corrupted! "
> +                 "Remove it and try to copy a new one from template.",
> +                 loader->nvram);
> +    }
> +
>      master_nvram_path = loader->templt;
>      if (!loader->templt) {
>          size_t i;
> --
> 2.23.0
> 

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH] NVRAM: check NVRAM file size and recover from template
Posted by Michal Privoznik 3 years, 7 months ago
On 9/1/20 10:46 AM, Daniel P. Berrangé wrote:
> On Sat, Aug 08, 2020 at 06:01:22PM +0800, Hao Wang wrote:
>> From: Hao Wang <wanghao232@huawei.com>
>> Subject: [PATCH] NVRAM: check NVRAM file size and recover from template
>>
>> A corrupted nvram file (e.g. caused by last unsuccessful creation due to
>> insufficient memory) can lead to boot or migration failure.
> 
> Or it can be caused by incorrect deployment of new NVRAM file by an
> administrator or OS Distro...
> 
> Just last week we were debugging an issue on Ubuntu where a person
> had deployed incorrect NVRAM files of a different size.
> 
>> Check the size of the existed nvram file when qemuPrepareNVRAM, and re-create
>> if the existed one is unhealthy.
> 
> ....this will delete valid NVRAM files, hiding the mistake by the
> administrator.
> 
> Overall I'm not a fan of performing a destructive action like deleting
> files which may or may not be correct.
> 

Not to mention that the size is only one of indicators. The NVRAM has an 
internal structure and if it gets corrupted (e.g. guest crashes while 
writing the file), we have now way of detecting it (and I know this is 
not the scenario you are trying to fix).

BTW: This all could have been avoided if you'd use firmware 
autoselection: <os firmware='efi'/>. Is there a problem with it that 
made you go with the old way of specifying NVRAM?

Michal