[PATCH 06/10] qemu: Introduce shared_filesystems configuration option

Andrea Bolognani posted 10 patches 1 year, 10 months ago
There is a newer version of this series
[PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
As explained in the comment, this can help in scenarios where
a shared filesystem can't be detected as such by libvirt, by
giving the admin the opportunity to provide this information
manually.

Signed-off-by: Andrea Bolognani <abologna@redhat.com>
---
 src/qemu/libvirtd_qemu.aug         |  3 +++
 src/qemu/qemu.conf.in              | 17 +++++++++++++++++
 src/qemu/qemu_conf.c               | 17 +++++++++++++++++
 src/qemu/qemu_conf.h               |  2 ++
 src/qemu/test_libvirtd_qemu.aug.in |  5 +++++
 5 files changed, 44 insertions(+)

diff --git a/src/qemu/libvirtd_qemu.aug b/src/qemu/libvirtd_qemu.aug
index 2b6526538f..1377fd89cc 100644
--- a/src/qemu/libvirtd_qemu.aug
+++ b/src/qemu/libvirtd_qemu.aug
@@ -143,6 +143,8 @@ module Libvirtd_qemu =
 
    let storage_entry = bool_entry "storage_use_nbdkit"
 
+   let filesystem_entry = str_array_entry "shared_filesystems"
+
    (* Entries that used to exist in the config which are now
     * deleted. We keep on parsing them so we don't break
     * ability to parse old configs after upgrade
@@ -173,6 +175,7 @@ module Libvirtd_qemu =
              | swtpm_entry
              | capability_filters_entry
              | storage_entry
+             | filesystem_entry
              | obsolete_entry
 
    let comment = [ label "#comment" . del /#[ \t]*/ "# " .  store /([^ \t\n][^\n]*)?/ . del /\n/ "\n" ]
diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in
index f406df8749..db42448239 100644
--- a/src/qemu/qemu.conf.in
+++ b/src/qemu/qemu.conf.in
@@ -986,3 +986,20 @@
 # note that the default might change in future releases.
 #
 #storage_use_nbdkit = @USE_NBDKIT_DEFAULT@
+
+# libvirt will normally prevent migration if the storage backing the VM is not
+# on a shared filesystem. Sometimes, however, the storage *is* shared despite
+# not being detected as such: for example, this is the case when one of the
+# hosts involved in the migration is exporting its local storage to the other
+# one via NFS.
+#
+# Any directory listed here will be assumed to live on a shared filesystem,
+# making migration possible in scenarios such as the one described above.
+#
+# If you need this feature, you probably want to set remember_owner=0 too.
+#
+#shared_filesystems = [
+#  "/var/lib/libvirt/images",
+#  "/var/lib/libvirt/qemu/nvram",
+#  "/var/lib/libvirt/swtpm"
+#]
diff --git a/src/qemu/qemu_conf.c b/src/qemu/qemu_conf.c
index 4050a82341..01c6bcc793 100644
--- a/src/qemu/qemu_conf.c
+++ b/src/qemu/qemu_conf.c
@@ -374,6 +374,8 @@ static void virQEMUDriverConfigDispose(void *obj)
 
     g_strfreev(cfg->capabilityfilters);
 
+    g_strfreev(cfg->sharedFilesystems);
+
     g_free(cfg->deprecationBehavior);
 }
 
@@ -1084,6 +1086,18 @@ virQEMUDriverConfigLoadStorageEntry(virQEMUDriverConfig *cfg,
 }
 
 
+static int
+virQEMUDriverConfigLoadFilesystemEntry(virQEMUDriverConfig *cfg,
+                                       virConf *conf)
+{
+    if (virConfGetValueStringList(conf, "shared_filesystems", false,
+                                  &cfg->sharedFilesystems) < 0)
+        return -1;
+
+    return 0;
+}
+
+
 int virQEMUDriverConfigLoadFile(virQEMUDriverConfig *cfg,
                                 const char *filename,
                                 bool privileged)
@@ -1158,6 +1172,9 @@ int virQEMUDriverConfigLoadFile(virQEMUDriverConfig *cfg,
     if (virQEMUDriverConfigLoadStorageEntry(cfg, conf) < 0)
         return -1;
 
+    if (virQEMUDriverConfigLoadFilesystemEntry(cfg, conf) < 0)
+        return -1;
+
     return 0;
 }
 
diff --git a/src/qemu/qemu_conf.h b/src/qemu/qemu_conf.h
index 36049b4bfa..b53d56be02 100644
--- a/src/qemu/qemu_conf.h
+++ b/src/qemu/qemu_conf.h
@@ -233,6 +233,8 @@ struct _virQEMUDriverConfig {
     bool storageUseNbdkit;
 
     virQEMUSchedCore schedCore;
+
+    char **sharedFilesystems;
 };
 
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(virQEMUDriverConfig, virObjectUnref);
diff --git a/src/qemu/test_libvirtd_qemu.aug.in b/src/qemu/test_libvirtd_qemu.aug.in
index b97e6de11e..f0a7a2a30e 100644
--- a/src/qemu/test_libvirtd_qemu.aug.in
+++ b/src/qemu/test_libvirtd_qemu.aug.in
@@ -119,3 +119,8 @@ module Test_libvirtd_qemu =
 { "deprecation_behavior" = "none" }
 { "sched_core" = "none" }
 { "storage_use_nbdkit" = "@USE_NBDKIT_DEFAULT@" }
+{ "shared_filesystems"
+    { "1" = "/var/lib/libvirt/images" }
+    { "2" = "/var/lib/libvirt/qemu/nvram" }
+    { "3" = "/var/lib/libvirt/swtpm" }
+}
-- 
2.44.0
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Peter Krempa 1 year, 10 months ago
On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> As explained in the comment, this can help in scenarios where
> a shared filesystem can't be detected as such by libvirt, by
> giving the admin the opportunity to provide this information
> manually.
> 
> Signed-off-by: Andrea Bolognani <abologna@redhat.com>
> ---
>  src/qemu/libvirtd_qemu.aug         |  3 +++
>  src/qemu/qemu.conf.in              | 17 +++++++++++++++++
>  src/qemu/qemu_conf.c               | 17 +++++++++++++++++
>  src/qemu/qemu_conf.h               |  2 ++
>  src/qemu/test_libvirtd_qemu.aug.in |  5 +++++
>  5 files changed, 44 insertions(+)

[...]

> diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in
> index f406df8749..db42448239 100644
> --- a/src/qemu/qemu.conf.in
> +++ b/src/qemu/qemu.conf.in
> @@ -986,3 +986,20 @@
>  # note that the default might change in future releases.
>  #
>  #storage_use_nbdkit = @USE_NBDKIT_DEFAULT@
> +
> +# libvirt will normally prevent migration if the storage backing the VM is not
> +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> +# not being detected as such: for example, this is the case when one of the
> +# hosts involved in the migration is exporting its local storage to the other
> +# one via NFS.
> +#
> +# Any directory listed here will be assumed to live on a shared filesystem,
> +# making migration possible in scenarios such as the one described above.
> +#
> +# If you need this feature, you probably want to set remember_owner=0 too.

Could you please elaborate why you'd want to disable owner remembering?
With remote filesystems this works so I expect that if this makes
certain paths behave as shared filesystems, they should behave such
without any additional tweaks

> +#shared_filesystems = [
> +#  "/var/lib/libvirt/images",
> +#  "/var/lib/libvirt/qemu/nvram",
> +#  "/var/lib/libvirt/swtpm"
> +#]

Do we want to give real paths as examples? Users might think that they
are the 'suggested' values, while it really depends on how they've
configured it. I suggest using something clearly dummy here.

The rest looks reasonable.
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
> On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > +# libvirt will normally prevent migration if the storage backing the VM is not
> > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > +# not being detected as such: for example, this is the case when one of the
> > +# hosts involved in the migration is exporting its local storage to the other
> > +# one via NFS.
> > +#
> > +# Any directory listed here will be assumed to live on a shared filesystem,
> > +# making migration possible in scenarios such as the one described above.
> > +#
> > +# If you need this feature, you probably want to set remember_owner=0 too.
>
> Could you please elaborate why you'd want to disable owner remembering?
> With remote filesystems this works so I expect that if this makes
> certain paths behave as shared filesystems, they should behave such
> without any additional tweaks

To be quite honest I don't remember exactly why I've added that, but
I can confirm that if remember_owner=0 is not used on the destination
host then migration will stall for a bit and then fail with

  error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
  metadata change: Resource temporarily unavailable

Things work fine if swtpm is not involved. I'm going to dig deeper,
but my guess is that, just like the situation addressed by the last
patch, having an additional process complicates things compared to
when we need to worry about QEMU only.

> > +#shared_filesystems = [
> > +#  "/var/lib/libvirt/images",
> > +#  "/var/lib/libvirt/qemu/nvram",
> > +#  "/var/lib/libvirt/swtpm"
> > +#]
>
> Do we want to give real paths as examples? Users might think that they
> are the 'suggested' values, while it really depends on how they've
> configured it. I sugest using something clearly dummy here.

Hopefully people will read the comment, but I can change those to
something like

  shared_filesystems = [
    "/path/to/images",
    "/path/to/nvram",
    "/path/to/swtpm"
  ]

if you think it's less likely to cause confusion.

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
On Wed, Mar 20, 2024 at 08:43:24AM -0700, Andrea Bolognani wrote:
> On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
> > On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > > +# libvirt will normally prevent migration if the storage backing the VM is not
> > > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > > +# not being detected as such: for example, this is the case when one of the
> > > +# hosts involved in the migration is exporting its local storage to the other
> > > +# one via NFS.
> > > +#
> > > +# Any directory listed here will be assumed to live on a shared filesystem,
> > > +# making migration possible in scenarios such as the one described above.
> > > +#
> > > +# If you need this feature, you probably want to set remember_owner=0 too.
> >
> > Could you please elaborate why you'd want to disable owner remembering?
> > With remote filesystems this works so I expect that if this makes
> > certain paths behave as shared filesystems, they should behave such
> > without any additional tweaks
>
> To be quite honest I don't remember exactly why I've added that, but
> I can confirm that if remember_owner=0 is not used on the destination
> host then migration will stall for a bit and then fail with
>
>   error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
>   metadata change: Resource temporarily unavailable
>
> Things work fine if swtpm is not involved. I'm going to dig deeper,
> but my guess is that, just like the situation addressed by the last
> patch, having an additional process complicates things compared to
> when we need to worry about QEMU only.

I've managed to track this down, and I wasn't far off.

The issue is that, when remember_owner is enabled, we perform a bunch
of additional locking on files around the actual relabel operation;
specifically, we call virSecurityManagerTransactionCommit() with the
last argument set to true.

This doesn't seem to cause any issues in general, *except* when it
comes to swtpm's storage lock. The swtpm process holds this lock
while it's running, and only releases it once migration is triggered.
So, when we're about to start the target swtpm process and want to
prepare the environment by setting up labels, we try to acquire the
storage lock, and can't proceed because the source swtpm process is
still holding on to it.

The hacky patch below makes migration work even when remember_owner
is enabled. Obviously I'd rewrite it so that we'd only skip locking
for incoming migration, but I wonder if there could be nasty side
effects to this...

Other ideas? Can we perhaps change things so that swtpm releases the
lock earlier upon our request or something like that?


diff --git a/src/security/security_manager.c b/src/security/security_manager.c
index fc2747c45e..2aa06eaec2 100644
--- a/src/security/security_manager.c
+++ b/src/security/security_manager.c
@@ -1356,6 +1356,9 @@ virSecurityManagerMetadataLock(virSecurityManager *mgr G_GNUC_UNUSED,
             continue;
         }

+        if (g_str_has_suffix(p, "/.lock"))
+            continue;
+
         if ((fd = open(p, O_RDWR)) < 0) {
             if (errno == EROFS) {
                 /* There is nothing we can do for RO filesystem. */
-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 9 months ago
On Tue, Mar 26, 2024 at 08:54:03AM -0700, Andrea Bolognani wrote:
> On Wed, Mar 20, 2024 at 08:43:24AM -0700, Andrea Bolognani wrote:
> > On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
> > > On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > > > +# libvirt will normally prevent migration if the storage backing the VM is not
> > > > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > > > +# not being detected as such: for example, this is the case when one of the
> > > > +# hosts involved in the migration is exporting its local storage to the other
> > > > +# one via NFS.
> > > > +#
> > > > +# Any directory listed here will be assumed to live on a shared filesystem,
> > > > +# making migration possible in scenarios such as the one described above.
> > > > +#
> > > > +# If you need this feature, you probably want to set remember_owner=0 too.
> > >
> > > Could you please elaborate why you'd want to disable owner remembering?
> > > With remote filesystems this works so I expect that if this makes
> > > certain paths behave as shared filesystems, they should behave such
> > > without any additional tweaks
> >
> > To be quite honest I don't remember exactly why I've added that, but
> > I can confirm that if remember_owner=0 is not used on the destination
> > host then migration will stall for a bit and then fail with
> >
> >   error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
> >   metadata change: Resource temporarily unavailable
> >
> > Things work fine if swtpm is not involved. I'm going to dig deeper,
> > but my guess is that, just like the situation addressed by the last
> > patch, having an additional process complicates things compared to
> > when we need to worry about QEMU only.
>
> I've managed to track this down, and I wasn't far off.
>
> The issue is that, when remember_owner is enabled, we perform a bunch
> of additional locking on files around the actual relabel operation;
> specifically, we call virSecurityManagerTransactionCommit() with the
> last argument set to true.
>
> This doesn't seem to cause any issues in general, *except* when it
> comes to swtpm's storage lock.

I take this back. Further testing has confirmed that there are other
scenarios in which this is problematic, for example NVRAM files.

Even with the TPM locking issue out of the way, we still have to deal
with an underlying issue: label remembering is implemented by setting
custom XATTRs on the files involved, and NFS doesn't support XATTRs.

So what will happen when remember_owner=1 and the VM gets started
on the host that has local access to the filesystem is that these
XATTRs will get set and not cleared up on migration; the VM will
still make it safely to the other host.

Then, when you attempt to migrate it back, libvirt will report

  Setting different SELinux label on /var/lib/libvirt/qemu/nvram/test_VARS.fd
  which is already in use

and abort the operation. This is caused by

  # getfattr -dm- /var/lib/libvirt/qemu/nvram/test_VARS.fd
  security.selinux="system_u:object_r:svirt_image_t:s0:c704,c774"
  trusted.libvirt.security.dac="+0:+0"
  trusted.libvirt.security.ref_dac="1"
  trusted.libvirt.security.ref_selinux="1"
  trusted.libvirt.security.selinux="system_u:object_r:svirt_image_t:s0"
  trusted.libvirt.security.timestamp_dac="1713347549"
  trusted.libvirt.security.timestamp_selinux="1713347549"

In particular, the non-zero reference counts are what convinces
libvirt to bail.
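
For the record, getting rid of the leftover entries by hand is
possible; a rough sketch, with the attribute names copied from the
getfattr output above and error handling mostly omitted:

#include <sys/xattr.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    const char *names[] = {
        "trusted.libvirt.security.dac",
        "trusted.libvirt.security.ref_dac",
        "trusted.libvirt.security.timestamp_dac",
        "trusted.libvirt.security.selinux",
        "trusted.libvirt.security.ref_selinux",
        "trusted.libvirt.security.timestamp_selinux",
    };
    size_t i;

    if (argc < 2) {
        fprintf(stderr, "usage: %s PATH\n", argv[0]);
        return 1;
    }

    /* drop every libvirt ownership-remembering attribute from the file */
    for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
        removexattr(argv[1], names[i]);

    return 0;
}

but of course that only papers over the underlying problem.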

I don't think we can handle this nicely without opening other cans of
worms. The principle of "don't touch anything on the source once the
VM is running on the destination" is a good one to follow, since
ignoring it might result in the destination VM suddenly being
prevented from performing I/O. And in case the migration fails, which
as you both pointed out earlier is a possibility that we need to take
into account, retaining the existing labels instead of clearing them
is actually a good thing, as it allows resuming execution on the
source host without running into permission issues.

In other words, I've come to the conclusion that remember_owner=0 is
the least bad option here.

In the v2 that I've just posted, I've updated the comment to stress
further the fact that this option comes with caveats and is not to be
used lightly.

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Stefan Berger 1 year, 10 months ago

On 3/26/24 11:54, Andrea Bolognani wrote:
> On Wed, Mar 20, 2024 at 08:43:24AM -0700, Andrea Bolognani wrote:
>> On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
>>> On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
>>>> +# libvirt will normally prevent migration if the storage backing the VM is not
>>>> +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
>>>> +# not being detected as such: for example, this is the case when one of the
>>>> +# hosts involved in the migration is exporting its local storage to the other
>>>> +# one via NFS.
>>>> +#
>>>> +# Any directory listed here will be assumed to live on a shared filesystem,
>>>> +# making migration possible in scenarios such as the one described above.
>>>> +#
>>>> +# If you need this feature, you probably want to set remember_owner=0 too.
>>>
>>> Could you please elaborate why you'd want to disable owner remembering?
>>> With remote filesystems this works so I expect that if this makes
>>> certain paths behave as shared filesystems, they should behave such
>>> without any additional tweaks
>>
>> To be quite honest I don't remember exactly why I've added that, but
>> I can confirm that if remember_owner=0 is not used on the destination
>> host then migration will stall for a bit and then fail with
>>
>>    error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
>>    metadata change: Resource temporarily unavailable
>>
>> Things work fine if swtpm is not involved. I'm going to dig deeper,
>> but my guess is that, just like the situation addressed by the last
>> patch, having an additional process complicates things compared to
>> when we need to worry about QEMU only.
> 
> I've managed to track this down, and I wasn't far off.
> 
> The issue is that, when remember_owner is enabled, we perform a bunch
> of additional locking on files around the actual relabel operation;
> specifically, we call virSecurityManagerTransactionCommit() with the
> last argument set to true.
> 
> This doesn't seem to cause any issues in general, *except* when it
> comes to swtpm's storage lock. The swtpm process holds this lock
> while it's running, and only releases it once migration is triggered.
> So, when we're about to start the target swtpm process and want to
> prepare the environment by setting up labels, we try to acquire the
> storage lock, and can't proceed because the source swtpm process is
> still holding on to it.

Who is 'we try to acquire the storage lock'? Is libvirt trying to 
acquire swtpm's storage lock? I would assume that only an instance of 
swtpm would acquire the lock.

> 
> The hacky patch below makes migration work even when remember_owner
> is enabled. Obviously I'd rewrite it so that we'd only skip locking
> for incoming migration, but I wonder if there could be nasty side
> effects to this...
> 
> Other ideas? Can we perhaps change things so that swtpm releases the
> lock earlier upon our request or something like that?

Maybe the function below needs to be passed an array of exceptions, one
being swtpm's lock file. I mean the lock file serves the purpose of
locking via the filesystem, so I don't think a third party should start
interfering here and influencing the protocol ...

> 
> 
> diff --git a/src/security/security_manager.c b/src/security/security_manager.c
> index fc2747c45e..2aa06eaec2 100644
> --- a/src/security/security_manager.c
> +++ b/src/security/security_manager.c
> @@ -1356,6 +1356,9 @@
> virSecurityManagerMetadataLock(virSecurityManager *mgr G_GNUC_UNUSED,
>               continue;
>           }
> 
> +        if (g_str_has_suffix(p, "/.lock"))
> +            continue;
> +
>           if ((fd = open(p, O_RDWR)) < 0) {
>               if (errno == EROFS) {
>                   /* There is nothing we can do for RO filesystem. */
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
On Tue, Mar 26, 2024 at 12:04:21PM -0400, Stefan Berger wrote:
>
>
> On 3/26/24 11:54, Andrea Bolognani wrote:
> > On Wed, Mar 20, 2024 at 08:43:24AM -0700, Andrea Bolognani wrote:
> > > On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
> > > > On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > > > > +# libvirt will normally prevent migration if the storage backing the VM is not
> > > > > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > > > > +# not being detected as such: for example, this is the case when one of the
> > > > > +# hosts involved in the migration is exporting its local storage to the other
> > > > > +# one via NFS.
> > > > > +#
> > > > > +# Any directory listed here will be assumed to live on a shared filesystem,
> > > > > +# making migration possible in scenarios such as the one described above.
> > > > > +#
> > > > > +# If you need this feature, you probably want to set remember_owner=0 too.
> > > >
> > > > Could you please elaborate why you'd want to disable owner remembering?
> > > > With remote filesystems this works so I expect that if this makes
> > > > certain paths behave as shared filesystems, they should behave such
> > > > without any additional tweaks
> > >
> > > To be quite honest I don't remember exactly why I've added that, but
> > > I can confirm that if remember_owner=0 is not used on the destination
> > > host then migration will stall for a bit and then fail with
> > >
> > >    error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
> > >    metadata change: Resource temporarily unavailable
> > >
> > > Things work fine if swtpm is not involved. I'm going to dig deeper,
> > > but my guess is that, just like the situation addressed by the last
> > > patch, having an additional process complicates things compared to
> > > when we need to worry about QEMU only.
> >
> > I've managed to track this down, and I wasn't far off.
> >
> > The issue is that, when remember_owner is enabled, we perform a bunch
> > of additional locking on files around the actual relabel operation;
> > specifically, we call virSecurityManagerTransactionCommit() with the
> > last argument set to true.
> >
> > This doesn't seem to cause any issues in general, *except* when it
> > comes to swtpm's storage lock. The swtpm process holds this lock
> > while it's running, and only releases it once migration is triggered.
> > So, when we're about to start the target swtpm process and want to
> > prepare the environment by setting up labels, we try to acquire the
> > storage lock, and can't proceed because the source swtpm process is
> > still holding on to it.
>
> Who is 'we try to acquire the storage lock'? Is libvirt trying to acquire
> swtpm's storage lock? I would assume that only an instance of swtpm would
> acquire the lock.

Yes, it's libvirt doing that. The lock is only held for a brief
period while labels are being applied, and released immediately
afterwards. swtpm is only launched once that's done, so generally
there's no conflict.

In the migration case, things have worked so far because labeling in
general has been skipped for shared filesystems, which means that
locking was not performed either. This series changes things so that
labeling is necessary.

> > The hacky patch below makes migration work even when remember_owner
> > is enabled. Obviously I'd rewrite it so that we'd only skip locking
> > for incoming migration, but I wonder if there could be nasty side
> > effects to this...
> >
> > Other ideas? Can we perhaps change things so that swtpm releases the
> > lock earlier upon our request or something like that?
>
> Maybe below functiokn needs to be passed an array of exceptions, one being
> swtpm's lock file. I mean the lock file serves the purpose of locking via
> filesystem, so I don't think a third party should start interfering here and
> influencing the protocol ...

I guess the idea behind the locking is to prevent unrelated processes
(and possibly other libvirt threads?) from stepping on each other's
toes. Some exceptions are carved out already, so it's not like
there's no precedent for what we're suggesting. I'm just concerned
about accidentally opening the door for fun race conditions or CVEs.

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Stefan Berger 1 year, 10 months ago

On 3/26/24 12:38, Andrea Bolognani wrote:
> On Tue, Mar 26, 2024 at 12:04:21PM -0400, Stefan Berger wrote:
>>
>>
>> On 3/26/24 11:54, Andrea Bolognani wrote:
>>> On Wed, Mar 20, 2024 at 08:43:24AM -0700, Andrea Bolognani wrote:
>>>> On Wed, Mar 20, 2024 at 12:37:37PM +0100, Peter Krempa wrote:
>>>>> On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
>>>>>> +# libvirt will normally prevent migration if the storage backing the VM is not
>>>>>> +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
>>>>>> +# not being detected as such: for example, this is the case when one of the
>>>>>> +# hosts involved in the migration is exporting its local storage to the other
>>>>>> +# one via NFS.
>>>>>> +#
>>>>>> +# Any directory listed here will be assumed to live on a shared filesystem,
>>>>>> +# making migration possible in scenarios such as the one described above.
>>>>>> +#
>>>>>> +# If you need this feature, you probably want to set remember_owner=0 too.
>>>>>
>>>>> Could you please elaborate why you'd want to disable owner remembering?
>>>>> With remote filesystems this works so I expect that if this makes
>>>>> certain paths behave as shared filesystems, they should behave such
>>>>> without any additional tweaks
>>>>
>>>> To be quite honest I don't remember exactly why I've added that, but
>>>> I can confirm that if remember_owner=0 is not used on the destination
>>>> host then migration will stall for a bit and then fail with
>>>>
>>>>     error: unable to lock /var/lib/libvirt/swtpm/.../tpm2/.lock for
>>>>     metadata change: Resource temporarily unavailable
>>>>
>>>> Things work fine if swtpm is not involved. I'm going to dig deeper,
>>>> but my guess is that, just like the situation addressed by the last
>>>> patch, having an additional process complicates things compared to
>>>> when we need to worry about QEMU only.
>>>
>>> I've managed to track this down, and I wasn't far off.
>>>
>>> The issue is that, when remember_owner is enabled, we perform a bunch
>>> of additional locking on files around the actual relabel operation;
>>> specifically, we call virSecurityManagerTransactionCommit() with the
>>> last argument set to true.
>>>
>>> This doesn't seem to cause any issues in general, *except* when it
>>> comes to swtpm's storage lock. The swtpm process holds this lock
>>> while it's running, and only releases it once migration is triggered.
>>> So, when we're about to start the target swtpm process and want to
>>> prepare the environment by setting up labels, we try to acquire the
>>> storage lock, and can't proceed because the source swtpm process is
>>> still holding on to it.
>>
>> Who is 'we try to acquire the storage lock'? Is libvirt trying to acquire
>> swtpm's storage lock? I would assume that only an instance of swtpm would
>> acquire the lock.
> 
> Yes, it's libvirt doing that. The lock is only held for a brief
> period while labels are being applied, and released immediately
> afterwards. swtpm is only launched once that's done, so generally
> there's no conflict.

Yes, I saw the code now. It kind of prevents lock files from being used.

> 
> In the migration case, things have worked so far because labeling in
> general has been skipped for shared filesystem, which means that
> locking was not performed either. This series changes things so that
> labeling is necessary.

Thanks for the recap.
Does libvirt actually get involved in case of a migration failure and
fall back to the source host, so that it could relabel all files again
before QEMU and swtpm (and possibly other virtual devices) open their
files again?

> 
>>> The hacky patch below makes migration work even when remember_owner
>>> is enabled. Obviously I'd rewrite it so that we'd only skip locking
>>> for incoming migration, but I wonder if there could be nasty side
>>> effects to this...
>>>
>>> Other ideas? Can we perhaps change things so that swtpm releases the
>>> lock earlier upon our request or something like that?
>>
>> Maybe below functiokn needs to be passed an array of exceptions, one being
>> swtpm's lock file. I mean the lock file serves the purpose of locking via
>> filesystem, so I don't think a third party should start interfering here and
>> influencing the protocol ...
> 
> I guess the idea behind the locking is to prevent unrelated processes
> (and possibly other libvirt threads?) from stepping on each other's
> toes. Some exceptions are carved out already, so it's not like
> there's no precedent for what we're suggesting. I'm just concerned
> about accidentally opening the door for fun race conditions or CVEs.
> 
Hm, maybe exceptions for filenames not to lock could be configured in the
config file instead of being hard-coded, with concrete names already given
in the sample config file so that users don't need to find them out.

Some ideas about exceptions for files not to lock:
- there is a list of file exceptions
- check whether a file is locked by a process with a given name in an
exception list (e.g., swtpm in /proc/<flock->l_pid>/comm ); see the
sketch below.
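
For the second idea, roughly something like this (untested sketch; the
standalone wrapper and the hard-coded "swtpm" name are only for
illustration, and it can only work when the lock holder is a local
process):

#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Return true if the process that would block a write lock on fd
 * calls itself "swtpm" according to /proc/<pid>/comm. */
static bool
lock_holder_is_swtpm(int fd)
{
    struct flock fl = {
        .l_type = F_WRLCK,
        .l_whence = SEEK_SET,
        .l_start = 0,
        .l_len = 0,
    };
    char path[64];
    char comm[32] = "";
    FILE *fp;

    /* F_GETLK reports the pid of a conflicting lock holder, if any */
    if (fcntl(fd, F_GETLK, &fl) < 0 || fl.l_type == F_UNLCK)
        return false;

    snprintf(path, sizeof(path), "/proc/%lld/comm", (long long) fl.l_pid);
    if (!(fp = fopen(path, "r")))
        return false;
    if (!fgets(comm, sizeof(comm), fp)) {
        fclose(fp);
        return false;
    }
    fclose(fp);
    comm[strcspn(comm, "\n")] = '\0';

    return strcmp(comm, "swtpm") == 0;
}

int main(int argc, char **argv)
{
    int fd;

    if (argc < 2) {
        fprintf(stderr, "usage: %s PATH\n", argv[0]);
        return 1;
    }
    if ((fd = open(argv[1], O_RDWR)) < 0)
        return 1;

    printf("%s\n", lock_holder_is_swtpm(fd) ? "held by swtpm"
                                            : "not held by swtpm");
    return 0;
}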
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
On Tue, Mar 26, 2024 at 01:15:41PM -0400, Stefan Berger wrote:
> On 3/26/24 12:38, Andrea Bolognani wrote:
> > On Tue, Mar 26, 2024 at 12:04:21PM -0400, Stefan Berger wrote:
> > > On 3/26/24 11:54, Andrea Bolognani wrote:
> > > > The issue is that, when remember_owner is enabled, we perform a bunch
> > > > of additional locking on files around the actual relabel operation;
> > > > specifically, we call virSecurityManagerTransactionCommit() with the
> > > > last argument set to true.
> > > >
> > > > This doesn't seem to cause any issues in general, *except* when it
> > > > comes to swtpm's storage lock. The swtpm process holds this lock
> > > > while it's running, and only releases it once migration is triggered.
> > > > So, when we're about to start the target swtpm process and want to
> > > > prepare the environment by setting up labels, we try to acquire the
> > > > storage lock, and can't proceed because the source swtpm process is
> > > > still holding on to it.
> > >
> > > Who is 'we try to acquire the storage lock'? Is libvirt trying to acquire
> > > swtpm's storage lock? I would assume that only an instance of swtpm would
> > > acquire the lock.
> >
> > Yes, it's libvirt doing that. The lock is only held for a brief
> > period while labels are being applied, and released immediately
> > afterwards. swtpm is only launched once that's done, so generally
> > there's no conflict.
>
> Yes, I saw the code now. It kind of prevents lock files from being used.
>
> > In the migration case, things have worked so far because labeling in
> > general has been skipped for shared filesystem, which means that
> > locking was not performed either. This series changes things so that
> > labeling is necessary.
>
> Thanks for the re-cap.
> Does libvirt actually get involved in case of a migration failure and
> fallback to the source host so that it could again relabel all files before
> QEMU and swtpm (and possibly other virtual devices) again open their files?

Determining this is next on my to-do list :)

As I've explained in an earlier message, I believe things will be
fine on account of at least one of the hosts involved accessing the
storage via NFS, which doesn't support SELinux and is thus unaffected
by changes to labeling. But I still need to actually confirm that.

> > > > The hacky patch below makes migration work even when remember_owner
> > > > is enabled. Obviously I'd rewrite it so that we'd only skip locking
> > > > for incoming migration, but I wonder if there could be nasty side
> > > > effects to this...
> > > >
> > > > Other ideas? Can we perhaps change things so that swtpm releases the
> > > > lock earlier upon our request or something like that?
> > >
> > > Maybe below functiokn needs to be passed an array of exceptions, one being
> > > swtpm's lock file. I mean the lock file serves the purpose of locking via
> > > filesystem, so I don't think a third party should start interfering here and
> > > influencing the protocol ...
> >
> > I guess the idea behind the locking is to prevent unrelated processes
> > (and possibly other libvirt threads?) from stepping on each other's
> > toes. Some exceptions are carved out already, so it's not like
> > there's no precedent for what we're suggesting. I'm just concerned
> > about accidentally opening the door for fun race conditions or CVEs.
>
> Hm, maybe exceptions of filenames not to lock could be configured in the
> config file instead of hard coded but with concrete names already given in
> the sample config file so that users don't need to find out.
>
> Some ideas about exceptions for files not to lock:
> - there is a list of exception of files
> - check whether a file is locked by a process with a given name in an
> exception list (e.g., swtpm in /proc/<flock->l_pid>/comm ).

Ideally users wouldn't need to configure any of that, and libvirt can
instead figure things out automatically.

I had the "check whether the process currently holding the lock is
swtpm, ignore the failure if so" thought as well, perhaps that could
be the way to go.

Or keep it simple and just don't attempt to lock the swtpm files if
we know we're preparing for migration, if we can determine that it's
safe to do so.
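
Something along those lines, that is (very rough sketch; the helper
name and the incomingMigration flag are made up and would have to be
wired through properly):

#include <glib.h>

/* Decide whether to skip taking the metadata lock on a path: leave
 * swtpm's storage lock alone, but only while preparing for an
 * incoming migration, when the source swtpm may still be holding it. */
static gboolean
qemuSecuritySkipMetadataLock(const char *path,
                             gboolean incomingMigration)
{
    return incomingMigration && g_str_has_suffix(path, "/.lock");
}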

-- 
Andrea Bolognani / Red Hat / Virtualization
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Peter Krempa 1 year, 10 months ago
On Wed, Mar 20, 2024 at 12:37:37 +0100, Peter Krempa wrote:
> On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > As explained in the comment, this can help in scenarios where
> > a shared filesystem can't be detected as such by libvirt, by
> > giving the admin the opportunity to provide this information
> > manually.
> > 
> > Signed-off-by: Andrea Bolognani <abologna@redhat.com>
> > ---
> >  src/qemu/libvirtd_qemu.aug         |  3 +++
> >  src/qemu/qemu.conf.in              | 17 +++++++++++++++++
> >  src/qemu/qemu_conf.c               | 17 +++++++++++++++++
> >  src/qemu/qemu_conf.h               |  2 ++
> >  src/qemu/test_libvirtd_qemu.aug.in |  5 +++++
> >  5 files changed, 44 insertions(+)
> 
> [...]
> 
> > diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in
> > index f406df8749..db42448239 100644
> > --- a/src/qemu/qemu.conf.in
> > +++ b/src/qemu/qemu.conf.in
> > @@ -986,3 +986,20 @@
> >  # note that the default might change in future releases.
> >  #
> >  #storage_use_nbdkit = @USE_NBDKIT_DEFAULT@
> > +
> > +# libvirt will normally prevent migration if the storage backing the VM is not
> > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > +# not being detected as such: for example, this is the case when one of the
> > +# hosts involved in the migration is exporting its local storage to the other
> > +# one via NFS.

TBH this feels a bit weird. Having the NFS server on the host you are
migrating from will e.g. cause storage to start having network latency
which it didn't have before.

Additionally you can't even take the host down for maintenance as it's
still serving storage for the VM.

How is this expected to be used? I don't quite understand why anybody
would want to do this in contrast to e.g. also migrating the storage
fully to the destination.
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Daniel P. Berrangé 1 year, 9 months ago
On Wed, Mar 20, 2024 at 01:22:15PM +0100, Peter Krempa wrote:
> On Wed, Mar 20, 2024 at 12:37:37 +0100, Peter Krempa wrote:
> > On Wed, Mar 20, 2024 at 10:19:11 +0100, Andrea Bolognani wrote:
> > > As explained in the comment, this can help in scenarios where
> > > a shared filesystem can't be detected as such by libvirt, by
> > > giving the admin the opportunity to provide this information
> > > manually.
> > > 
> > > Signed-off-by: Andrea Bolognani <abologna@redhat.com>
> > > ---
> > >  src/qemu/libvirtd_qemu.aug         |  3 +++
> > >  src/qemu/qemu.conf.in              | 17 +++++++++++++++++
> > >  src/qemu/qemu_conf.c               | 17 +++++++++++++++++
> > >  src/qemu/qemu_conf.h               |  2 ++
> > >  src/qemu/test_libvirtd_qemu.aug.in |  5 +++++
> > >  5 files changed, 44 insertions(+)
> > 
> > [...]
> > 
> > > diff --git a/src/qemu/qemu.conf.in b/src/qemu/qemu.conf.in
> > > index f406df8749..db42448239 100644
> > > --- a/src/qemu/qemu.conf.in
> > > +++ b/src/qemu/qemu.conf.in
> > > @@ -986,3 +986,20 @@
> > >  # note that the default might change in future releases.
> > >  #
> > >  #storage_use_nbdkit = @USE_NBDKIT_DEFAULT@
> > > +
> > > +# libvirt will normally prevent migration if the storage backing the VM is not
> > > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > > +# not being detected as such: for example, this is the case when one of the
> > > +# hosts involved in the migration is exporting its local storage to the other
> > > +# one via NFS.
> 
> TBH this feels a bit weird. Having the NFS server on the host you are
> migrating from will e.g. cause storage to start have network latency
> which it didn't before.
> 
> Additionally you can't even take the host down for maintenance as it's
> still serving storage for the VM.

You can do a block copy / rebase after you have migrated in order to
then pull the storage over to the new host.

If the disk is much larger than RAM, this has the advantage that you can
get the live migration completed quickly, and then let the disk copy
run in the background for a long time.

Obviously not much use for an emergency evacuation of a failing host,
but reasonable for planned maint actions.
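
In API terms that would be roughly virDomainBlockCopy() to a local
file on the destination, followed by a pivot once the job is ready.
A rough sketch (domain name, disk target and paths are made up, and
waiting for the block job to become ready is left out):

#include <libvirt/libvirt.h>
#include <stdio.h>

int main(void)
{
    virConnectPtr conn;
    virDomainPtr dom;
    const char *destxml =
        "<disk type='file'>"
        "  <source file='/var/lib/libvirt/images/guest-local.qcow2'/>"
        "  <driver type='qcow2'/>"
        "</disk>";

    if (!(conn = virConnectOpen("qemu:///system")))
        return 1;
    if (!(dom = virDomainLookupByName(conn, "guest")))
        return 1;

    /* start mirroring the disk contents to local storage */
    if (virDomainBlockCopy(dom, "vda", destxml, NULL, 0, 0) < 0)
        return 1;

    /* ... wait for the block job to reach the ready state ... */

    /* switch the guest over to the local copy */
    if (virDomainBlockJobAbort(dom, "vda",
                               VIR_DOMAIN_BLOCK_JOB_ABORT_PIVOT) < 0)
        return 1;

    virDomainFree(dom);
    virConnectClose(conn);
    return 0;
}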

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
Re: [PATCH 06/10] qemu: Introduce shared_filesystems configuration option
Posted by Andrea Bolognani 1 year, 10 months ago
On Wed, Mar 20, 2024 at 01:22:15PM +0100, Peter Krempa wrote:
> > > +# libvirt will normally prevent migration if the storage backing the VM is not
> > > +# on a shared filesystems. Sometimes, however, the storage *is* shared despite
> > > +# not being detected as such: for example, this is the case when one of the
> > > +# hosts involved in the migration is exporting its local storage to the other
> > > +# one via NFS.
>
> TBH this feels a bit weird. Having the NFS server on the host you are
> migrating from will e.g. cause storage to start have network latency
> which it didn't before.
>
> Additionally you can't even take the host down for maintenance as it's
> still serving storage for the VM.
>
> How is this expected to be used? I don't seem to understand why would
> anybody want to do this in contrast to e.g. migrating also the storage
> fully to the destination.

The main driver for this feature is KubeVirt, and more specifically
its integration with a storage provider called Portworx.

From what I understand, the way this provider works is that it will
initially configure the VM storage so that it's local to the machine
running the pod; when migration occurs, however, the same storage
will be exported to the destination machine via NFS.

Since libvirt currently blocks attempts to migrate from local storage,
in order to make things work they're tricking it into thinking it's
NFS instead by using an LD_PRELOAD hack[1].

Since that's a... Somewhat questionable approach, the idea here is to
provide a more targeted and supportable way to let libvirt know that
some host paths are shared, even though a shared filesystem is not
used locally.

In the more general case, there's really nothing preventing people
from creating a similar setup outside of KubeVirt with Portworx. As
you rightfully point out, there are some drawbacks to the approach
but it's not necessarily an invalid one per se.


[1] https://github.com/libopenstorage/stork/pull/1117/files#diff-9aadee401eff19b4f32f9a37a124b4e08a19ad2b915a44ff9f4ff43c1da50ca5
-- 
Andrea Bolognani / Red Hat / Virtualization