[PATCH] libvirt-guests.sh: fix log output with new systemd

Christian Ehrhardt posted 1 patch 5 years, 9 months ago
Test syntax-check failed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/libvirt tags/patchew/20200430095957.2940867-1-christian.ehrhardt@canonical.com
tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 32 deletions(-)
[PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
Newer systemd is too smart, it detects the PIDs of the gettext calls
and due to that a log lists libvirt-guests.sh with many different PIDs.

Furthermore it struggles to collect the output in time, so it can be
truncated and overall looks like:
   libvirt-guests.sh[37986]: Running guests on default URI:
   libvirt-guests.sh[37995]: Running guests on
   libvirt-guests.sh[37977]: R

Gather the gettext result into a local variable and printing
the value from libvirt-guests.sh itself fixes both issues.

Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708

Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
---
 tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 32 deletions(-)

diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
index 7af24dab3b..324abe3623 100644
--- a/tools/libvirt-guests.sh.in
+++ b/tools/libvirt-guests.sh.in
@@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
 # Make sure this file is recognized as having translations: _("dummy")
 . "@bindir@"/gettext.sh
 
+# Avoid output being listed under gettext PID and being truncated
+unbuffered_gettext () {
+    msg="$(gettext "$1")"
+    echo "$msg"
+}
+unbuffered_eval_gettext () {
+    msg="$(eval_gettext "$1")"
+    echo "$msg"
+}
+
 export TEXTDOMAIN="@PACKAGE@" TEXTDOMAINDIR="@localedir@"
 
 URIS=default
@@ -91,7 +101,7 @@ test_connect()
     if run_virsh "$uri" connect 2>/dev/null; then
         return 0;
     else
-        eval_gettext "Can't connect to \$uri. Skipping."
+        unbuffered_eval_gettext "Can't connect to \$uri. Skipping."
         return 1
     fi
 }
@@ -160,7 +170,7 @@ start() {
     [ -f "$LISTFILE" ] || { started; return 0; }
 
     if [ "x$ON_BOOT" != xstart ]; then
-        gettext "libvirt-guests is configured not to start any guests on boot"
+        unbuffered_gettext "libvirt-guests is configured not to start any guests on boot"
         echo
         rm -f "$LISTFILE"
         started
@@ -184,19 +194,19 @@ start() {
         done
         set +f
         if ! "$configured"; then
-            eval_gettext "Ignoring guests on \$uri URI"; echo
+            unbuffered_eval_gettext "Ignoring guests on \$uri URI"; echo
             continue
         fi
 
         test_connect "$uri" || continue
 
-        eval_gettext "Resuming guests on \$uri URI..."; echo
+        unbuffered_eval_gettext "Resuming guests on \$uri URI..."; echo
         for guest in $list; do
             local name=$(guest_name "$uri" "$guest")
-            eval_gettext "Resuming guest \$name: "
+            unbuffered_eval_gettext "Resuming guest \$name: "
             if guest_is_on "$uri" "$guest"; then
                 if "$guest_running"; then
-                    gettext "already active"; echo
+                    unbuffered_gettext "already active"; echo
                 else
                     if "$isfirst"; then
                         isfirst=false
@@ -205,7 +215,7 @@ start() {
                     fi
                     retval run_virsh "$uri" start $bypass "$name" \
                         >/dev/null && \
-                    gettext "done"; echo
+                    unbuffered_gettext "done"; echo
                     if "$sync_time"; then
                         run_virsh "$uri" domtime --sync "$name" >/dev/null
                     fi
@@ -226,7 +236,7 @@ suspend_guest()
     local uri=$1
     local guest=$2
     local name=$(guest_name "$uri" "$guest")
-    local label=$(eval_gettext "Suspending \$name: ")
+    local label=$(unbuffered_eval_gettext "Suspending \$name: ")
     local bypass=
     local slept=0
 
@@ -249,7 +259,7 @@ suspend_guest()
             fi
         fi
     done
-    retval wait "$virsh_pid" && printf '%s%s\n' "$label" "$(gettext "done")"
+    retval wait "$virsh_pid" && printf '%s%s\n' "$label" "$(unbuffered_gettext "done")"
 }
 
 # shutdown_guest URI GUEST
@@ -265,16 +275,16 @@ shutdown_guest()
     local format=
     local slept=
 
-    eval_gettext "Starting shutdown on guest: \$name"
+    unbuffered_eval_gettext "Starting shutdown on guest: \$name"
     echo
     retval run_virsh "$uri" shutdown "$guest" >/dev/null || return
 
     if [ $timeout -gt 0 ]; then
         check_timeout=true
-        format=$(eval_gettext "Waiting for guest %s to shut down, %d seconds left\n")
+        format=$(unbuffered_eval_gettext "Waiting for guest %s to shut down, %d seconds left\n")
     else
         slept=0
-        format=$(eval_gettext "Waiting for guest %s to shut down\n")
+        format=$(unbuffered_eval_gettext "Waiting for guest %s to shut down\n")
     fi
     while ! $check_timeout || [ "$timeout" -gt 0 ]; do
         sleep 1
@@ -296,9 +306,9 @@ shutdown_guest()
 
     if guest_is_on "$uri" "$guest"; then
         if "$guest_running"; then
-            eval_gettext "Shutdown of guest \$name failed to complete in time."
+            unbuffered_eval_gettext "Shutdown of guest \$name failed to complete in time."
         else
-            eval_gettext "Shutdown of guest \$name complete."
+            unbuffered_eval_gettext "Shutdown of guest \$name complete."
         fi
         echo
     fi
@@ -313,7 +323,7 @@ shutdown_guest_async()
     local guest=$2
     local name=$(guest_name "$uri" "$guest")
 
-    eval_gettext "Starting shutdown on guest: \$name"
+    unbuffered_eval_gettext "Starting shutdown on guest: \$name"
     echo
     retval run_virsh "$uri" shutdown "$guest" > /dev/null
 }
@@ -339,7 +349,7 @@ check_guests_shutdown()
     guests_shutting_down=
     for guest in $guests_to_check; do
         if ! guest_is_on "$uri" "$guest" >/dev/null 2>&1; then
-            eval_gettext "Failed to determine state of guest: \$guest. Not tracking it anymore."
+            unbuffered_eval_gettext "Failed to determine state of guest: \$guest. Not tracking it anymore."
             echo
             continue
         fi
@@ -366,7 +376,7 @@ print_guests_shutdown()
 
         local name=$(guest_name "$uri" "$guest")
         if [ -n "$name" ]; then
-            eval_gettext "Shutdown of guest \$name complete."
+            unbuffered_eval_gettext "Shutdown of guest \$name complete."
             echo
         fi
     done
@@ -386,10 +396,10 @@ shutdown_guests_parallel()
 
     if [ $timeout -gt 0 ]; then
         check_timeout=true
-        format=$(eval_gettext "Waiting for %d guests to shut down, %d seconds left\n")
+        format=$(unbuffered_eval_gettext "Waiting for %d guests to shut down, %d seconds left\n")
     else
         slept=0
-        format=$(eval_gettext "Waiting for %d guests to shut down\n")
+        format=$(unbuffered_eval_gettext "Waiting for %d guests to shut down\n")
     fi
     while [ -n "$on_shutdown" ] || [ -n "$guests" ]; do
         while [ -n "$guests" ] &&
@@ -417,7 +427,7 @@ shutdown_guests_parallel()
             fi
             timeout=$(($timeout - 1))
             if [ $timeout -le 0 ]; then
-                eval_gettext "Timeout expired while shutting down domains"; echo
+                unbuffered_eval_gettext "Timeout expired while shutting down domains"; echo
                 RETVAL=1
                 return
             fi
@@ -447,7 +457,7 @@ stop() {
     if [ "/x$ON_SHUTDOWN" = xshutdown ]; then
         suspending=false
         if [ $SHUTDOWN_TIMEOUT -lt 0 ]; then
-            gettext "SHUTDOWN_TIMEOUT must be equal or greater than 0"
+            unbuffered_gettext "SHUTDOWN_TIMEOUT must be equal or greater than 0"
             echo
             RETVAL=6
             return
@@ -461,7 +471,7 @@ stop() {
 
         test_connect "$uri" || continue
 
-        eval_gettext "Running guests on \$uri URI: "
+        unbuffered_eval_gettext "Running guests on \$uri URI: "
 
         local list=$(list_guests "$uri")
         if [ $? -eq 0 ]; then
@@ -473,7 +483,7 @@ stop() {
             done
 
             if "$empty"; then
-                gettext "no running guests."
+                unbuffered_gettext "no running guests."
             fi
             echo
         fi
@@ -486,7 +496,7 @@ stop() {
 
                 for uuid in $transient; do
                     if "$empty"; then
-                        eval_gettext "Not suspending transient guests on URI: \$uri: "
+                        unbuffered_eval_gettext "Not suspending transient guests on URI: \$uri: "
                         empty=false
                     else
                         printf ", "
@@ -497,14 +507,14 @@ stop() {
                 # reload domain list to contain only persistent guests
                 list=$(list_guests "$uri" "--persistent")
                 if [ $? -ne 0 ]; then
-                    eval_gettext "Failed to list persistent guests on \$uri"
+                    unbuffered_eval_gettext "Failed to list persistent guests on \$uri"
                     echo
                     RETVAL=1
                     set +f
                     return
                 fi
             else
-                gettext "Failed to list transient guests"
+                unbuffered_gettext "Failed to list transient guests"
                 echo
                 RETVAL=1
                 set +f
@@ -521,9 +531,9 @@ stop() {
     if [ -s "$LISTFILE" ]; then
         while read uri list; do
             if "$suspending"; then
-                eval_gettext "Suspending guests on \$uri URI..."; echo
+                unbuffered_eval_gettext "Suspending guests on \$uri URI..."; echo
             else
-                eval_gettext "Shutting down guests on \$uri URI..."; echo
+                unbuffered_eval_gettext "Shutting down guests on \$uri URI..."; echo
             fi
 
             if [ "$PARALLEL_SHUTDOWN" -gt 1 ] &&
@@ -566,14 +576,14 @@ gueststatus() {
 # since there is no external daemon process matching this init script.
 rh_status() {
     if [ -f "$LISTFILE" ]; then
-        gettext "stopped, with saved guests"; echo
+        unbuffered_gettext "stopped, with saved guests"; echo
         RETVAL=3
     else
         if [ -f "$VAR_SUBSYS_LIBVIRT_GUESTS" ]; then
-            gettext "started"; echo
+            unbuffered_gettext "started"; echo
             RETVAL=0
         else
-            gettext "stopped, with no saved guests"; echo
+            unbuffered_gettext "stopped, with no saved guests"; echo
             RETVAL=3
         fi
     fi
@@ -583,7 +593,7 @@ rh_status() {
 # Display usage string, then exit with VAL (defaults to 2).
 usage() {
     local program_name=$0
-    eval_gettext "Usage: \$program_name {start|stop|status|restart|"\
+    unbuffered_eval_gettext "Usage: \$program_name {start|stop|status|restart|"\
 "condrestart|try-restart|reload|force-reload|gueststatus|shutdown}"; echo
     exit ${1-2}
 }
-- 
2.26.0


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Michal Privoznik 5 years, 9 months ago
On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> Newer systemd is too smart, it detects the PIDs of the gettext calls
> and due to that a log lists libvirt-guests.sh with many different PIDs.
> 
> Furthermore it struggles to collect the output in time, so it can be
> truncated and overall looks like:
>     libvirt-guests.sh[37986]: Running guests on default URI:
>     libvirt-guests.sh[37995]: Running guests on
>     libvirt-guests.sh[37977]: R
> 
> Gather the gettext result into a local variable and printing
> the value from libvirt-guests.sh itself fixes both issues.
> 
> Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> 
> Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> ---
>   tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
>   1 file changed, 42 insertions(+), 32 deletions(-)
> 
> diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> index 7af24dab3b..324abe3623 100644
> --- a/tools/libvirt-guests.sh.in
> +++ b/tools/libvirt-guests.sh.in
> @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
>   # Make sure this file is recognized as having translations: _("dummy")
>   . "@bindir@"/gettext.sh
>   
> +# Avoid output being listed under gettext PID and being truncated
> +unbuffered_gettext () {
> +    msg="$(gettext "$1")"
> +    echo "$msg"
> +}
> +unbuffered_eval_gettext () {
> +    msg="$(eval_gettext "$1")"
> +    echo "$msg"
> +}
> +

Ah, is this the case where gettext prints the text in many sequences 
whereas echo does it all at once? Or what do you think is happening 
here? I don't quite understand how buffering steps into this.

Michal

Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
>
> On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > and due to that a log lists libvirt-guests.sh with many different PIDs.
> >
> > Furthermore it struggles to collect the output in time, so it can be
> > truncated and overall looks like:
> >     libvirt-guests.sh[37986]: Running guests on default URI:
> >     libvirt-guests.sh[37995]: Running guests on
> >     libvirt-guests.sh[37977]: R
> >
> > Gather the gettext result into a local variable and printing
> > the value from libvirt-guests.sh itself fixes both issues.
> >
> > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> >
> > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > ---
> >   tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> >   1 file changed, 42 insertions(+), 32 deletions(-)
> >
> > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > index 7af24dab3b..324abe3623 100644
> > --- a/tools/libvirt-guests.sh.in
> > +++ b/tools/libvirt-guests.sh.in
> > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> >   # Make sure this file is recognized as having translations: _("dummy")
> >   . "@bindir@"/gettext.sh
> >
> > +# Avoid output being listed under gettext PID and being truncated
> > +unbuffered_gettext () {
> > +    msg="$(gettext "$1")"
> > +    echo "$msg"
> > +}
> > +unbuffered_eval_gettext () {
> > +    msg="$(eval_gettext "$1")"
> > +    echo "$msg"
> > +}
> > +
>
> Ah, is this the case where gettext prints the text in many sequences
> whereas echo does it all at once? Or what do you think is happening
> here? I don't quite understand how buffering steps into this.

You have two effects:
- systemd tries to detect and report which PID the output is from when
  something it printed. Therefore it appears to a user as if the script
  itself would have been invoked multiple times while actually is will be
  the PIDs of the different gettext calls
- I assume that to get this content it is hitchhiking the output streams
  somehow - the gettext processes are very very short lived and it seems
  when they are exiting some output of them might be lost.

Maybe calling it "buffering" was wrong, but the change above resolves
both issues, the odd PIDs and the truncation by printing from the actual script.

> Michal
>


-- 
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Daniel P. Berrangé 5 years, 9 months ago
On Thu, Apr 30, 2020 at 02:42:21PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
> >
> > On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > > and due to that a log lists libvirt-guests.sh with many different PIDs.
> > >
> > > Furthermore it struggles to collect the output in time, so it can be
> > > truncated and overall looks like:
> > >     libvirt-guests.sh[37986]: Running guests on default URI:
> > >     libvirt-guests.sh[37995]: Running guests on
> > >     libvirt-guests.sh[37977]: R
> > >
> > > Gather the gettext result into a local variable and printing
> > > the value from libvirt-guests.sh itself fixes both issues.
> > >
> > > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> > >
> > > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > > ---
> > >   tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> > >   1 file changed, 42 insertions(+), 32 deletions(-)
> > >
> > > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > > index 7af24dab3b..324abe3623 100644
> > > --- a/tools/libvirt-guests.sh.in
> > > +++ b/tools/libvirt-guests.sh.in
> > > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> > >   # Make sure this file is recognized as having translations: _("dummy")
> > >   . "@bindir@"/gettext.sh
> > >
> > > +# Avoid output being listed under gettext PID and being truncated
> > > +unbuffered_gettext () {
> > > +    msg="$(gettext "$1")"
> > > +    echo "$msg"
> > > +}
> > > +unbuffered_eval_gettext () {
> > > +    msg="$(eval_gettext "$1")"
> > > +    echo "$msg"
> > > +}
> > > +
> >
> > Ah, is this the case where gettext prints the text in many sequences
> > whereas echo does it all at once? Or what do you think is happening
> > here? I don't quite understand how buffering steps into this.
> 
> You have two effects:
> - systemd tries to detect and report which PID the output is from when
>   something it printed. Therefore it appears to a user as if the script
>   itself would have been invoked multiple times while actually is will be
>   the PIDs of the different gettext calls
> - I assume that to get this content it is hitchhiking the output streams
>   somehow - the gettext processes are very very short lived and it seems
>   when they are exiting some output of them might be lost.

This doesn't pass a sniff test to me.

systemd is reading stdout/stderr of the libvirt-guests script. How
would it ever tell that certain lines from stdout/err are coming from
different PIDs. Systemd isn't involved in spawning the gettext processes.
It can see they come & go via cgroups pids, but that's about it.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Apr 30, 2020 at 02:42:21PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
> > >
> > > On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > > > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > > > and due to that a log lists libvirt-guests.sh with many different PIDs.
> > > >
> > > > Furthermore it struggles to collect the output in time, so it can be
> > > > truncated and overall looks like:
> > > >     libvirt-guests.sh[37986]: Running guests on default URI:
> > > >     libvirt-guests.sh[37995]: Running guests on
> > > >     libvirt-guests.sh[37977]: R
> > > >
> > > > Gather the gettext result into a local variable and printing
> > > > the value from libvirt-guests.sh itself fixes both issues.
> > > >
> > > > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> > > >
> > > > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > > > ---
> > > >   tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> > > >   1 file changed, 42 insertions(+), 32 deletions(-)
> > > >
> > > > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > > > index 7af24dab3b..324abe3623 100644
> > > > --- a/tools/libvirt-guests.sh.in
> > > > +++ b/tools/libvirt-guests.sh.in
> > > > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> > > >   # Make sure this file is recognized as having translations: _("dummy")
> > > >   . "@bindir@"/gettext.sh
> > > >
> > > > +# Avoid output being listed under gettext PID and being truncated
> > > > +unbuffered_gettext () {
> > > > +    msg="$(gettext "$1")"
> > > > +    echo "$msg"
> > > > +}
> > > > +unbuffered_eval_gettext () {
> > > > +    msg="$(eval_gettext "$1")"
> > > > +    echo "$msg"
> > > > +}
> > > > +
> > >
> > > Ah, is this the case where gettext prints the text in many sequences
> > > whereas echo does it all at once? Or what do you think is happening
> > > here? I don't quite understand how buffering steps into this.
> >
> > You have two effects:
> > - systemd tries to detect and report which PID the output is from when
> >   something it printed. Therefore it appears to a user as if the script
> >   itself would have been invoked multiple times while actually is will be
> >   the PIDs of the different gettext calls
> > - I assume that to get this content it is hitchhiking the output streams
> >   somehow - the gettext processes are very very short lived and it seems
> >   when they are exiting some output of them might be lost.
>
> This doesn't pass a sniff test to me.
>
> systemd is reading stdout/stderr of the libvirt-guests script. How
> would it ever tell that certain lines from stdout/err are coming from
> different PIDs. Systemd isn't involved in spawning the gettext processes.
> It can see they come & go via cgroups pids, but that's about it.

Well it seems I have a reproducible symptom and a fix, but not the
explanation why the latter fixes the former.
I'll need to dive into some debug & analysis myself to explain it better.
I'll be back here once I got time to do that in depth check.

Until then whoever is affected (should be everyone) can give it a
thought as well.
Repro is as easy as
One console:
$ journalctl -f -u libvirt-guests
Other console:
$ systemctl stop libvirt-guests
$ systemctl start libvirt-guests

I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
237-3ubuntu10.39 (20.04).
Maybe it is a systemd bug after all?
I'd be interested to hear if that is/isn't clobbering output for
anyone else and what your systemd versions are?

> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>


-- 
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Daniel P. Berrangé 5 years, 9 months ago
On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> Well it seems I have a reproducible symptom and a fix, but not the
> explanation why the latter fixes the former.
> I'll need to dive into some debug & analysis myself to explain it better.
> I'll be back here once I got time to do that in depth check.
> 
> Until then whoever is affected (should be everyone) can give it a
> thought as well.
> Repro is as easy as
> One console:
> $ journalctl -f -u libvirt-guests
> Other console:
> $ systemctl stop libvirt-guests
> $ systemctl start libvirt-guests
> 
> I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> 237-3ubuntu10.39 (20.04).
> Maybe it is a systemd bug after all?
> I'd be interested to hear if that is/isn't clobbering output for
> anyone else and what your systemd versions are?

FWIW, it works correctly on Fedora 31 with systemd 243.

Apr 30 16:07:27 localhost.localdomain systemd[1]: Starting Suspend/Resume Running libvirt Guests...
Apr 30 16:07:27 localhost.localdomain systemd[1]: Started Suspend/Resume Running libvirt Guests.
Apr 30 16:08:33 localhost.localdomain systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Running guests on default URI: memtest, memtest1
Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending guests on default URI...
Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: ...
Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: done
Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: ...
Apr 30 16:08:35 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: done
Apr 30 16:08:35 localhost.localdomain systemd[1]: libvirt-guests.service: Succeeded.
Apr 30 16:08:35 localhost.localdomain systemd[1]: Stopped Suspend/Resume Running libvirt Guests.

I'll see if I can do a test with newer systemd too


Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > Well it seems I have a reproducible symptom and a fix, but not the
> > explanation why the latter fixes the former.
> > I'll need to dive into some debug & analysis myself to explain it better.
> > I'll be back here once I got time to do that in depth check.
> >
> > Until then whoever is affected (should be everyone) can give it a
> > thought as well.
> > Repro is as easy as
> > One console:
> > $ journalctl -f -u libvirt-guests
> > Other console:
> > $ systemctl stop libvirt-guests
> > $ systemctl start libvirt-guests
> >
> > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > 237-3ubuntu10.39 (20.04).
> > Maybe it is a systemd bug after all?
> > I'd be interested to hear if that is/isn't clobbering output for
> > anyone else and what your systemd versions are?
>
> FWIW, it works correctly on Fedora 31 with systemd 243.

Eoan with 242-7ubuntu3.7 is good as well.
I might need to try to get some interim versions from somewhere.

>
> Apr 30 16:07:27 localhost.localdomain systemd[1]: Starting Suspend/Resume Running libvirt Guests...
> Apr 30 16:07:27 localhost.localdomain systemd[1]: Started Suspend/Resume Running libvirt Guests.
> Apr 30 16:08:33 localhost.localdomain systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
> Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Running guests on default URI: memtest, memtest1
> Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending guests on default URI...
> Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: ...
> Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: done
> Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: ...
> Apr 30 16:08:35 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: done
> Apr 30 16:08:35 localhost.localdomain systemd[1]: libvirt-guests.service: Succeeded.
> Apr 30 16:08:35 localhost.localdomain systemd[1]: Stopped Suspend/Resume Running libvirt Guests.
>
> I'll see if I can do a test with newer systemd too
>
>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>


-- 
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Daniel P. Berrangé 5 years, 9 months ago
On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > Well it seems I have a reproducible symptom and a fix, but not the
> > > explanation why the latter fixes the former.
> > > I'll need to dive into some debug & analysis myself to explain it better.
> > > I'll be back here once I got time to do that in depth check.
> > >
> > > Until then whoever is affected (should be everyone) can give it a
> > > thought as well.
> > > Repro is as easy as
> > > One console:
> > > $ journalctl -f -u libvirt-guests
> > > Other console:
> > > $ systemctl stop libvirt-guests
> > > $ systemctl start libvirt-guests
> > >
> > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > 237-3ubuntu10.39 (20.04).
> > > Maybe it is a systemd bug after all?
> > > I'd be interested to hear if that is/isn't clobbering output for
> > > anyone else and what your systemd versions are?
> >
> > FWIW, it works correctly on Fedora 31 with systemd 243.
> 
> Eoan with 242-7ubuntu3.7 is good as well.
> I might need to try to get some interim versions from somewhere.

I've reproduced on Fedora 33 rawhide with systemd 245 - the first
place where it lists running guests is screwed up slightly:

Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI:
Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2
Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI...
Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ...
Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done
Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ...
Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done
Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded.
Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests.

On resume it is even worse

Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests...
Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI...
Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R
Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R
Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1:
Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum
Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e
Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2:
Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum
Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests.


It is possible it isn't systemd related - could be other packages that
are co-incidentally affecting it.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > explanation why the latter fixes the former.
> > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > I'll be back here once I got time to do that in depth check.
> > > >
> > > > Until then whoever is affected (should be everyone) can give it a
> > > > thought as well.
> > > > Repro is as easy as
> > > > One console:
> > > > $ journalctl -f -u libvirt-guests
> > > > Other console:
> > > > $ systemctl stop libvirt-guests
> > > > $ systemctl start libvirt-guests
> > > >
> > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > 237-3ubuntu10.39 (20.04).
> > > > Maybe it is a systemd bug after all?
> > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > anyone else and what your systemd versions are?
> > >
> > > FWIW, it works correctly on Fedora 31 with systemd 243.
> >
> > Eoan with 242-7ubuntu3.7 is good as well.
> > I might need to try to get some interim versions from somewhere.
>
> I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> place where it lists running guests is screwed up slightly:
>
> Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
> Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI:
> Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2
> Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI...
> Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ...
> Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done
> Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ...
> Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done
> Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded.
> Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests.
>
> On resume it is even worse
>
> Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests...
> Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI...
> Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R
> Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R
> Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1:
> Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum
> Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e
> Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2:
> Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum
> Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests.
>
>
> It is possible it isn't systemd related - could be other packages that
> are co-incidentally affecting it.

Since launchpad holds all former builds still ready for download I
could easily test a few versions.

I up/downgraded just the following packages on an otherwise unmodified system:
- libnss-systemd
- libpam-systemd
- libsystemd0
- systemd
- systemd-container
- systemd-sysv

Those versions I did check:

245.4-4ubuntu3 bad
245.2-1ubuntu1 bad
244.3-1ubuntu1 bad
244.1-0ubuntu3 bad (bad on retry)
243-3ubuntu1 good (good (on retry)
242-7ubuntu3 good

Retry means that I went and installed from good -> bad -> good and the
behavior was the same.
So none of the installs fixed it to then be good forever. It was
consistently good <244 and bad >=244.

Since I only installed the mentioned systemd packages and no others
I'd say it is systemd.
I can't break this up into which package, since they inter-depend each other.
I guess it might be time to file a systemd bug for this - if not to
fix then to understand what is going on so that we can make a better
decision.

>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>


-- 
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Daniel P. Berrangé 5 years, 9 months ago
On Thu, Apr 30, 2020 at 07:36:54PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > >
> > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > > explanation why the latter fixes the former.
> > > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > > I'll be back here once I got time to do that in depth check.
> > > > >
> > > > > Until then whoever is affected (should be everyone) can give it a
> > > > > thought as well.
> > > > > Repro is as easy as
> > > > > One console:
> > > > > $ journalctl -f -u libvirt-guests
> > > > > Other console:
> > > > > $ systemctl stop libvirt-guests
> > > > > $ systemctl start libvirt-guests
> > > > >
> > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > > 237-3ubuntu10.39 (20.04).
> > > > > Maybe it is a systemd bug after all?
> > > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > > anyone else and what your systemd versions are?
> > > >
> > > > FWIW, it works correctly on Fedora 31 with systemd 243.
> > >
> > > Eoan with 242-7ubuntu3.7 is good as well.
> > > I might need to try to get some interim versions from somewhere.
> >
> > I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> > place where it lists running guests is screwed up slightly:
> >
> > Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI:
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI...
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ...
> > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done
> > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ...
> > Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done
> > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded.
> > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests.
> >
> > On resume it is even worse
> >
> > Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests...
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI...
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1:
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2:
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests.
> >
> >
> > It is possible it isn't systemd related - could be other packages that
> > are co-incidentally affecting it.
> 
> Since launchpad holds all former builds still ready for download I
> could easily test a few versions.
> 
> I up/downgraded just the following packages on an otherwise unmodified system:
> - libnss-systemd
> - libpam-systemd
> - libsystemd0
> - systemd
> - systemd-container
> - systemd-sysv
> 
> Those versions I did check:
> 
> 245.4-4ubuntu3 bad
> 245.2-1ubuntu1 bad
> 244.3-1ubuntu1 bad
> 244.1-0ubuntu3 bad (bad on retry)
> 243-3ubuntu1 good (good (on retry)
> 242-7ubuntu3 good
> 
> Retry means that I went and installed from good -> bad -> good and the
> behavior was the same.
> So none of the installs fixed it to then be good forever. It was
> consistently good <244 and bad >=244.

Based on that, a possibly interesting change between v243..v244 was
this:

commit 09d0b46ab61bebafe5bdc1be95ee153dfb13d6bc
Author: Lorenz Bauer <lmb@cloudflare.com>
Date:   Mon Nov 4 16:35:46 2019 +0000

    journal: refresh cached credentials of stdout streams
    
    journald assumes that getsockopt(SO_PEERCRED) correctly identifies the
    process on the remote end of the socket. However, this is incorrect
    according to man 7 socket:
    
        The returned  credentials  are  those that were in effect at the
        time of the call to connect(2) or socketpair(2).
    
    This becomes a problem when a new process inherits the stdout stream
    from a parent. First, log messages from the child process will
    be attributed to the parent. Second, the struct ucred used by journald
    becomes invalid as soon as the parent exits. Further sendmsg calls then
    fail with ENOENT. Logs for the child process then vanish from the journal.
    
    Fix this by using recvmsg on the stdout stream, and refreshing the cached
    struct ucred if SCM_CREDENTIALS indicate a new process.
    

but I can't see an obvious mistake the code which would cause the
behaviour we see.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Christian Ehrhardt 5 years, 9 months ago
On Thu, Apr 30, 2020 at 7:59 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Apr 30, 2020 at 07:36:54PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > > > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > >
> > > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > > > explanation why the latter fixes the former.
> > > > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > > > I'll be back here once I got time to do that in depth check.
> > > > > >
> > > > > > Until then whoever is affected (should be everyone) can give it a
> > > > > > thought as well.
> > > > > > Repro is as easy as
> > > > > > One console:
> > > > > > $ journalctl -f -u libvirt-guests
> > > > > > Other console:
> > > > > > $ systemctl stop libvirt-guests
> > > > > > $ systemctl start libvirt-guests
> > > > > >
> > > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > > > 237-3ubuntu10.39 (20.04).
> > > > > > Maybe it is a systemd bug after all?
> > > > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > > > anyone else and what your systemd versions are?
> > > > >
> > > > > FWIW, it works correctly on Fedora 31 with systemd 243.
> > > >
> > > > Eoan with 242-7ubuntu3.7 is good as well.
> > > > I might need to try to get some interim versions from somewhere.
> > >
> > > I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> > > place where it lists running guests is screwed up slightly:
> > >
> > > Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
> > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI:
> > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2
> > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI...
> > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ...
> > > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done
> > > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ...
> > > Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done
> > > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded.
> > > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests.
> > >
> > > On resume it is even worse
> > >
> > > Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests...
> > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI...
> > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R
> > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R
> > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1:
> > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum
> > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e
> > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2:
> > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum
> > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > > Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests.
> > >
> > >
> > > It is possible it isn't systemd related - could be other packages that
> > > are co-incidentally affecting it.
> >
> > Since launchpad holds all former builds still ready for download I
> > could easily test a few versions.
> >
> > I up/downgraded just the following packages on an otherwise unmodified system:
> > - libnss-systemd
> > - libpam-systemd
> > - libsystemd0
> > - systemd
> > - systemd-container
> > - systemd-sysv
> >
> > Those versions I did check:
> >
> > 245.4-4ubuntu3 bad
> > 245.2-1ubuntu1 bad
> > 244.3-1ubuntu1 bad
> > 244.1-0ubuntu3 bad (bad on retry)
> > 243-3ubuntu1 good (good (on retry)
> > 242-7ubuntu3 good
> >
> > Retry means that I went and installed from good -> bad -> good and the
> > behavior was the same.
> > So none of the installs fixed it to then be good forever. It was
> > consistently good <244 and bad >=244.
>
> Based on that, a possibly interesting change between v243..v244 was
> this:
>
> commit 09d0b46ab61bebafe5bdc1be95ee153dfb13d6bc
> Author: Lorenz Bauer <lmb@cloudflare.com>
> Date:   Mon Nov 4 16:35:46 2019 +0000
>
>     journal: refresh cached credentials of stdout streams

FYI - reported to systemd https://github.com/systemd/systemd/issues/15654

>     journald assumes that getsockopt(SO_PEERCRED) correctly identifies the
>     process on the remote end of the socket. However, this is incorrect
>     according to man 7 socket:
>
>         The returned  credentials  are  those that were in effect at the
>         time of the call to connect(2) or socketpair(2).
>
>     This becomes a problem when a new process inherits the stdout stream
>     from a parent. First, log messages from the child process will
>     be attributed to the parent. Second, the struct ucred used by journald
>     becomes invalid as soon as the parent exits. Further sendmsg calls then
>     fail with ENOENT. Logs for the child process then vanish from the journal.
>
>     Fix this by using recvmsg on the stdout stream, and refreshing the cached
>     struct ucred if SCM_CREDENTIALS indicate a new process.
>
>
> but I can't see an obvious mistake the code which would cause the
> behaviour we see.
>
> Regards,
> Daniel
> --
> |: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org         -o-            https://fstop138.berrange.com :|
> |: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|
>


-- 
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd


Re: [PATCH] libvirt-guests.sh: fix log output with new systemd
Posted by Daniel P. Berrangé 5 years, 9 months ago
On Thu, Apr 30, 2020 at 06:22:28PM +0100, Daniel P. Berrangé wrote:
> On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > explanation why the latter fixes the former.
> > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > I'll be back here once I got time to do that in depth check.
> > > >
> > > > Until then whoever is affected (should be everyone) can give it a
> > > > thought as well.
> > > > Repro is as easy as
> > > > One console:
> > > > $ journalctl -f -u libvirt-guests
> > > > Other console:
> > > > $ systemctl stop libvirt-guests
> > > > $ systemctl start libvirt-guests
> > > >
> > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > 237-3ubuntu10.39 (20.04).
> > > > Maybe it is a systemd bug after all?
> > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > anyone else and what your systemd versions are?
> > >
> > > FWIW, it works correctly on Fedora 31 with systemd 243.
> > 
> > Eoan with 242-7ubuntu3.7 is good as well.
> > I might need to try to get some interim versions from somewhere.
> 
> I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> place where it lists running guests is screwed up slightly:

I strace'd libvirt-guests.sh and the data sent looks fine.

I strace'd systemd-journald and the data it receives looks fine,
but the data systemd-journald writes into the journal, however, is
completely fubar:


recvmsg(32, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="Resuming guest core1: ", iov_len=2054}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=70624, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CTRUNC|MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 22
writev(33, [{iov_base="[ 2741.146550] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="R", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2741.163787] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70624]: ", iov_len=9}, {iov_base="R", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2742.518626] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70624]: ", iov_len=9}, {iov_base="esuming guest core1:", iov_len=20}, {iov_base="\n", iov_len=1}], 5) = 62
writev(33, [{iov_base="[ 2742.531644] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70655]: ", iov_len=9}, {iov_base="esum", iov_len=4}, {iov_base="\n", iov_len=1}], 5) = 46
recvmsg(32, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="Resuming guest core2: ", iov_len=2054}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=70662, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CTRUNC|MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 22

writev(33, [{iov_base="[ 2742.687436] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2742.690716] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70662]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2744.015012] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70662]: ", iov_len=9}, {iov_base="esuming guest core2:", iov_len=20}, {iov_base="\n", iov_len=1}], 5) = 62
writev(33, [{iov_base="[ 2744.019848] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70694]: ", iov_len=9}, {iov_base="esum", iov_len=4}, {iov_base="\n", iov_len=1}], 5) = 46
writev(33, [{iov_base="[ 2744.036819] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43

So I think we found a regression in journald making it mangle log
messages. I smells like it is corrupting some buffers somewhere.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|