tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++----------------- 1 file changed, 42 insertions(+), 32 deletions(-)
Newer systemd is too smart, it detects the PIDs of the gettext calls
and due to that a log lists libvirt-guests.sh with many different PIDs.
Furthermore it struggles to collect the output in time, so it can be
truncated and overall looks like:
libvirt-guests.sh[37986]: Running guests on default URI:
libvirt-guests.sh[37995]: Running guests on
libvirt-guests.sh[37977]: R
Gather the gettext result into a local variable and printing
the value from libvirt-guests.sh itself fixes both issues.
Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
---
tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
1 file changed, 42 insertions(+), 32 deletions(-)
diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
index 7af24dab3b..324abe3623 100644
--- a/tools/libvirt-guests.sh.in
+++ b/tools/libvirt-guests.sh.in
@@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
# Make sure this file is recognized as having translations: _("dummy")
. "@bindir@"/gettext.sh
+# Avoid output being listed under gettext PID and being truncated
+unbuffered_gettext () {
+ msg="$(gettext "$1")"
+ echo "$msg"
+}
+unbuffered_eval_gettext () {
+ msg="$(eval_gettext "$1")"
+ echo "$msg"
+}
+
export TEXTDOMAIN="@PACKAGE@" TEXTDOMAINDIR="@localedir@"
URIS=default
@@ -91,7 +101,7 @@ test_connect()
if run_virsh "$uri" connect 2>/dev/null; then
return 0;
else
- eval_gettext "Can't connect to \$uri. Skipping."
+ unbuffered_eval_gettext "Can't connect to \$uri. Skipping."
return 1
fi
}
@@ -160,7 +170,7 @@ start() {
[ -f "$LISTFILE" ] || { started; return 0; }
if [ "x$ON_BOOT" != xstart ]; then
- gettext "libvirt-guests is configured not to start any guests on boot"
+ unbuffered_gettext "libvirt-guests is configured not to start any guests on boot"
echo
rm -f "$LISTFILE"
started
@@ -184,19 +194,19 @@ start() {
done
set +f
if ! "$configured"; then
- eval_gettext "Ignoring guests on \$uri URI"; echo
+ unbuffered_eval_gettext "Ignoring guests on \$uri URI"; echo
continue
fi
test_connect "$uri" || continue
- eval_gettext "Resuming guests on \$uri URI..."; echo
+ unbuffered_eval_gettext "Resuming guests on \$uri URI..."; echo
for guest in $list; do
local name=$(guest_name "$uri" "$guest")
- eval_gettext "Resuming guest \$name: "
+ unbuffered_eval_gettext "Resuming guest \$name: "
if guest_is_on "$uri" "$guest"; then
if "$guest_running"; then
- gettext "already active"; echo
+ unbuffered_gettext "already active"; echo
else
if "$isfirst"; then
isfirst=false
@@ -205,7 +215,7 @@ start() {
fi
retval run_virsh "$uri" start $bypass "$name" \
>/dev/null && \
- gettext "done"; echo
+ unbuffered_gettext "done"; echo
if "$sync_time"; then
run_virsh "$uri" domtime --sync "$name" >/dev/null
fi
@@ -226,7 +236,7 @@ suspend_guest()
local uri=$1
local guest=$2
local name=$(guest_name "$uri" "$guest")
- local label=$(eval_gettext "Suspending \$name: ")
+ local label=$(unbuffered_eval_gettext "Suspending \$name: ")
local bypass=
local slept=0
@@ -249,7 +259,7 @@ suspend_guest()
fi
fi
done
- retval wait "$virsh_pid" && printf '%s%s\n' "$label" "$(gettext "done")"
+ retval wait "$virsh_pid" && printf '%s%s\n' "$label" "$(unbuffered_gettext "done")"
}
# shutdown_guest URI GUEST
@@ -265,16 +275,16 @@ shutdown_guest()
local format=
local slept=
- eval_gettext "Starting shutdown on guest: \$name"
+ unbuffered_eval_gettext "Starting shutdown on guest: \$name"
echo
retval run_virsh "$uri" shutdown "$guest" >/dev/null || return
if [ $timeout -gt 0 ]; then
check_timeout=true
- format=$(eval_gettext "Waiting for guest %s to shut down, %d seconds left\n")
+ format=$(unbuffered_eval_gettext "Waiting for guest %s to shut down, %d seconds left\n")
else
slept=0
- format=$(eval_gettext "Waiting for guest %s to shut down\n")
+ format=$(unbuffered_eval_gettext "Waiting for guest %s to shut down\n")
fi
while ! $check_timeout || [ "$timeout" -gt 0 ]; do
sleep 1
@@ -296,9 +306,9 @@ shutdown_guest()
if guest_is_on "$uri" "$guest"; then
if "$guest_running"; then
- eval_gettext "Shutdown of guest \$name failed to complete in time."
+ unbuffered_eval_gettext "Shutdown of guest \$name failed to complete in time."
else
- eval_gettext "Shutdown of guest \$name complete."
+ unbuffered_eval_gettext "Shutdown of guest \$name complete."
fi
echo
fi
@@ -313,7 +323,7 @@ shutdown_guest_async()
local guest=$2
local name=$(guest_name "$uri" "$guest")
- eval_gettext "Starting shutdown on guest: \$name"
+ unbuffered_eval_gettext "Starting shutdown on guest: \$name"
echo
retval run_virsh "$uri" shutdown "$guest" > /dev/null
}
@@ -339,7 +349,7 @@ check_guests_shutdown()
guests_shutting_down=
for guest in $guests_to_check; do
if ! guest_is_on "$uri" "$guest" >/dev/null 2>&1; then
- eval_gettext "Failed to determine state of guest: \$guest. Not tracking it anymore."
+ unbuffered_eval_gettext "Failed to determine state of guest: \$guest. Not tracking it anymore."
echo
continue
fi
@@ -366,7 +376,7 @@ print_guests_shutdown()
local name=$(guest_name "$uri" "$guest")
if [ -n "$name" ]; then
- eval_gettext "Shutdown of guest \$name complete."
+ unbuffered_eval_gettext "Shutdown of guest \$name complete."
echo
fi
done
@@ -386,10 +396,10 @@ shutdown_guests_parallel()
if [ $timeout -gt 0 ]; then
check_timeout=true
- format=$(eval_gettext "Waiting for %d guests to shut down, %d seconds left\n")
+ format=$(unbuffered_eval_gettext "Waiting for %d guests to shut down, %d seconds left\n")
else
slept=0
- format=$(eval_gettext "Waiting for %d guests to shut down\n")
+ format=$(unbuffered_eval_gettext "Waiting for %d guests to shut down\n")
fi
while [ -n "$on_shutdown" ] || [ -n "$guests" ]; do
while [ -n "$guests" ] &&
@@ -417,7 +427,7 @@ shutdown_guests_parallel()
fi
timeout=$(($timeout - 1))
if [ $timeout -le 0 ]; then
- eval_gettext "Timeout expired while shutting down domains"; echo
+ unbuffered_eval_gettext "Timeout expired while shutting down domains"; echo
RETVAL=1
return
fi
@@ -447,7 +457,7 @@ stop() {
if [ "/x$ON_SHUTDOWN" = xshutdown ]; then
suspending=false
if [ $SHUTDOWN_TIMEOUT -lt 0 ]; then
- gettext "SHUTDOWN_TIMEOUT must be equal or greater than 0"
+ unbuffered_gettext "SHUTDOWN_TIMEOUT must be equal or greater than 0"
echo
RETVAL=6
return
@@ -461,7 +471,7 @@ stop() {
test_connect "$uri" || continue
- eval_gettext "Running guests on \$uri URI: "
+ unbuffered_eval_gettext "Running guests on \$uri URI: "
local list=$(list_guests "$uri")
if [ $? -eq 0 ]; then
@@ -473,7 +483,7 @@ stop() {
done
if "$empty"; then
- gettext "no running guests."
+ unbuffered_gettext "no running guests."
fi
echo
fi
@@ -486,7 +496,7 @@ stop() {
for uuid in $transient; do
if "$empty"; then
- eval_gettext "Not suspending transient guests on URI: \$uri: "
+ unbuffered_eval_gettext "Not suspending transient guests on URI: \$uri: "
empty=false
else
printf ", "
@@ -497,14 +507,14 @@ stop() {
# reload domain list to contain only persistent guests
list=$(list_guests "$uri" "--persistent")
if [ $? -ne 0 ]; then
- eval_gettext "Failed to list persistent guests on \$uri"
+ unbuffered_eval_gettext "Failed to list persistent guests on \$uri"
echo
RETVAL=1
set +f
return
fi
else
- gettext "Failed to list transient guests"
+ unbuffered_gettext "Failed to list transient guests"
echo
RETVAL=1
set +f
@@ -521,9 +531,9 @@ stop() {
if [ -s "$LISTFILE" ]; then
while read uri list; do
if "$suspending"; then
- eval_gettext "Suspending guests on \$uri URI..."; echo
+ unbuffered_eval_gettext "Suspending guests on \$uri URI..."; echo
else
- eval_gettext "Shutting down guests on \$uri URI..."; echo
+ unbuffered_eval_gettext "Shutting down guests on \$uri URI..."; echo
fi
if [ "$PARALLEL_SHUTDOWN" -gt 1 ] &&
@@ -566,14 +576,14 @@ gueststatus() {
# since there is no external daemon process matching this init script.
rh_status() {
if [ -f "$LISTFILE" ]; then
- gettext "stopped, with saved guests"; echo
+ unbuffered_gettext "stopped, with saved guests"; echo
RETVAL=3
else
if [ -f "$VAR_SUBSYS_LIBVIRT_GUESTS" ]; then
- gettext "started"; echo
+ unbuffered_gettext "started"; echo
RETVAL=0
else
- gettext "stopped, with no saved guests"; echo
+ unbuffered_gettext "stopped, with no saved guests"; echo
RETVAL=3
fi
fi
@@ -583,7 +593,7 @@ rh_status() {
# Display usage string, then exit with VAL (defaults to 2).
usage() {
local program_name=$0
- eval_gettext "Usage: \$program_name {start|stop|status|restart|"\
+ unbuffered_eval_gettext "Usage: \$program_name {start|stop|status|restart|"\
"condrestart|try-restart|reload|force-reload|gueststatus|shutdown}"; echo
exit ${1-2}
}
--
2.26.0
On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> Newer systemd is too smart, it detects the PIDs of the gettext calls
> and due to that a log lists libvirt-guests.sh with many different PIDs.
>
> Furthermore it struggles to collect the output in time, so it can be
> truncated and overall looks like:
> libvirt-guests.sh[37986]: Running guests on default URI:
> libvirt-guests.sh[37995]: Running guests on
> libvirt-guests.sh[37977]: R
>
> Gather the gettext result into a local variable and printing
> the value from libvirt-guests.sh itself fixes both issues.
>
> Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
>
> Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> ---
> tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> 1 file changed, 42 insertions(+), 32 deletions(-)
>
> diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> index 7af24dab3b..324abe3623 100644
> --- a/tools/libvirt-guests.sh.in
> +++ b/tools/libvirt-guests.sh.in
> @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> # Make sure this file is recognized as having translations: _("dummy")
> . "@bindir@"/gettext.sh
>
> +# Avoid output being listed under gettext PID and being truncated
> +unbuffered_gettext () {
> + msg="$(gettext "$1")"
> + echo "$msg"
> +}
> +unbuffered_eval_gettext () {
> + msg="$(eval_gettext "$1")"
> + echo "$msg"
> +}
> +
Ah, is this the case where gettext prints the text in many sequences
whereas echo does it all at once? Or what do you think is happening
here? I don't quite understand how buffering steps into this.
Michal
On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
>
> On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > and due to that a log lists libvirt-guests.sh with many different PIDs.
> >
> > Furthermore it struggles to collect the output in time, so it can be
> > truncated and overall looks like:
> > libvirt-guests.sh[37986]: Running guests on default URI:
> > libvirt-guests.sh[37995]: Running guests on
> > libvirt-guests.sh[37977]: R
> >
> > Gather the gettext result into a local variable and printing
> > the value from libvirt-guests.sh itself fixes both issues.
> >
> > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> >
> > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > ---
> > tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> > 1 file changed, 42 insertions(+), 32 deletions(-)
> >
> > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > index 7af24dab3b..324abe3623 100644
> > --- a/tools/libvirt-guests.sh.in
> > +++ b/tools/libvirt-guests.sh.in
> > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> > # Make sure this file is recognized as having translations: _("dummy")
> > . "@bindir@"/gettext.sh
> >
> > +# Avoid output being listed under gettext PID and being truncated
> > +unbuffered_gettext () {
> > + msg="$(gettext "$1")"
> > + echo "$msg"
> > +}
> > +unbuffered_eval_gettext () {
> > + msg="$(eval_gettext "$1")"
> > + echo "$msg"
> > +}
> > +
>
> Ah, is this the case where gettext prints the text in many sequences
> whereas echo does it all at once? Or what do you think is happening
> here? I don't quite understand how buffering steps into this.
You have two effects:
- systemd tries to detect and report which PID the output is from when
something it printed. Therefore it appears to a user as if the script
itself would have been invoked multiple times while actually is will be
the PIDs of the different gettext calls
- I assume that to get this content it is hitchhiking the output streams
somehow - the gettext processes are very very short lived and it seems
when they are exiting some output of them might be lost.
Maybe calling it "buffering" was wrong, but the change above resolves
both issues, the odd PIDs and the truncation by printing from the actual script.
> Michal
>
--
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd
On Thu, Apr 30, 2020 at 02:42:21PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
> >
> > On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > > and due to that a log lists libvirt-guests.sh with many different PIDs.
> > >
> > > Furthermore it struggles to collect the output in time, so it can be
> > > truncated and overall looks like:
> > > libvirt-guests.sh[37986]: Running guests on default URI:
> > > libvirt-guests.sh[37995]: Running guests on
> > > libvirt-guests.sh[37977]: R
> > >
> > > Gather the gettext result into a local variable and printing
> > > the value from libvirt-guests.sh itself fixes both issues.
> > >
> > > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> > >
> > > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > > ---
> > > tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> > > 1 file changed, 42 insertions(+), 32 deletions(-)
> > >
> > > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > > index 7af24dab3b..324abe3623 100644
> > > --- a/tools/libvirt-guests.sh.in
> > > +++ b/tools/libvirt-guests.sh.in
> > > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> > > # Make sure this file is recognized as having translations: _("dummy")
> > > . "@bindir@"/gettext.sh
> > >
> > > +# Avoid output being listed under gettext PID and being truncated
> > > +unbuffered_gettext () {
> > > + msg="$(gettext "$1")"
> > > + echo "$msg"
> > > +}
> > > +unbuffered_eval_gettext () {
> > > + msg="$(eval_gettext "$1")"
> > > + echo "$msg"
> > > +}
> > > +
> >
> > Ah, is this the case where gettext prints the text in many sequences
> > whereas echo does it all at once? Or what do you think is happening
> > here? I don't quite understand how buffering steps into this.
>
> You have two effects:
> - systemd tries to detect and report which PID the output is from when
> something it printed. Therefore it appears to a user as if the script
> itself would have been invoked multiple times while actually is will be
> the PIDs of the different gettext calls
> - I assume that to get this content it is hitchhiking the output streams
> somehow - the gettext processes are very very short lived and it seems
> when they are exiting some output of them might be lost.
This doesn't pass a sniff test to me.
systemd is reading stdout/stderr of the libvirt-guests script. How
would it ever tell that certain lines from stdout/err are coming from
different PIDs. Systemd isn't involved in spawning the gettext processes.
It can see they come & go via cgroups pids, but that's about it.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
>
> On Thu, Apr 30, 2020 at 02:42:21PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 2:10 PM Michal Privoznik <mprivozn@redhat.com> wrote:
> > >
> > > On 4/30/20 11:59 AM, Christian Ehrhardt wrote:
> > > > Newer systemd is too smart, it detects the PIDs of the gettext calls
> > > > and due to that a log lists libvirt-guests.sh with many different PIDs.
> > > >
> > > > Furthermore it struggles to collect the output in time, so it can be
> > > > truncated and overall looks like:
> > > > libvirt-guests.sh[37986]: Running guests on default URI:
> > > > libvirt-guests.sh[37995]: Running guests on
> > > > libvirt-guests.sh[37977]: R
> > > >
> > > > Gather the gettext result into a local variable and printing
> > > > the value from libvirt-guests.sh itself fixes both issues.
> > > >
> > > > Fixes: https://bugs.launchpad.net/ubuntu/+source/libvirt/+bug/1875708
> > > >
> > > > Signed-off-by: Christian Ehrhardt <christian.ehrhardt@canonical.com>
> > > > ---
> > > > tools/libvirt-guests.sh.in | 74 +++++++++++++++++++++-----------------
> > > > 1 file changed, 42 insertions(+), 32 deletions(-)
> > > >
> > > > diff --git a/tools/libvirt-guests.sh.in b/tools/libvirt-guests.sh.in
> > > > index 7af24dab3b..324abe3623 100644
> > > > --- a/tools/libvirt-guests.sh.in
> > > > +++ b/tools/libvirt-guests.sh.in
> > > > @@ -28,6 +28,16 @@ test ! -r "$sysconfdir"/rc.d/init.d/functions ||
> > > > # Make sure this file is recognized as having translations: _("dummy")
> > > > . "@bindir@"/gettext.sh
> > > >
> > > > +# Avoid output being listed under gettext PID and being truncated
> > > > +unbuffered_gettext () {
> > > > + msg="$(gettext "$1")"
> > > > + echo "$msg"
> > > > +}
> > > > +unbuffered_eval_gettext () {
> > > > + msg="$(eval_gettext "$1")"
> > > > + echo "$msg"
> > > > +}
> > > > +
> > >
> > > Ah, is this the case where gettext prints the text in many sequences
> > > whereas echo does it all at once? Or what do you think is happening
> > > here? I don't quite understand how buffering steps into this.
> >
> > You have two effects:
> > - systemd tries to detect and report which PID the output is from when
> > something it printed. Therefore it appears to a user as if the script
> > itself would have been invoked multiple times while actually is will be
> > the PIDs of the different gettext calls
> > - I assume that to get this content it is hitchhiking the output streams
> > somehow - the gettext processes are very very short lived and it seems
> > when they are exiting some output of them might be lost.
>
> This doesn't pass a sniff test to me.
>
> systemd is reading stdout/stderr of the libvirt-guests script. How
> would it ever tell that certain lines from stdout/err are coming from
> different PIDs. Systemd isn't involved in spawning the gettext processes.
> It can see they come & go via cgroups pids, but that's about it.
Well it seems I have a reproducible symptom and a fix, but not the
explanation why the latter fixes the former.
I'll need to dive into some debug & analysis myself to explain it better.
I'll be back here once I got time to do that in depth check.
Until then whoever is affected (should be everyone) can give it a
thought as well.
Repro is as easy as
One console:
$ journalctl -f -u libvirt-guests
Other console:
$ systemctl stop libvirt-guests
$ systemctl start libvirt-guests
I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
237-3ubuntu10.39 (20.04).
Maybe it is a systemd bug after all?
I'd be interested to hear if that is/isn't clobbering output for
anyone else and what your systemd versions are?
> Regards,
> Daniel
> --
> |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o- https://fstop138.berrange.com :|
> |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>
--
Christian Ehrhardt
Staff Engineer, Ubuntu Server
Canonical Ltd
On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote: > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > Well it seems I have a reproducible symptom and a fix, but not the > explanation why the latter fixes the former. > I'll need to dive into some debug & analysis myself to explain it better. > I'll be back here once I got time to do that in depth check. > > Until then whoever is affected (should be everyone) can give it a > thought as well. > Repro is as easy as > One console: > $ journalctl -f -u libvirt-guests > Other console: > $ systemctl stop libvirt-guests > $ systemctl start libvirt-guests > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on > 237-3ubuntu10.39 (20.04). > Maybe it is a systemd bug after all? > I'd be interested to hear if that is/isn't clobbering output for > anyone else and what your systemd versions are? FWIW, it works correctly on Fedora 31 with systemd 243. Apr 30 16:07:27 localhost.localdomain systemd[1]: Starting Suspend/Resume Running libvirt Guests... Apr 30 16:07:27 localhost.localdomain systemd[1]: Started Suspend/Resume Running libvirt Guests. Apr 30 16:08:33 localhost.localdomain systemd[1]: Stopping Suspend/Resume Running libvirt Guests... Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Running guests on default URI: memtest, memtest1 Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending guests on default URI... Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: ... Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: done Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: ... Apr 30 16:08:35 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: done Apr 30 16:08:35 localhost.localdomain systemd[1]: libvirt-guests.service: Succeeded. Apr 30 16:08:35 localhost.localdomain systemd[1]: Stopped Suspend/Resume Running libvirt Guests. I'll see if I can do a test with newer systemd too Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote: > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > Well it seems I have a reproducible symptom and a fix, but not the > > explanation why the latter fixes the former. > > I'll need to dive into some debug & analysis myself to explain it better. > > I'll be back here once I got time to do that in depth check. > > > > Until then whoever is affected (should be everyone) can give it a > > thought as well. > > Repro is as easy as > > One console: > > $ journalctl -f -u libvirt-guests > > Other console: > > $ systemctl stop libvirt-guests > > $ systemctl start libvirt-guests > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on > > 237-3ubuntu10.39 (20.04). > > Maybe it is a systemd bug after all? > > I'd be interested to hear if that is/isn't clobbering output for > > anyone else and what your systemd versions are? > > FWIW, it works correctly on Fedora 31 with systemd 243. Eoan with 242-7ubuntu3.7 is good as well. I might need to try to get some interim versions from somewhere. > > Apr 30 16:07:27 localhost.localdomain systemd[1]: Starting Suspend/Resume Running libvirt Guests... > Apr 30 16:07:27 localhost.localdomain systemd[1]: Started Suspend/Resume Running libvirt Guests. > Apr 30 16:08:33 localhost.localdomain systemd[1]: Stopping Suspend/Resume Running libvirt Guests... > Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Running guests on default URI: memtest, memtest1 > Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending guests on default URI... > Apr 30 16:08:33 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: ... > Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest: done > Apr 30 16:08:34 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: ... > Apr 30 16:08:35 localhost.localdomain libvirt-guests.sh[2192485]: Suspending memtest1: done > Apr 30 16:08:35 localhost.localdomain systemd[1]: libvirt-guests.service: Succeeded. > Apr 30 16:08:35 localhost.localdomain systemd[1]: Stopped Suspend/Resume Running libvirt Guests. > > I'll see if I can do a test with newer systemd too > > > Regards, > Daniel > -- > |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o- https://fstop138.berrange.com :| > |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| > -- Christian Ehrhardt Staff Engineer, Ubuntu Server Canonical Ltd
On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote: > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote: > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > Well it seems I have a reproducible symptom and a fix, but not the > > > explanation why the latter fixes the former. > > > I'll need to dive into some debug & analysis myself to explain it better. > > > I'll be back here once I got time to do that in depth check. > > > > > > Until then whoever is affected (should be everyone) can give it a > > > thought as well. > > > Repro is as easy as > > > One console: > > > $ journalctl -f -u libvirt-guests > > > Other console: > > > $ systemctl stop libvirt-guests > > > $ systemctl start libvirt-guests > > > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on > > > 237-3ubuntu10.39 (20.04). > > > Maybe it is a systemd bug after all? > > > I'd be interested to hear if that is/isn't clobbering output for > > > anyone else and what your systemd versions are? > > > > FWIW, it works correctly on Fedora 31 with systemd 243. > > Eoan with 242-7ubuntu3.7 is good as well. > I might need to try to get some interim versions from somewhere. I've reproduced on Fedora 33 rawhide with systemd 245 - the first place where it lists running guests is screwed up slightly: Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests... Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI: Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2 Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI... Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ... Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ... Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded. Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests. On resume it is even worse Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests... Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI... Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1: Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2: Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests. It is possible it isn't systemd related - could be other packages that are co-incidentally affecting it. Regards, Daniel -- |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| |: https://libvirt.org -o- https://fstop138.berrange.com :| |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote: > > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote: > > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > Well it seems I have a reproducible symptom and a fix, but not the > > > > explanation why the latter fixes the former. > > > > I'll need to dive into some debug & analysis myself to explain it better. > > > > I'll be back here once I got time to do that in depth check. > > > > > > > > Until then whoever is affected (should be everyone) can give it a > > > > thought as well. > > > > Repro is as easy as > > > > One console: > > > > $ journalctl -f -u libvirt-guests > > > > Other console: > > > > $ systemctl stop libvirt-guests > > > > $ systemctl start libvirt-guests > > > > > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on > > > > 237-3ubuntu10.39 (20.04). > > > > Maybe it is a systemd bug after all? > > > > I'd be interested to hear if that is/isn't clobbering output for > > > > anyone else and what your systemd versions are? > > > > > > FWIW, it works correctly on Fedora 31 with systemd 243. > > > > Eoan with 242-7ubuntu3.7 is good as well. > > I might need to try to get some interim versions from somewhere. > > I've reproduced on Fedora 33 rawhide with systemd 245 - the first > place where it lists running guests is screwed up slightly: > > Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests... > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI: > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2 > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI... > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ... > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ... > Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded. > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests. > > On resume it is even worse > > Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests... > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI... > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1: > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2: > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e > Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests. > > > It is possible it isn't systemd related - could be other packages that > are co-incidentally affecting it. Since launchpad holds all former builds still ready for download I could easily test a few versions. I up/downgraded just the following packages on an otherwise unmodified system: - libnss-systemd - libpam-systemd - libsystemd0 - systemd - systemd-container - systemd-sysv Those versions I did check: 245.4-4ubuntu3 bad 245.2-1ubuntu1 bad 244.3-1ubuntu1 bad 244.1-0ubuntu3 bad (bad on retry) 243-3ubuntu1 good (good (on retry) 242-7ubuntu3 good Retry means that I went and installed from good -> bad -> good and the behavior was the same. So none of the installs fixed it to then be good forever. It was consistently good <244 and bad >=244. Since I only installed the mentioned systemd packages and no others I'd say it is systemd. I can't break this up into which package, since they inter-depend each other. I guess it might be time to file a systemd bug for this - if not to fix then to understand what is going on so that we can make a better decision. > > Regards, > Daniel > -- > |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o- https://fstop138.berrange.com :| > |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| > -- Christian Ehrhardt Staff Engineer, Ubuntu Server Canonical Ltd
On Thu, Apr 30, 2020 at 07:36:54PM +0200, Christian Ehrhardt wrote:
> On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> >
> > On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > >
> > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > > explanation why the latter fixes the former.
> > > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > > I'll be back here once I got time to do that in depth check.
> > > > >
> > > > > Until then whoever is affected (should be everyone) can give it a
> > > > > thought as well.
> > > > > Repro is as easy as
> > > > > One console:
> > > > > $ journalctl -f -u libvirt-guests
> > > > > Other console:
> > > > > $ systemctl stop libvirt-guests
> > > > > $ systemctl start libvirt-guests
> > > > >
> > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > > 237-3ubuntu10.39 (20.04).
> > > > > Maybe it is a systemd bug after all?
> > > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > > anyone else and what your systemd versions are?
> > > >
> > > > FWIW, it works correctly on Fedora 31 with systemd 243.
> > >
> > > Eoan with 242-7ubuntu3.7 is good as well.
> > > I might need to try to get some interim versions from somewhere.
> >
> > I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> > place where it lists running guests is screwed up slightly:
> >
> > Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests...
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI:
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI...
> > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ...
> > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done
> > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ...
> > Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done
> > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded.
> > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests.
> >
> > On resume it is even worse
> >
> > Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests...
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI...
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R
> > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1:
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2:
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum
> > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e
> > Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests.
> >
> >
> > It is possible it isn't systemd related - could be other packages that
> > are co-incidentally affecting it.
>
> Since launchpad holds all former builds still ready for download I
> could easily test a few versions.
>
> I up/downgraded just the following packages on an otherwise unmodified system:
> - libnss-systemd
> - libpam-systemd
> - libsystemd0
> - systemd
> - systemd-container
> - systemd-sysv
>
> Those versions I did check:
>
> 245.4-4ubuntu3 bad
> 245.2-1ubuntu1 bad
> 244.3-1ubuntu1 bad
> 244.1-0ubuntu3 bad (bad on retry)
> 243-3ubuntu1 good (good (on retry)
> 242-7ubuntu3 good
>
> Retry means that I went and installed from good -> bad -> good and the
> behavior was the same.
> So none of the installs fixed it to then be good forever. It was
> consistently good <244 and bad >=244.
Based on that, a possibly interesting change between v243..v244 was
this:
commit 09d0b46ab61bebafe5bdc1be95ee153dfb13d6bc
Author: Lorenz Bauer <lmb@cloudflare.com>
Date: Mon Nov 4 16:35:46 2019 +0000
journal: refresh cached credentials of stdout streams
journald assumes that getsockopt(SO_PEERCRED) correctly identifies the
process on the remote end of the socket. However, this is incorrect
according to man 7 socket:
The returned credentials are those that were in effect at the
time of the call to connect(2) or socketpair(2).
This becomes a problem when a new process inherits the stdout stream
from a parent. First, log messages from the child process will
be attributed to the parent. Second, the struct ucred used by journald
becomes invalid as soon as the parent exits. Further sendmsg calls then
fail with ENOENT. Logs for the child process then vanish from the journal.
Fix this by using recvmsg on the stdout stream, and refreshing the cached
struct ucred if SCM_CREDENTIALS indicate a new process.
but I can't see an obvious mistake the code which would cause the
behaviour we see.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Thu, Apr 30, 2020 at 7:59 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > On Thu, Apr 30, 2020 at 07:36:54PM +0200, Christian Ehrhardt wrote: > > On Thu, Apr 30, 2020 at 7:23 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > > > On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote: > > > > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > > > > > > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote: > > > > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote: > > > > > > Well it seems I have a reproducible symptom and a fix, but not the > > > > > > explanation why the latter fixes the former. > > > > > > I'll need to dive into some debug & analysis myself to explain it better. > > > > > > I'll be back here once I got time to do that in depth check. > > > > > > > > > > > > Until then whoever is affected (should be everyone) can give it a > > > > > > thought as well. > > > > > > Repro is as easy as > > > > > > One console: > > > > > > $ journalctl -f -u libvirt-guests > > > > > > Other console: > > > > > > $ systemctl stop libvirt-guests > > > > > > $ systemctl start libvirt-guests > > > > > > > > > > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on > > > > > > 237-3ubuntu10.39 (20.04). > > > > > > Maybe it is a systemd bug after all? > > > > > > I'd be interested to hear if that is/isn't clobbering output for > > > > > > anyone else and what your systemd versions are? > > > > > > > > > > FWIW, it works correctly on Fedora 31 with systemd 243. > > > > > > > > Eoan with 242-7ubuntu3.7 is good as well. > > > > I might need to try to get some interim versions from somewhere. > > > > > > I've reproduced on Fedora 33 rawhide with systemd 245 - the first > > > place where it lists running guests is screwed up slightly: > > > > > > Apr 30 17:16:13 libvirt-fedora-rawhide systemd[1]: Stopping Suspend/Resume Running libvirt Guests... > > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69903]: Running guests on default URI: > > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Runningcore2 > > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69934]: Suspending guests on default URI... > > > Apr 30 17:16:13 libvirt-fedora-rawhide libvirt-guests.sh[69892]: SSuspending core1: ... > > > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core1: done > > > Apr 30 17:16:14 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: ... > > > Apr 30 17:16:15 libvirt-fedora-rawhide libvirt-guests.sh[69892]: Suspending core2: done > > > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: libvirt-guests.service: Succeeded. > > > Apr 30 17:16:15 libvirt-fedora-rawhide systemd[1]: Stopped Suspend/Resume Running libvirt Guests. > > > > > > On resume it is even worse > > > > > > Apr 30 17:19:40 libvirt-fedora-rawhide systemd[1]: Starting Suspend/Resume Running libvirt Guests... > > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70041]: Resuming guests on default URI... > > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70030]: R > > > Apr 30 17:19:40 libvirt-fedora-rawhide libvirt-guests.sh[70048]: R > > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70048]: esuming guest core1: > > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70079]: esum > > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e > > > Apr 30 17:19:41 libvirt-fedora-rawhide libvirt-guests.sh[70086]: e > > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70086]: esuming guest core2: > > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70119]: esum > > > Apr 30 17:19:42 libvirt-fedora-rawhide libvirt-guests.sh[70030]: e > > > Apr 30 17:19:42 libvirt-fedora-rawhide systemd[1]: Finished Suspend/Resume Running libvirt Guests. > > > > > > > > > It is possible it isn't systemd related - could be other packages that > > > are co-incidentally affecting it. > > > > Since launchpad holds all former builds still ready for download I > > could easily test a few versions. > > > > I up/downgraded just the following packages on an otherwise unmodified system: > > - libnss-systemd > > - libpam-systemd > > - libsystemd0 > > - systemd > > - systemd-container > > - systemd-sysv > > > > Those versions I did check: > > > > 245.4-4ubuntu3 bad > > 245.2-1ubuntu1 bad > > 244.3-1ubuntu1 bad > > 244.1-0ubuntu3 bad (bad on retry) > > 243-3ubuntu1 good (good (on retry) > > 242-7ubuntu3 good > > > > Retry means that I went and installed from good -> bad -> good and the > > behavior was the same. > > So none of the installs fixed it to then be good forever. It was > > consistently good <244 and bad >=244. > > Based on that, a possibly interesting change between v243..v244 was > this: > > commit 09d0b46ab61bebafe5bdc1be95ee153dfb13d6bc > Author: Lorenz Bauer <lmb@cloudflare.com> > Date: Mon Nov 4 16:35:46 2019 +0000 > > journal: refresh cached credentials of stdout streams FYI - reported to systemd https://github.com/systemd/systemd/issues/15654 > journald assumes that getsockopt(SO_PEERCRED) correctly identifies the > process on the remote end of the socket. However, this is incorrect > according to man 7 socket: > > The returned credentials are those that were in effect at the > time of the call to connect(2) or socketpair(2). > > This becomes a problem when a new process inherits the stdout stream > from a parent. First, log messages from the child process will > be attributed to the parent. Second, the struct ucred used by journald > becomes invalid as soon as the parent exits. Further sendmsg calls then > fail with ENOENT. Logs for the child process then vanish from the journal. > > Fix this by using recvmsg on the stdout stream, and refreshing the cached > struct ucred if SCM_CREDENTIALS indicate a new process. > > > but I can't see an obvious mistake the code which would cause the > behaviour we see. > > Regards, > Daniel > -- > |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o- https://fstop138.berrange.com :| > |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :| > -- Christian Ehrhardt Staff Engineer, Ubuntu Server Canonical Ltd
On Thu, Apr 30, 2020 at 06:22:28PM +0100, Daniel P. Berrangé wrote:
> On Thu, Apr 30, 2020 at 06:28:08PM +0200, Christian Ehrhardt wrote:
> > On Thu, Apr 30, 2020 at 5:10 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > >
> > > On Thu, Apr 30, 2020 at 04:58:25PM +0200, Christian Ehrhardt wrote:
> > > > On Thu, Apr 30, 2020 at 2:51 PM Daniel P. Berrangé <berrange@redhat.com> wrote:
> > > > Well it seems I have a reproducible symptom and a fix, but not the
> > > > explanation why the latter fixes the former.
> > > > I'll need to dive into some debug & analysis myself to explain it better.
> > > > I'll be back here once I got time to do that in depth check.
> > > >
> > > > Until then whoever is affected (should be everyone) can give it a
> > > > thought as well.
> > > > Repro is as easy as
> > > > One console:
> > > > $ journalctl -f -u libvirt-guests
> > > > Other console:
> > > > $ systemctl stop libvirt-guests
> > > > $ systemctl start libvirt-guests
> > > >
> > > > I see it with 245.4-4ubuntu3 (18.04) I'm not seeing it on
> > > > 237-3ubuntu10.39 (20.04).
> > > > Maybe it is a systemd bug after all?
> > > > I'd be interested to hear if that is/isn't clobbering output for
> > > > anyone else and what your systemd versions are?
> > >
> > > FWIW, it works correctly on Fedora 31 with systemd 243.
> >
> > Eoan with 242-7ubuntu3.7 is good as well.
> > I might need to try to get some interim versions from somewhere.
>
> I've reproduced on Fedora 33 rawhide with systemd 245 - the first
> place where it lists running guests is screwed up slightly:
I strace'd libvirt-guests.sh and the data sent looks fine.
I strace'd systemd-journald and the data it receives looks fine,
but the data systemd-journald writes into the journal, however, is
completely fubar:
recvmsg(32, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="Resuming guest core1: ", iov_len=2054}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=70624, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CTRUNC|MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 22
writev(33, [{iov_base="[ 2741.146550] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="R", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2741.163787] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70624]: ", iov_len=9}, {iov_base="R", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2742.518626] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70624]: ", iov_len=9}, {iov_base="esuming guest core1:", iov_len=20}, {iov_base="\n", iov_len=1}], 5) = 62
writev(33, [{iov_base="[ 2742.531644] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70655]: ", iov_len=9}, {iov_base="esum", iov_len=4}, {iov_base="\n", iov_len=1}], 5) = 46
recvmsg(32, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="Resuming guest core2: ", iov_len=2054}], msg_iovlen=1, msg_control=[{cmsg_len=28, cmsg_level=SOL_SOCKET, cmsg_type=SCM_CREDENTIALS, cmsg_data={pid=70662, uid=0, gid=0}}], msg_controllen=32, msg_flags=MSG_CTRUNC|MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_CMSG_CLOEXEC) = 22
writev(33, [{iov_base="[ 2742.687436] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2742.690716] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70662]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
writev(33, [{iov_base="[ 2744.015012] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70662]: ", iov_len=9}, {iov_base="esuming guest core2:", iov_len=20}, {iov_base="\n", iov_len=1}], 5) = 62
writev(33, [{iov_base="[ 2744.019848] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70694]: ", iov_len=9}, {iov_base="esum", iov_len=4}, {iov_base="\n", iov_len=1}], 5) = 46
writev(33, [{iov_base="[ 2744.036819] ", iov_len=15}, {iov_base="libvirt-guests.sh", iov_len=17}, {iov_base="[70605]: ", iov_len=9}, {iov_base="e", iov_len=1}, {iov_base="\n", iov_len=1}], 5) = 43
So I think we found a regression in journald making it mangle log
messages. I smells like it is corrupting some buffers somewhere.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
© 2016 - 2026 Red Hat, Inc.