[PATCH 09/11] replay: stop us hanging in rr_wait_io_event

Alex Bennée posted 11 patches 11 months, 3 weeks ago
Maintainers: Richard Henderson <richard.henderson@linaro.org>, Paolo Bonzini <pbonzini@redhat.com>, "Marc-André Lureau" <marcandre.lureau@redhat.com>, Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>, John Snow <jsnow@redhat.com>, Cleber Rosa <crosa@redhat.com>, "Philippe Mathieu-Daudé" <philmd@linaro.org>, Wainer dos Santos Moschetta <wainersm@redhat.com>, Beraldo Leal <bleal@redhat.com>, Eduardo Habkost <eduardo@habkost.net>
[PATCH 09/11] replay: stop us hanging in rr_wait_io_event
Posted by Alex Bennée 11 months, 3 weeks ago
A lot of the hangs I see are when we end up spinning in
rr_wait_io_event for an event that will never come in playback. Add a
new check function which can see if we are in PLAY mode and kick us
out of the wait function so the event can be processed.

This fixes most of the failures in replay_kernel.py

Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2013
Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
---
 include/sysemu/replay.h      |  5 +++++
 accel/tcg/tcg-accel-ops-rr.c |  2 +-
 replay/replay.c              | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
index 08aae5869f..83995ae4bd 100644
--- a/include/sysemu/replay.h
+++ b/include/sysemu/replay.h
@@ -70,6 +70,11 @@ int replay_get_instructions(void);
 /*! Updates instructions counter in replay mode. */
 void replay_account_executed_instructions(void);
 
+/**
+ * replay_can_wait: check if we should pause for wait-io
+ */
+bool replay_can_wait(void);
+
 /* Processing clocks and other time sources */
 
 /*! Save the specified clock */
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 611932f3c3..825e35b3dc 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
 {
     CPUState *cpu;
 
-    while (all_cpu_threads_idle()) {
+    while (all_cpu_threads_idle() && replay_can_wait()) {
         rr_stop_kick_timer();
         qemu_cond_wait_iothread(first_cpu->halt_cond);
     }
diff --git a/replay/replay.c b/replay/replay.c
index e83c01285c..042a6a9636 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -347,6 +347,30 @@ void replay_start(void)
     replay_enable_events();
 }
 
+/*
+ * For none/record the answer is yes.
+ */
+bool replay_can_wait(void)
+{
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        /*
+         * For playback we shouldn't ever be at a point we wait. If
+         * the instruction count has reached zero and we have an
+         * unconsumed event we should go around again and consume it.
+         */
+        if (replay_state.instruction_count == 0 && replay_state.has_unread_data) {
+            return false;
+        } else {
+            fprintf(stderr, "Error: Invalid replay state\n");
+            fprintf(stderr,"instruction_count = %d, has = %d, event_kind = %d\n",
+                    replay_state.instruction_count, replay_state.has_unread_data, replay_state.data_kind);
+            abort();
+        }
+    }
+    return true;
+}
+
+
 void replay_finish(void)
 {
     if (replay_mode == REPLAY_MODE_NONE) {
-- 
2.39.2


Re: [PATCH 09/11] replay: stop us hanging in rr_wait_io_event
Posted by Pavel Dovgalyuk 11 months, 3 weeks ago
On 05.12.2023 23:41, Alex Bennée wrote:
> A lot of the hang I see are when we end up spinning in
> rr_wait_io_event for an event that will never come in playback. As a
> new check functions which can see if we are in PLAY mode and kick us
> us the wait function so the event can be processed.
> 
> This fixes most of the failures in replay_kernel.py

Is there an effect for console QEMU only?
I've tested this patch on Windows7 boot scenario and replay speed has 
not changed.

> 
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2013
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
> ---
>   include/sysemu/replay.h      |  5 +++++
>   accel/tcg/tcg-accel-ops-rr.c |  2 +-
>   replay/replay.c              | 24 ++++++++++++++++++++++++
>   3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
> index 08aae5869f..83995ae4bd 100644
> --- a/include/sysemu/replay.h
> +++ b/include/sysemu/replay.h
> @@ -70,6 +70,11 @@ int replay_get_instructions(void);
>   /*! Updates instructions counter in replay mode. */
>   void replay_account_executed_instructions(void);
>   
> +/**
> + * replay_can_wait: check if we should pause for wait-io
> + */
> +bool replay_can_wait(void);
> +
>   /* Processing clocks and other time sources */
>   
>   /*! Save the specified clock */
> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
> index 611932f3c3..825e35b3dc 100644
> --- a/accel/tcg/tcg-accel-ops-rr.c
> +++ b/accel/tcg/tcg-accel-ops-rr.c
> @@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
>   {
>       CPUState *cpu;
>   
> -    while (all_cpu_threads_idle()) {
> +    while (all_cpu_threads_idle() && replay_can_wait()) {
>           rr_stop_kick_timer();
>           qemu_cond_wait_iothread(first_cpu->halt_cond);
>       }
> diff --git a/replay/replay.c b/replay/replay.c
> index e83c01285c..042a6a9636 100644
> --- a/replay/replay.c
> +++ b/replay/replay.c
> @@ -347,6 +347,30 @@ void replay_start(void)
>       replay_enable_events();
>   }
>   
> +/*
> + * For none/record the answer is yes.
> + */
> +bool replay_can_wait(void)
> +{
> +    if (replay_mode == REPLAY_MODE_PLAY) {
> +        /*
> +         * For playback we shouldn't ever be at a point we wait. If
> +         * the instruction count has reached zero and we have an
> +         * unconsumed event we should go around again and consume it.
> +         */
> +        if (replay_state.instruction_count == 0 && replay_state.has_unread_data) {
> +            return false;
> +        } else {
> +            fprintf(stderr, "Error: Invalid replay state\n");
> +            fprintf(stderr,"instruction_count = %d, has = %d, event_kind = %d\n",
> +                    replay_state.instruction_count, replay_state.has_unread_data, replay_state.data_kind);
> +            abort();
> +        }
> +    }
> +    return true;
> +}
> +
> +
>   void replay_finish(void)
>   {
>       if (replay_mode == REPLAY_MODE_NONE) {


Re: [PATCH 09/11] replay: stop us hanging in rr_wait_io_event
Posted by Alex Bennée 11 months, 3 weeks ago
Pavel Dovgalyuk <pavel.dovgalyuk@ispras.ru> writes:

> On 05.12.2023 23:41, Alex Bennée wrote:
>> A lot of the hang I see are when we end up spinning in
>> rr_wait_io_event for an event that will never come in playback. As a
>> new check functions which can see if we are in PLAY mode and kick us
>> us the wait function so the event can be processed.
>> This fixes most of the failures in replay_kernel.py
>
> Is there an effect for console QEMU only?
> I've tested this patch on Windows7 boot scenario and replay speed has
> not changed.

It was a lock-up I was seeing (because once it was in this mode, what was
going to wake it up?). It could be that, outside of running in avocado,
some other random event woke things up and allowed stuff to progress.
However, I think this is a correctness issue: when would we ever wait for
io during playback?

>
>> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2013
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
>> ---
>>   include/sysemu/replay.h      |  5 +++++
>>   accel/tcg/tcg-accel-ops-rr.c |  2 +-
>>   replay/replay.c              | 24 ++++++++++++++++++++++++
>>   3 files changed, 30 insertions(+), 1 deletion(-)
>> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
>> index 08aae5869f..83995ae4bd 100644
>> --- a/include/sysemu/replay.h
>> +++ b/include/sysemu/replay.h
>> @@ -70,6 +70,11 @@ int replay_get_instructions(void);
>>   /*! Updates instructions counter in replay mode. */
>>   void replay_account_executed_instructions(void);
>>   +/**
>> + * replay_can_wait: check if we should pause for wait-io
>> + */
>> +bool replay_can_wait(void);
>> +
>>   /* Processing clocks and other time sources */
>>     /*! Save the specified clock */
>> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
>> index 611932f3c3..825e35b3dc 100644
>> --- a/accel/tcg/tcg-accel-ops-rr.c
>> +++ b/accel/tcg/tcg-accel-ops-rr.c
>> @@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
>>   {
>>       CPUState *cpu;
>>   -    while (all_cpu_threads_idle()) {
>> +    while (all_cpu_threads_idle() && replay_can_wait()) {
>>           rr_stop_kick_timer();
>>           qemu_cond_wait_iothread(first_cpu->halt_cond);
>>       }
>> diff --git a/replay/replay.c b/replay/replay.c
>> index e83c01285c..042a6a9636 100644
>> --- a/replay/replay.c
>> +++ b/replay/replay.c
>> @@ -347,6 +347,30 @@ void replay_start(void)
>>       replay_enable_events();
>>   }
>>   +/*
>> + * For none/record the answer is yes.
>> + */
>> +bool replay_can_wait(void)
>> +{
>> +    if (replay_mode == REPLAY_MODE_PLAY) {
>> +        /*
>> +         * For playback we shouldn't ever be at a point we wait. If
>> +         * the instruction count has reached zero and we have an
>> +         * unconsumed event we should go around again and consume it.
>> +         */
>> +        if (replay_state.instruction_count == 0 && replay_state.has_unread_data) {
>> +            return false;
>> +        } else {
>> +            fprintf(stderr, "Error: Invalid replay state\n");
>> +            fprintf(stderr,"instruction_count = %d, has = %d, event_kind = %d\n",
>> +                    replay_state.instruction_count, replay_state.has_unread_data, replay_state.data_kind);
>> +            abort();
>> +        }
>> +    }
>> +    return true;
>> +}
>> +
>> +
>>   void replay_finish(void)
>>   {
>>       if (replay_mode == REPLAY_MODE_NONE) {

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro
Re: [PATCH 09/11] replay: stop us hanging in rr_wait_io_event
Posted by Richard Henderson 11 months, 3 weeks ago
On 12/5/23 12:41, Alex Bennée wrote:
> A lot of the hang I see are when we end up spinning in
> rr_wait_io_event for an event that will never come in playback. As a
> new check functions which can see if we are in PLAY mode and kick us
> us the wait function so the event can be processed.
> 
> This fixes most of the failures in replay_kernel.py
> 
> Fixes: https://gitlab.com/qemu-project/qemu/-/issues/2013
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
> ---
>   include/sysemu/replay.h      |  5 +++++
>   accel/tcg/tcg-accel-ops-rr.c |  2 +-
>   replay/replay.c              | 24 ++++++++++++++++++++++++
>   3 files changed, 30 insertions(+), 1 deletion(-)
> 
> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
> index 08aae5869f..83995ae4bd 100644
> --- a/include/sysemu/replay.h
> +++ b/include/sysemu/replay.h
> @@ -70,6 +70,11 @@ int replay_get_instructions(void);
>   /*! Updates instructions counter in replay mode. */
>   void replay_account_executed_instructions(void);
>   
> +/**
> + * replay_can_wait: check if we should pause for wait-io
> + */
> +bool replay_can_wait(void);
> +
>   /* Processing clocks and other time sources */
>   
>   /*! Save the specified clock */
> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
> index 611932f3c3..825e35b3dc 100644
> --- a/accel/tcg/tcg-accel-ops-rr.c
> +++ b/accel/tcg/tcg-accel-ops-rr.c
> @@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
>   {
>       CPUState *cpu;
>   
> -    while (all_cpu_threads_idle()) {
> +    while (all_cpu_threads_idle() && replay_can_wait()) {
>           rr_stop_kick_timer();
>           qemu_cond_wait_iothread(first_cpu->halt_cond);
>       }
> diff --git a/replay/replay.c b/replay/replay.c
> index e83c01285c..042a6a9636 100644
> --- a/replay/replay.c
> +++ b/replay/replay.c
> @@ -347,6 +347,30 @@ void replay_start(void)
>       replay_enable_events();
>   }
>   
> +/*
> + * For none/record the answer is yes.
> + */
> +bool replay_can_wait(void)
> +{
> +    if (replay_mode == REPLAY_MODE_PLAY) {
> +        /*
> +         * For playback we shouldn't ever be at a point we wait. If
> +         * the instruction count has reached zero and we have an
> +         * unconsumed event we should go around again and consume it.
> +         */
> +        if (replay_state.instruction_count == 0 && replay_state.has_unread_data) {
> +            return false;
> +        } else {
> +            fprintf(stderr, "Error: Invalid replay state\n");
> +            fprintf(stderr,"instruction_count = %d, has = %d, event_kind = %d\n",
> +                    replay_state.instruction_count, replay_state.has_unread_data, replay_state.data_kind);
> +            abort();

error_report.


r~

> +        }
> +    }
> +    return true;
> +}
> +
> +
>   void replay_finish(void)
>   {
>       if (replay_mode == REPLAY_MODE_NONE) {