diff mbox series

[RFC] replay: stop us hanging in rr_wait_io_event

Message ID 20231205121420.3287637-1-alex.bennee@linaro.org
State Superseded
Headers show
Series [RFC] replay: stop us hanging in rr_wait_io_event | expand

Commit Message

Alex Bennée Dec. 5, 2023, 12:14 p.m. UTC
A lot of the hang I see are when we end up spinning in
rr_wait_io_event for an event that will never come in playback. As a
new check functions which can see if we are in PLAY mode and kick us
us the wait function so the event can be processed.

This fixes most of the failures in replay_kernel.py

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
---
 include/sysemu/replay.h      |  5 +++++
 accel/tcg/tcg-accel-ops-rr.c |  2 +-
 replay/replay.c              | 24 ++++++++++++++++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

Comments

Peter Maydell Dec. 5, 2023, 2:10 p.m. UTC | #1
On Tue, 5 Dec 2023 at 12:15, Alex Bennée <alex.bennee@linaro.org> wrote:
>
> A lot of the hang I see are when we end up spinning in
> rr_wait_io_event for an event that will never come in playback. As a
> new check functions which can see if we are in PLAY mode and kick us
> us the wait function so the event can be processed.
>
> This fixes most of the failures in replay_kernel.py
>
> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
> Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
> ---
>  include/sysemu/replay.h      |  5 +++++
>  accel/tcg/tcg-accel-ops-rr.c |  2 +-
>  replay/replay.c              | 24 ++++++++++++++++++++++++
>  3 files changed, 30 insertions(+), 1 deletion(-)
>
> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
> index 08aae5869f..83995ae4bd 100644
> --- a/include/sysemu/replay.h
> +++ b/include/sysemu/replay.h
> @@ -70,6 +70,11 @@ int replay_get_instructions(void);
>  /*! Updates instructions counter in replay mode. */
>  void replay_account_executed_instructions(void);
>
> +/**
> + * replay_can_wait: check if we should pause for wait-io
> + */
> +bool replay_can_wait(void);
> +
>  /* Processing clocks and other time sources */
>
>  /*! Save the specified clock */
> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
> index 611932f3c3..825e35b3dc 100644
> --- a/accel/tcg/tcg-accel-ops-rr.c
> +++ b/accel/tcg/tcg-accel-ops-rr.c
> @@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
>  {
>      CPUState *cpu;
>
> -    while (all_cpu_threads_idle()) {
> +    while (all_cpu_threads_idle() && replay_can_wait()) {
>          rr_stop_kick_timer();
>          qemu_cond_wait_iothread(first_cpu->halt_cond);
>      }
> diff --git a/replay/replay.c b/replay/replay.c
> index 0f7d766efe..e71cdbf819 100644
> --- a/replay/replay.c
> +++ b/replay/replay.c
> @@ -338,6 +338,30 @@ void replay_start(void)
>      replay_enable_events();
>  }
>
> +/*
> + * For none/record the answer is yes.
> + */
> +bool replay_can_wait(void)
> +{
> +    if (replay_mode == REPLAY_MODE_PLAY) {
> +        /*
> +         * For playback we shouldn't ever be at a point we wait. If

This comment sounds like "this can't happen, but it does for
some reason we don't understand" ... Is there some bug somewhere
that we're papering over here ?

thanks
-- PMM
Alex Bennée Dec. 5, 2023, 3:15 p.m. UTC | #2
Peter Maydell <peter.maydell@linaro.org> writes:

> On Tue, 5 Dec 2023 at 12:15, Alex Bennée <alex.bennee@linaro.org> wrote:
>>
>> A lot of the hang I see are when we end up spinning in
>> rr_wait_io_event for an event that will never come in playback. As a
>> new check functions which can see if we are in PLAY mode and kick us
>> us the wait function so the event can be processed.
>>
>> This fixes most of the failures in replay_kernel.py
>>
>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>
>> Cc: Pavel Dovgalyuk <pavel.dovgaluk@ispras.ru>
>> ---
>>  include/sysemu/replay.h      |  5 +++++
>>  accel/tcg/tcg-accel-ops-rr.c |  2 +-
>>  replay/replay.c              | 24 ++++++++++++++++++++++++
>>  3 files changed, 30 insertions(+), 1 deletion(-)
>>
>> diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
>> index 08aae5869f..83995ae4bd 100644
>> --- a/include/sysemu/replay.h
>> +++ b/include/sysemu/replay.h
>> @@ -70,6 +70,11 @@ int replay_get_instructions(void);
>>  /*! Updates instructions counter in replay mode. */
>>  void replay_account_executed_instructions(void);
>>
>> +/**
>> + * replay_can_wait: check if we should pause for wait-io
>> + */
>> +bool replay_can_wait(void);
>> +
>>  /* Processing clocks and other time sources */
>>
>>  /*! Save the specified clock */
>> diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
>> index 611932f3c3..825e35b3dc 100644
>> --- a/accel/tcg/tcg-accel-ops-rr.c
>> +++ b/accel/tcg/tcg-accel-ops-rr.c
>> @@ -109,7 +109,7 @@ static void rr_wait_io_event(void)
>>  {
>>      CPUState *cpu;
>>
>> -    while (all_cpu_threads_idle()) {
>> +    while (all_cpu_threads_idle() && replay_can_wait()) {
>>          rr_stop_kick_timer();
>>          qemu_cond_wait_iothread(first_cpu->halt_cond);
>>      }
>> diff --git a/replay/replay.c b/replay/replay.c
>> index 0f7d766efe..e71cdbf819 100644
>> --- a/replay/replay.c
>> +++ b/replay/replay.c
>> @@ -338,6 +338,30 @@ void replay_start(void)
>>      replay_enable_events();
>>  }
>>
>> +/*
>> + * For none/record the answer is yes.
>> + */
>> +bool replay_can_wait(void)
>> +{
>> +    if (replay_mode == REPLAY_MODE_PLAY) {
>> +        /*
>> +         * For playback we shouldn't ever be at a point we wait. If
>
> This comment sounds like "this can't happen, but it does for
> some reason we don't understand" ... Is there some bug somewhere
> that we're papering over here ?

I don't think so - it really is just a difference in behaviour needed
here between record and playback. If we short cut earlier then that
would make the code harder to follow and introduce another potential
point of divergence.
diff mbox series

Patch

diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
index 08aae5869f..83995ae4bd 100644
--- a/include/sysemu/replay.h
+++ b/include/sysemu/replay.h
@@ -70,6 +70,11 @@  int replay_get_instructions(void);
 /*! Updates instructions counter in replay mode. */
 void replay_account_executed_instructions(void);
 
+/**
+ * replay_can_wait: check if we should pause for wait-io
+ */
+bool replay_can_wait(void);
+
 /* Processing clocks and other time sources */
 
 /*! Save the specified clock */
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 611932f3c3..825e35b3dc 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -109,7 +109,7 @@  static void rr_wait_io_event(void)
 {
     CPUState *cpu;
 
-    while (all_cpu_threads_idle()) {
+    while (all_cpu_threads_idle() && replay_can_wait()) {
         rr_stop_kick_timer();
         qemu_cond_wait_iothread(first_cpu->halt_cond);
     }
diff --git a/replay/replay.c b/replay/replay.c
index 0f7d766efe..e71cdbf819 100644
--- a/replay/replay.c
+++ b/replay/replay.c
@@ -338,6 +338,30 @@  void replay_start(void)
     replay_enable_events();
 }
 
+/*
+ * For none/record the answer is yes.
+ */
+bool replay_can_wait(void)
+{
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        /*
+         * For playback we shouldn't ever be at a point we wait. If
+         * the instruction count has reached zero and we have an
+         * unconsumed event we should go around again and consume it.
+         */
+        if (replay_state.instruction_count == 0 && replay_state.has_unread_data) {
+            return false;
+        } else {
+            fprintf(stderr, "Error: Invalid replay state\n");
+            fprintf(stderr,"instruction_count = %d, has = %d, event_kind = %d\n",
+                    replay_state.instruction_count, replay_state.has_unread_data, replay_state.data_kind);
+            abort();
+        }
+    }
+    return true;
+}
+
+
 void replay_finish(void)
 {
     if (replay_mode == REPLAY_MODE_NONE) {