diff mbox series

[RFC,v1,7/9] cpus: move icount preparation out of tcg_exec_cpu

Message ID 20170403124524.10824-8-alex.bennee@linaro.org
State Superseded
Headers show
Series MTTCG and record/replay fixes for rc3 | expand

Commit Message

Alex Bennée April 3, 2017, 12:45 p.m. UTC
As icount is only supported for single-threaded execution due to the
requirement for determinism let's remove it from the common
tcg_exec_cpu path.

Also remove the additional fiddling which shouldn't be required as the
icount counters should all be rectified as you enter the loop.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

---
 cpus.c | 67 +++++++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 21 deletions(-)

-- 
2.11.0

Comments

Pavel Dovgalyuk April 4, 2017, 5:39 a.m. UTC | #1
I guess you are trying to fix the symptoms of the case
when the iothread is trying to access the instruction count.

Maybe the solution is providing access to current_cpu for the iothread
coupled with your patch 8?

Pavel Dovgalyuk


> -----Original Message-----

> From: Alex Bennée [mailto:alex.bennee@linaro.org]

> Sent: Monday, April 03, 2017 3:45 PM

> To: dovgaluk@ispras.ru; rth@twiddle.net; pbonzini@redhat.com

> Cc: peter.maydell@linaro.org; qemu-devel@nongnu.org; mttcg@listserver.greensocs.com;

> fred.konrad@greensocs.com; a.rigo@virtualopensystems.com; cota@braap.org;

> bobby.prani@gmail.com; nikunj@linux.vnet.ibm.com; Alex Bennée; Peter Crosthwaite

> Subject: [RFC PATCH v1 7/9] cpus: move icount preparation out of tcg_exec_cpu

> 

> As icount is only supported for single-threaded execution due to the

> requirement for determinism let's remove it from the common

> tcg_exec_cpu path.

> 

> Also remove the additional fiddling which shouldn't be required as the

> icount counters should all be rectified as you enter the loop.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> ---

>  cpus.c | 67 +++++++++++++++++++++++++++++++++++++++++++++---------------------

>  1 file changed, 46 insertions(+), 21 deletions(-)

> 

> diff --git a/cpus.c b/cpus.c

> index 18b1746770..87638a75d2 100644

> --- a/cpus.c

> +++ b/cpus.c

> @@ -1179,47 +1179,66 @@ static void handle_icount_deadline(void)

>      }

>  }

> 

> -static int tcg_cpu_exec(CPUState *cpu)

> +static void prepare_icount_for_run(CPUState *cpu)

>  {

> -    int ret;

> -#ifdef CONFIG_PROFILER

> -    int64_t ti;

> -#endif

> -

> -#ifdef CONFIG_PROFILER

> -    ti = profile_getclock();

> -#endif

>      if (use_icount) {

>          int64_t count;

>          int decr;

> -        timers_state.qemu_icount -= (cpu->icount_decr.u16.low

> -                                    + cpu->icount_extra);

> -        cpu->icount_decr.u16.low = 0;

> -        cpu->icount_extra = 0;

> +

> +        /* These should always be cleared by process_icount_data after

> +         * each vCPU execution. However u16.high can be raised

> +         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt

> +         */

> +        g_assert(cpu->icount_decr.u16.low == 0);

> +        g_assert(cpu->icount_extra == 0);

> +

> +

>          count = tcg_get_icount_limit();

> +

>          timers_state.qemu_icount += count;

>          decr = (count > 0xffff) ? 0xffff : count;

>          count -= decr;

>          cpu->icount_decr.u16.low = decr;

>          cpu->icount_extra = count;

>      }

> -    qemu_mutex_unlock_iothread();

> -    cpu_exec_start(cpu);

> -    ret = cpu_exec(cpu);

> -    cpu_exec_end(cpu);

> -    qemu_mutex_lock_iothread();

> -#ifdef CONFIG_PROFILER

> -    tcg_time += profile_getclock() - ti;

> -#endif

> +}

> +

> +static void process_icount_data(CPUState *cpu)

> +{

>      if (use_icount) {

>          /* Fold pending instructions back into the

>             instruction counter, and clear the interrupt flag.  */

>          timers_state.qemu_icount -= (cpu->icount_decr.u16.low

>                          + cpu->icount_extra);

> +

> +        /* We must be under BQL here as cpu_exit can tweak

> +           icount_decr.u32 */

> +        g_assert(qemu_mutex_iothread_locked());

>          cpu->icount_decr.u32 = 0;

>          cpu->icount_extra = 0;

>          replay_account_executed_instructions();

>      }

> +}

> +

> +

> +static int tcg_cpu_exec(CPUState *cpu)

> +{

> +    int ret;

> +#ifdef CONFIG_PROFILER

> +    int64_t ti;

> +#endif

> +

> +#ifdef CONFIG_PROFILER

> +    ti = profile_getclock();

> +#endif

> +    qemu_mutex_unlock_iothread();

> +    cpu_exec_start(cpu);

> +    ret = cpu_exec(cpu);

> +    cpu_exec_end(cpu);

> +    qemu_mutex_lock_iothread();

> +#ifdef CONFIG_PROFILER

> +    tcg_time += profile_getclock() - ti;

> +#endif

>      return ret;

>  }

> 

> @@ -1306,7 +1325,13 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)

> 

>              if (cpu_can_run(cpu)) {

>                  int r;

> +

> +                prepare_icount_for_run(cpu);

> +

>                  r = tcg_cpu_exec(cpu);

> +

> +                process_icount_data(cpu);

> +

>                  if (r == EXCP_DEBUG) {

>                      cpu_handle_guest_debug(cpu);

>                      break;

> --

> 2.11.0
Alex Bennée April 4, 2017, 8:56 a.m. UTC | #2
Pavel Dovgalyuk <dovgaluk@ispras.ru> writes:

> I guess you are trying to fix the sympthoms of the case

> when iothread is trying to access instruction count.


In theory the main-loop should be sequenced before or after vCPU events
because of the BQL. I'm not sure why this is not currently the case.

> Maybe the solution is providing access to current_cpu for the iothread

> coupled with your patch 8?


Providing cross-thread access to CPU structures brings its own
challenges.

But it does occur to me we should probably ensure
timers_state.qemu_icount has appropriate barriers. This should be ensured
by the BQL but if it is ever accessed by 2 threads without a BQL
transition in-between then it is potentially racy.

>

> Pavel Dovgalyuk

>

>

>> -----Original Message-----

>> From: Alex Bennée [mailto:alex.bennee@linaro.org]

>> Sent: Monday, April 03, 2017 3:45 PM

>> To: dovgaluk@ispras.ru; rth@twiddle.net; pbonzini@redhat.com

>> Cc: peter.maydell@linaro.org; qemu-devel@nongnu.org; mttcg@listserver.greensocs.com;

>> fred.konrad@greensocs.com; a.rigo@virtualopensystems.com; cota@braap.org;

>> bobby.prani@gmail.com; nikunj@linux.vnet.ibm.com; Alex Bennée; Peter Crosthwaite

>> Subject: [RFC PATCH v1 7/9] cpus: move icount preparation out of tcg_exec_cpu

>>

>> As icount is only supported for single-threaded execution due to the

>> requirement for determinism let's remove it from the common

>> tcg_exec_cpu path.

>>

>> Also remove the additional fiddling which shouldn't be required as the

>> icount counters should all be rectified as you enter the loop.

>>

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> ---

>>  cpus.c | 67 +++++++++++++++++++++++++++++++++++++++++++++---------------------

>>  1 file changed, 46 insertions(+), 21 deletions(-)

>>

>> diff --git a/cpus.c b/cpus.c

>> index 18b1746770..87638a75d2 100644

>> --- a/cpus.c

>> +++ b/cpus.c

>> @@ -1179,47 +1179,66 @@ static void handle_icount_deadline(void)

>>      }

>>  }

>>

>> -static int tcg_cpu_exec(CPUState *cpu)

>> +static void prepare_icount_for_run(CPUState *cpu)

>>  {

>> -    int ret;

>> -#ifdef CONFIG_PROFILER

>> -    int64_t ti;

>> -#endif

>> -

>> -#ifdef CONFIG_PROFILER

>> -    ti = profile_getclock();

>> -#endif

>>      if (use_icount) {

>>          int64_t count;

>>          int decr;

>> -        timers_state.qemu_icount -= (cpu->icount_decr.u16.low

>> -                                    + cpu->icount_extra);

>> -        cpu->icount_decr.u16.low = 0;

>> -        cpu->icount_extra = 0;

>> +

>> +        /* These should always be cleared by process_icount_data after

>> +         * each vCPU execution. However u16.high can be raised

>> +         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt

>> +         */

>> +        g_assert(cpu->icount_decr.u16.low == 0);

>> +        g_assert(cpu->icount_extra == 0);

>> +

>> +

>>          count = tcg_get_icount_limit();

>> +

>>          timers_state.qemu_icount += count;

>>          decr = (count > 0xffff) ? 0xffff : count;

>>          count -= decr;

>>          cpu->icount_decr.u16.low = decr;

>>          cpu->icount_extra = count;

>>      }

>> -    qemu_mutex_unlock_iothread();

>> -    cpu_exec_start(cpu);

>> -    ret = cpu_exec(cpu);

>> -    cpu_exec_end(cpu);

>> -    qemu_mutex_lock_iothread();

>> -#ifdef CONFIG_PROFILER

>> -    tcg_time += profile_getclock() - ti;

>> -#endif

>> +}

>> +

>> +static void process_icount_data(CPUState *cpu)

>> +{

>>      if (use_icount) {

>>          /* Fold pending instructions back into the

>>             instruction counter, and clear the interrupt flag.  */

>>          timers_state.qemu_icount -= (cpu->icount_decr.u16.low

>>                          + cpu->icount_extra);

>> +

>> +        /* We must be under BQL here as cpu_exit can tweak

>> +           icount_decr.u32 */

>> +        g_assert(qemu_mutex_iothread_locked());

>>          cpu->icount_decr.u32 = 0;

>>          cpu->icount_extra = 0;

>>          replay_account_executed_instructions();

>>      }

>> +}

>> +

>> +

>> +static int tcg_cpu_exec(CPUState *cpu)

>> +{

>> +    int ret;

>> +#ifdef CONFIG_PROFILER

>> +    int64_t ti;

>> +#endif

>> +

>> +#ifdef CONFIG_PROFILER

>> +    ti = profile_getclock();

>> +#endif

>> +    qemu_mutex_unlock_iothread();

>> +    cpu_exec_start(cpu);

>> +    ret = cpu_exec(cpu);

>> +    cpu_exec_end(cpu);

>> +    qemu_mutex_lock_iothread();

>> +#ifdef CONFIG_PROFILER

>> +    tcg_time += profile_getclock() - ti;

>> +#endif

>>      return ret;

>>  }

>>

>> @@ -1306,7 +1325,13 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)

>>

>>              if (cpu_can_run(cpu)) {

>>                  int r;

>> +

>> +                prepare_icount_for_run(cpu);

>> +

>>                  r = tcg_cpu_exec(cpu);

>> +

>> +                process_icount_data(cpu);

>> +

>>                  if (r == EXCP_DEBUG) {

>>                      cpu_handle_guest_debug(cpu);

>>                      break;

>> --

>> 2.11.0



--
Alex Bennée
Alex Bennée April 4, 2017, 10:46 a.m. UTC | #3
Alex Bennée <alex.bennee@linaro.org> writes:

> Pavel Dovgalyuk <dovgaluk@ispras.ru> writes:

>

>> I guess you are trying to fix the sympthoms of the case

>> when iothread is trying to access instruction count.

>

> In theory the main-loop should be sequenced before or after vCPU events

> because of the BQL. I'm not sure why this is not currently the case.


It seems cpu_handle_exception doesn't take the BQL until
replay_exception() has done its thing. This is fixable but the function
is a mess so I'm trying to neaten that up first.

>

>> Maybe the solution is providing access to current_cpu for the iothread

>> coupled with your patch 8?

>

> Providing cross-thread access to CPU structures brings its own

> challenges.

>

> But it does occur to me we should probably ensure

> timer_state.qemu_icount has appropriate barriers. This should be ensured

> by the BQL but if it is ever accessed by 2 threads without a BQL

> transition in-between then it is potentially racey.

>

>>

>> Pavel Dovgalyuk

>>

>>

>>> -----Original Message-----

>>> From: Alex Bennée [mailto:alex.bennee@linaro.org]

>>> Sent: Monday, April 03, 2017 3:45 PM

>>> To: dovgaluk@ispras.ru; rth@twiddle.net; pbonzini@redhat.com

>>> Cc: peter.maydell@linaro.org; qemu-devel@nongnu.org; mttcg@listserver.greensocs.com;

>>> fred.konrad@greensocs.com; a.rigo@virtualopensystems.com; cota@braap.org;

>>> bobby.prani@gmail.com; nikunj@linux.vnet.ibm.com; Alex Bennée; Peter Crosthwaite

>>> Subject: [RFC PATCH v1 7/9] cpus: move icount preparation out of tcg_exec_cpu

>>>

>>> As icount is only supported for single-threaded execution due to the

>>> requirement for determinism let's remove it from the common

>>> tcg_exec_cpu path.

>>>

>>> Also remove the additional fiddling which shouldn't be required as the

>>> icount counters should all be rectified as you enter the loop.

>>>

>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>> ---

>>>  cpus.c | 67 +++++++++++++++++++++++++++++++++++++++++++++---------------------

>>>  1 file changed, 46 insertions(+), 21 deletions(-)

>>>

>>> diff --git a/cpus.c b/cpus.c

>>> index 18b1746770..87638a75d2 100644

>>> --- a/cpus.c

>>> +++ b/cpus.c

>>> @@ -1179,47 +1179,66 @@ static void handle_icount_deadline(void)

>>>      }

>>>  }

>>>

>>> -static int tcg_cpu_exec(CPUState *cpu)

>>> +static void prepare_icount_for_run(CPUState *cpu)

>>>  {

>>> -    int ret;

>>> -#ifdef CONFIG_PROFILER

>>> -    int64_t ti;

>>> -#endif

>>> -

>>> -#ifdef CONFIG_PROFILER

>>> -    ti = profile_getclock();

>>> -#endif

>>>      if (use_icount) {

>>>          int64_t count;

>>>          int decr;

>>> -        timers_state.qemu_icount -= (cpu->icount_decr.u16.low

>>> -                                    + cpu->icount_extra);

>>> -        cpu->icount_decr.u16.low = 0;

>>> -        cpu->icount_extra = 0;

>>> +

>>> +        /* These should always be cleared by process_icount_data after

>>> +         * each vCPU execution. However u16.high can be raised

>>> +         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt

>>> +         */

>>> +        g_assert(cpu->icount_decr.u16.low == 0);

>>> +        g_assert(cpu->icount_extra == 0);

>>> +

>>> +

>>>          count = tcg_get_icount_limit();

>>> +

>>>          timers_state.qemu_icount += count;

>>>          decr = (count > 0xffff) ? 0xffff : count;

>>>          count -= decr;

>>>          cpu->icount_decr.u16.low = decr;

>>>          cpu->icount_extra = count;

>>>      }

>>> -    qemu_mutex_unlock_iothread();

>>> -    cpu_exec_start(cpu);

>>> -    ret = cpu_exec(cpu);

>>> -    cpu_exec_end(cpu);

>>> -    qemu_mutex_lock_iothread();

>>> -#ifdef CONFIG_PROFILER

>>> -    tcg_time += profile_getclock() - ti;

>>> -#endif

>>> +}

>>> +

>>> +static void process_icount_data(CPUState *cpu)

>>> +{

>>>      if (use_icount) {

>>>          /* Fold pending instructions back into the

>>>             instruction counter, and clear the interrupt flag.  */

>>>          timers_state.qemu_icount -= (cpu->icount_decr.u16.low

>>>                          + cpu->icount_extra);

>>> +

>>> +        /* We must be under BQL here as cpu_exit can tweak

>>> +           icount_decr.u32 */

>>> +        g_assert(qemu_mutex_iothread_locked());

>>>          cpu->icount_decr.u32 = 0;

>>>          cpu->icount_extra = 0;

>>>          replay_account_executed_instructions();

>>>      }

>>> +}

>>> +

>>> +

>>> +static int tcg_cpu_exec(CPUState *cpu)

>>> +{

>>> +    int ret;

>>> +#ifdef CONFIG_PROFILER

>>> +    int64_t ti;

>>> +#endif

>>> +

>>> +#ifdef CONFIG_PROFILER

>>> +    ti = profile_getclock();

>>> +#endif

>>> +    qemu_mutex_unlock_iothread();

>>> +    cpu_exec_start(cpu);

>>> +    ret = cpu_exec(cpu);

>>> +    cpu_exec_end(cpu);

>>> +    qemu_mutex_lock_iothread();

>>> +#ifdef CONFIG_PROFILER

>>> +    tcg_time += profile_getclock() - ti;

>>> +#endif

>>>      return ret;

>>>  }

>>>

>>> @@ -1306,7 +1325,13 @@ static void *qemu_tcg_rr_cpu_thread_fn(void *arg)

>>>

>>>              if (cpu_can_run(cpu)) {

>>>                  int r;

>>> +

>>> +                prepare_icount_for_run(cpu);

>>> +

>>>                  r = tcg_cpu_exec(cpu);

>>> +

>>> +                process_icount_data(cpu);

>>> +

>>>                  if (r == EXCP_DEBUG) {

>>>                      cpu_handle_guest_debug(cpu);

>>>                      break;

>>> --

>>> 2.11.0



--
Alex Bennée
Paolo Bonzini April 4, 2017, 10:53 a.m. UTC | #4
On 04/04/2017 12:46, Alex Bennée wrote:
>> In theory the main-loop should be sequenced before or after vCPU events

>> because of the BQL. I'm not sure why this is not currently the case.

> 

> It seems cpu_handle_exception doesn't take the BQL until

> replay_exception() has done its thing. This is fixable but the function

> is a mess so I'm trying to neaten that up first.


Long term neither cpu_handle_exception nor cpu_handle_interrupt need the
BQL at all.

Paolo
Alex Bennée April 4, 2017, 12:31 p.m. UTC | #5
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 04/04/2017 12:46, Alex Bennée wrote:

>>> In theory the main-loop should be sequenced before or after vCPU events

>>> because of the BQL. I'm not sure why this is not currently the case.

>>

>> It seems cpu_handle_exception doesn't take the BQL until

>> replay_exception() has done its thing. This is fixable but the function

>> is a mess so I'm trying to neaten that up first.

>

> Long term neither cpu_handle_exception nor cpu_handle_interrupt need the

> BQL at all.


Well for record/replay they might. Otherwise we end up moving the record
stream on even though a checkpoint might be being written by the
main-loop.

As far as the cc->do_interrupt() stuff is concerned it will be guest
dependent because you could end up in device emulation code down this
path which must be protected by the BQL - the arm_gic code being a good
example.

>

> Paolo



--
Alex Bennée
Paolo Bonzini April 4, 2017, 12:37 p.m. UTC | #6
On 04/04/2017 14:31, Alex Bennée wrote:
> 

> Paolo Bonzini <pbonzini@redhat.com> writes:

> 

>> On 04/04/2017 12:46, Alex Bennée wrote:

>>>> In theory the main-loop should be sequenced before or after vCPU events

>>>> because of the BQL. I'm not sure why this is not currently the case.

>>>

>>> It seems cpu_handle_exception doesn't take the BQL until

>>> replay_exception() has done its thing. This is fixable but the function

>>> is a mess so I'm trying to neaten that up first.

>>

>> Long term neither cpu_handle_exception nor cpu_handle_interrupt need the

>> BQL at all.

> 

> Well for record/replay they might. Otherwise we end up moving the record

> stream on even though a checkpoint might be being written by the

> main-loop.

> 

> As far as the cc->do_interrupt() stuff is concerned it will be guest

> dependant because you could end up in device emulation code down this

> path which must be protected by the BQL - the arm_gic code being a good

> example.


I think recording an event could be split in two parts:

- recording the (icount, event) tuple and getting back a unique event id

- waiting for all events with lower event id to be complete before
starting to process this one

This doesn't require the BQL, you can use a condition variable on
replay_lock (but you do need to unlock/lock the BQL around it if
currently taken).

The complicated part is ensuring that there are no deadlocks where the
I/O thread needs the VCPU thread to proceed, but the VCPU thread is
waiting on the I/O thread's event processing.

Paolo
Alex Bennée April 4, 2017, 1:29 p.m. UTC | #7
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 04/04/2017 14:31, Alex Bennée wrote:

>>

>> Paolo Bonzini <pbonzini@redhat.com> writes:

>>

>>> On 04/04/2017 12:46, Alex Bennée wrote:

>>>>> In theory the main-loop should be sequenced before or after vCPU events

>>>>> because of the BQL. I'm not sure why this is not currently the case.

>>>>

>>>> It seems cpu_handle_exception doesn't take the BQL until

>>>> replay_exception() has done its thing. This is fixable but the function

>>>> is a mess so I'm trying to neaten that up first.

>>>

>>> Long term neither cpu_handle_exception nor cpu_handle_interrupt need the

>>> BQL at all.

>>

>> Well for record/replay they might. Otherwise we end up moving the record

>> stream on even though a checkpoint might be being written by the

>> main-loop.

>>

>> As far as the cc->do_interrupt() stuff is concerned it will be guest

>> dependant because you could end up in device emulation code down this

>> path which must be protected by the BQL - the arm_gic code being a good

>> example.

>

> I think recording an event could be split in two parts:

>

> - recording the (icount, event) tuple and getting back a unique event id

>

> - waiting for all events with lower event id to be complete before

> starting to process this one

>

> This doesn't require the BQL, you can use a condition variable on

> replay_lock (but you do need to unlock/lock the BQL around it if

> currently taken).


Would you then leave the recording to the stream to the main-loop
thread? I guess it would marshal all events that occurred before the
checkpoint first and then finish draining the queue after recording its
checkpoint?

Wrapping the exception stuff in the BQL does improve the repeat-ability
but of course it breaks if I take away the graceful handling of time
differences because there is a race between recording the exception
event (with current_step+insns so far) and getting back to the main loop
where insns is finally credited to timers_state.qemu_icount.

I guess we could improve the situation by updating
timers_state.qemu_icount (under BQL) as we record events. I don't know
how clunky that would get.

> The complicated part is ensuring that there are no deadlocks where the

> I/O thread needs the VCPU thread to proceed, but the VCPU thread is

> waiting on the I/O thread's event processing.


This sort of update sounds more like 2.10 material though.

--
Alex Bennée
Pavel Dovgalyuk April 5, 2017, 10:44 a.m. UTC | #8
> From: Alex Bennée [mailto:alex.bennee@linaro.org]

> Paolo Bonzini <pbonzini@redhat.com> writes:

> 

> > On 04/04/2017 14:31, Alex Bennée wrote:

> >>

> >> Paolo Bonzini <pbonzini@redhat.com> writes:

> >>

> >>> On 04/04/2017 12:46, Alex Bennée wrote:

> >>>>> In theory the main-loop should be sequenced before or after vCPU events

> >>>>> because of the BQL. I'm not sure why this is not currently the case.

> >>>>

> >>>> It seems cpu_handle_exception doesn't take the BQL until

> >>>> replay_exception() has done its thing. This is fixable but the function

> >>>> is a mess so I'm trying to neaten that up first.

> >>>

> >>> Long term neither cpu_handle_exception nor cpu_handle_interrupt need the

> >>> BQL at all.

> >>

> >> Well for record/replay they might. Otherwise we end up moving the record

> >> stream on even though a checkpoint might be being written by the

> >> main-loop.

> >>

> >> As far as the cc->do_interrupt() stuff is concerned it will be guest

> >> dependant because you could end up in device emulation code down this

> >> path which must be protected by the BQL - the arm_gic code being a good

> >> example.

> >

> > I think recording an event could be split in two parts:

> >

> > - recording the (icount, event) tuple and getting back a unique event id

> >

> > - waiting for all events with lower event id to be complete before

> > starting to process this one

> >

> > This doesn't require the BQL, you can use a condition variable on

> > replay_lock (but you do need to unlock/lock the BQL around it if

> > currently taken).

> 

> Would you then leave the recording to the stream to the main-loop

> thread? I guess it would marshal all events that occurred before the

> checkpoint first and then finish draining the queue after recording its

> checkpoint?

> 

> Wrapping the exception stuff in the BQL does improve the repeat-ability

> but of course it breaks if I take away the graceful handling of time

> differences because there is a race between recording the exception

> event (with current_step+insns so far) and getting back to the main loop

> where insns is finally credited to timers_state.qemu_icount.

> 

> I guess we could improve the situation by updating

> timers_state.qemu_icount (under BQL) as we record events. I don't know

> how clunky that would get.


Do I/O instructions take some lock to prevent races in virtual hardware?

vCPU thread updates icount in the beginning of the TB execution.
It means that checkpoints in the replay log will appear only at the boundaries
of TBs. However, the same log may be generated by different scenarios.
Consider the following cases:

1. Sequence: vCPU-block-begin vCPU-update-icount iothread-io vCPU-io vCPU-block-end
2. Sequence: vCPU-block-begin vCPU-update-icount vCPU-io iothread-io vCPU-block-end

These sequences will generate the same order of replay events, but different
states of virtual hardware.

Therefore we need some lock for the time while vCPU executes translation block
(or the whole sequence of blocks as in old times).

> > The complicated part is ensuring that there are no deadlocks where the

> > I/O thread needs the VCPU thread to proceed, but the VCPU thread is

> > waiting on the I/O thread's event processing.

> 

> This sort of update sounds more like 2.10 material though.


Pavel Dovgalyuk
Alex Bennée April 5, 2017, 11:18 a.m. UTC | #9
Pavel Dovgalyuk <dovgaluk@ispras.ru> writes:

>> From: Alex Bennée [mailto:alex.bennee@linaro.org]

>> Paolo Bonzini <pbonzini@redhat.com> writes:

>>

>> > On 04/04/2017 14:31, Alex Bennée wrote:

>> >>

>> >> Paolo Bonzini <pbonzini@redhat.com> writes:

>> >>

>> >>> On 04/04/2017 12:46, Alex Bennée wrote:

>> >>>>> In theory the main-loop should be sequenced before or after vCPU events

>> >>>>> because of the BQL. I'm not sure why this is not currently the case.

>> >>>>

>> >>>> It seems cpu_handle_exception doesn't take the BQL until

>> >>>> replay_exception() has done its thing. This is fixable but the function

>> >>>> is a mess so I'm trying to neaten that up first.

>> >>>

>> >>> Long term neither cpu_handle_exception nor cpu_handle_interrupt need the

>> >>> BQL at all.

>> >>

>> >> Well for record/replay they might. Otherwise we end up moving the record

>> >> stream on even though a checkpoint might be being written by the

>> >> main-loop.

>> >>

>> >> As far as the cc->do_interrupt() stuff is concerned it will be guest

>> >> dependant because you could end up in device emulation code down this

>> >> path which must be protected by the BQL - the arm_gic code being a good

>> >> example.

>> >

>> > I think recording an event could be split in two parts:

>> >

>> > - recording the (icount, event) tuple and getting back a unique event id

>> >

>> > - waiting for all events with lower event id to be complete before

>> > starting to process this one

>> >

>> > This doesn't require the BQL, you can use a condition variable on

>> > replay_lock (but you do need to unlock/lock the BQL around it if

>> > currently taken).

>>

>> Would you then leave the recording to the stream to the main-loop

>> thread? I guess it would marshal all events that occurred before the

>> checkpoint first and then finish draining the queue after recording its

>> checkpoint?

>>

>> Wrapping the exception stuff in the BQL does improve the repeat-ability

>> but of course it breaks if I take away the graceful handling of time

>> differences because there is a race between recording the exception

>> event (with current_step+insns so far) and getting back to the main loop

>> where insns is finally credited to timers_state.qemu_icount.

>>

>> I guess we could improve the situation by updating

>> timers_state.qemu_icount (under BQL) as we record events. I don't know

>> how clunky that would get.

>

> Does io instructions make some lock to prevent races in virtual

> hardware?


Yes the BQL is taken for MMIO operations unless the hardware explicitly
does its own locking. Other operations that trigger hardware emulation
(for example changing ARM_CP_IO registers) should also take the BQL.

> vCPU thread updates icount in the beginning of the TB execution.

> It means that checkpoints in the replay log will appear only at the boundaries

> of TBs.


Not quite. The vCPU thread takes into account "in flight" instructions
when calculating the icount. In the next series I'm about to post I've
ensured this reflects an update to the main-loop icount when we read the
value.

> However, the same log may be generated by different scenarios.

> Consider the following cases:

>

> 1. Sequence: vCPU-block-begin vCPU-update-icount iothread-io vCPU-io vCPU-block-end

> 2. Sequence: vCPU-block-begin vCPU-update-icount vCPU-io iothread-io vCPU-block-end

>

> These sequences will generate the same order of replay events, but different

> states of virtual hardware.

>

> Therefore we need some lock for the time while vCPU executes translation block

> (or the whole sequence of blocks as in old times).


Well this can't be the BQL anymore. Using it for serialisation of
multiple threads conflicts with the general aim of reducing BQL
contention across the code-base.

Perhaps we should just push replay_lock up the call-stack to
prepare/process_icount_data take and release the lock and we do the same
in the checkpoint code?

>

>> > The complicated part is ensuring that there are no deadlocks where the

>> > I/O thread needs the VCPU thread to proceed, but the VCPU thread is

>> > waiting on the I/O thread's event processing.

>>

>> This sort of update sounds more like 2.10 material though.

>

> Pavel Dovgalyuk



--
Alex Bennée
diff mbox series

Patch

diff --git a/cpus.c b/cpus.c
index 18b1746770..87638a75d2 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1179,47 +1179,66 @@  static void handle_icount_deadline(void)
     }
 }
 
-static int tcg_cpu_exec(CPUState *cpu)
+static void prepare_icount_for_run(CPUState *cpu)
 {
-    int ret;
-#ifdef CONFIG_PROFILER
-    int64_t ti;
-#endif
-
-#ifdef CONFIG_PROFILER
-    ti = profile_getclock();
-#endif
     if (use_icount) {
         int64_t count;
         int decr;
-        timers_state.qemu_icount -= (cpu->icount_decr.u16.low
-                                    + cpu->icount_extra);
-        cpu->icount_decr.u16.low = 0;
-        cpu->icount_extra = 0;
+
+        /* These should always be cleared by process_icount_data after
+         * each vCPU execution. However u16.high can be raised
+         * asynchronously by cpu_exit/cpu_interrupt/tcg_handle_interrupt
+         */
+        g_assert(cpu->icount_decr.u16.low == 0);
+        g_assert(cpu->icount_extra == 0);
+
+
         count = tcg_get_icount_limit();
+
         timers_state.qemu_icount += count;
         decr = (count > 0xffff) ? 0xffff : count;
         count -= decr;
         cpu->icount_decr.u16.low = decr;
         cpu->icount_extra = count;
     }
-    qemu_mutex_unlock_iothread();
-    cpu_exec_start(cpu);
-    ret = cpu_exec(cpu);
-    cpu_exec_end(cpu);
-    qemu_mutex_lock_iothread();
-#ifdef CONFIG_PROFILER
-    tcg_time += profile_getclock() - ti;
-#endif
+}
+
+static void process_icount_data(CPUState *cpu)
+{
     if (use_icount) {
         /* Fold pending instructions back into the
            instruction counter, and clear the interrupt flag.  */
         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                         + cpu->icount_extra);
+
+        /* We must be under BQL here as cpu_exit can tweak
+           icount_decr.u32 */
+        g_assert(qemu_mutex_iothread_locked());
         cpu->icount_decr.u32 = 0;
         cpu->icount_extra = 0;
         replay_account_executed_instructions();
     }
+}
+
+
+static int tcg_cpu_exec(CPUState *cpu)
+{
+    int ret;
+#ifdef CONFIG_PROFILER
+    int64_t ti;
+#endif
+
+#ifdef CONFIG_PROFILER
+    ti = profile_getclock();
+#endif
+    qemu_mutex_unlock_iothread();
+    cpu_exec_start(cpu);
+    ret = cpu_exec(cpu);
+    cpu_exec_end(cpu);
+    qemu_mutex_lock_iothread();
+#ifdef CONFIG_PROFILER
+    tcg_time += profile_getclock() - ti;
+#endif
     return ret;
 }
 
@@ -1306,7 +1325,13 @@  static void *qemu_tcg_rr_cpu_thread_fn(void *arg)
 
             if (cpu_can_run(cpu)) {
                 int r;
+
+                prepare_icount_for_run(cpu);
+
                 r = tcg_cpu_exec(cpu);
+
+                process_icount_data(cpu);
+
                 if (r == EXCP_DEBUG) {
                     cpu_handle_guest_debug(cpu);
                     break;