diff mbox series

[PULL,22/24] target-arm: ensure all cross vCPUs TLB flushes complete

Message ID 20170224112109.3147-23-alex.bennee@linaro.org
State Accepted
Commit a67cf2772733e0ff40ed14cfed9e177b050c22a7
Headers show
Series MTTCG Base enabling patches with ARM enablement | expand

Commit Message

Alex Bennée Feb. 24, 2017, 11:21 a.m. UTC
Previously flushes on other vCPUs would only get serviced when they
exited their TranslationBlocks. While this isn't overly problematic it
violates the semantics of TLB flush from the point of view of source
vCPU.

To solve this we call the cputlb *_all_cpus_synced() functions to do
the flushes which ensures all flushes are completed by the time the
vCPU next schedules its own work. As the TLB instructions are modelled
as CP writes the TB ends at this point meaning cpu->exit_request will
be checked before the next instruction is executed.

Deferring the work until the architectural sync point is a possible
future optimisation.

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

Reviewed-by: Richard Henderson <rth@twiddle.net>

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

---
 target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------
 1 file changed, 69 insertions(+), 96 deletions(-)

-- 
2.11.0

Comments

Dmitry Osipenko Sept. 17, 2017, 1:07 p.m. UTC | #1
On 24.02.2017 14:21, Alex Bennée wrote:
> Previously flushes on other vCPUs would only get serviced when they

> exited their TranslationBlocks. While this isn't overly problematic it

> violates the semantics of TLB flush from the point of view of source

> vCPU.

> 

> To solve this we call the cputlb *_all_cpus_synced() functions to do

> the flushes which ensures all flushes are completed by the time the

> vCPU next schedules its own work. As the TLB instructions are modelled

> as CP writes the TB ends at this point meaning cpu->exit_request will

> be checked before the next instruction is executed.

> 

> Deferring the work until the architectural sync point is a possible

> future optimisation.

> 

> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

> Reviewed-by: Richard Henderson <rth@twiddle.net>

> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

> ---

>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>  1 file changed, 69 insertions(+), 96 deletions(-)

> 


Hello,

I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't
checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it
should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any
32bit ARM machine type. Kernel boots fine with a MTTCG accel, only
single-threaded TCG is affected. Git bisection lead to this patch, any ideas?

Example:

qemu-system-arm -M vexpress-a9 -smp cpus=2 -accel accel=tcg,thread=single
-kernel arch/arm/boot/zImage -dtb arch/arm/boot/dts/vexpress-v2p-ca9.dtb -serial
stdio -net nic,model=lan9118 -net user -d in_asm,out_asm -D /tmp/qemulog

Last TB from the log:
----------------
IN:
0xc011a450:  ee080f73      mcr	15, 0, r0, cr8, cr3, {3}

OUT: [size=68]
0x7f32d8b93f80:  mov    -0x18(%r14),%ebp
0x7f32d8b93f84:  test   %ebp,%ebp
0x7f32d8b93f86:  jne    0x7f32d8b93fb8
0x7f32d8b93f8c:  mov    %r14,%rdi
0x7f32d8b93f8f:  mov    $0x5620f2aea5d0,%rsi
0x7f32d8b93f99:  mov    (%r14),%edx
0x7f32d8b93f9c:  mov    $0x5620f18107ca,%r10
0x7f32d8b93fa6:  callq  *%r10
0x7f32d8b93fa9:  movl   $0xc011a454,0x3c(%r14)
0x7f32d8b93fb1:  xor    %eax,%eax
0x7f32d8b93fb3:  jmpq   0x7f32d7a4e016
0x7f32d8b93fb8:  lea    -0x14aa07c(%rip),%rax        # 0x7f32d76e9f43
0x7f32d8b93fbf:  jmpq   0x7f32d7a4e016


> diff --git a/target/arm/helper.c b/target/arm/helper.c

> index b41d0494d1..bcedb4a808 100644

> --- a/target/arm/helper.c

> +++ b/target/arm/helper.c

> @@ -536,41 +536,33 @@ static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                               uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush(other_cs);

> -    }

> +    tlb_flush_all_cpus_synced(cs);

>  }

>  

>  static void tlbiasid_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                               uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush(other_cs);

> -    }

> +    tlb_flush_all_cpus_synced(cs);

>  }

>  

>  static void tlbimva_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                               uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);

> -    }

> +    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);

>  }

>  

>  static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                               uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);

> -    }

> +    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);

>  }

>  

>  static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -587,14 +579,12 @@ static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbiall_nsnh_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                    uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_by_mmuidx(other_cs,

> -                            (1 << ARMMMUIdx_S12NSE1) |

> -                            (1 << ARMMMUIdx_S12NSE0) |

> -                            (1 << ARMMMUIdx_S2NS));

> -    }

> +    tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                        (1 << ARMMMUIdx_S12NSE1) |

> +                                        (1 << ARMMMUIdx_S12NSE0) |

> +                                        (1 << ARMMMUIdx_S2NS));

>  }

>  

>  static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -621,7 +611,7 @@ static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                 uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>      uint64_t pageaddr;

>  

>      if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {

> @@ -630,9 +620,8 @@ static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  

>      pageaddr = sextract64(value << 12, 0, 40);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));

> -    }

> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                             (1 << ARMMMUIdx_S2NS));

>  }

>  

>  static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -646,11 +635,9 @@ static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                   uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));

> -    }

> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));

>  }

>  

>  static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -665,12 +652,11 @@ static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                   uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>      uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));

> -    }

> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                             (1 << ARMMMUIdx_S1E2));

>  }

>  

>  static const ARMCPRegInfo cp_reginfo[] = {

> @@ -2904,8 +2890,7 @@ static CPAccessResult aa64_cacheop_access(CPUARMState *env,

>  static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                      uint64_t value)

>  {

> -    ARMCPU *cpu = arm_env_get_cpu(env);

> -    CPUState *cs = CPU(cpu);

> +    CPUState *cs = ENV_GET_CPU(env);

>  

>      if (arm_is_secure_below_el3(env)) {

>          tlb_flush_by_mmuidx(cs,

> @@ -2921,19 +2906,17 @@ static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                        uint64_t value)

>  {

> +    CPUState *cs = ENV_GET_CPU(env);

>      bool sec = arm_is_secure_below_el3(env);

> -    CPUState *other_cs;

>  

> -    CPU_FOREACH(other_cs) {

> -        if (sec) {

> -            tlb_flush_by_mmuidx(other_cs,

> -                                (1 << ARMMMUIdx_S1SE1) |

> -                                (1 << ARMMMUIdx_S1SE0));

> -        } else {

> -            tlb_flush_by_mmuidx(other_cs,

> -                                (1 << ARMMMUIdx_S12NSE1) |

> -                                (1 << ARMMMUIdx_S12NSE0));

> -        }

> +    if (sec) {

> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                            (1 << ARMMMUIdx_S1SE1) |

> +                                            (1 << ARMMMUIdx_S1SE0));

> +    } else {

> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                            (1 << ARMMMUIdx_S12NSE1) |

> +                                            (1 << ARMMMUIdx_S12NSE0));

>      }

>  }

>  

> @@ -2990,46 +2973,40 @@ static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>       * stage 2 translations, whereas most other scopes only invalidate

>       * stage 1 translations.

>       */

> +    CPUState *cs = ENV_GET_CPU(env);

>      bool sec = arm_is_secure_below_el3(env);

>      bool has_el2 = arm_feature(env, ARM_FEATURE_EL2);

> -    CPUState *other_cs;

> -

> -    CPU_FOREACH(other_cs) {

> -        if (sec) {

> -            tlb_flush_by_mmuidx(other_cs,

> -                                (1 << ARMMMUIdx_S1SE1) |

> -                                (1 << ARMMMUIdx_S1SE0));

> -        } else if (has_el2) {

> -            tlb_flush_by_mmuidx(other_cs,

> -                                (1 << ARMMMUIdx_S12NSE1) |

> -                                (1 << ARMMMUIdx_S12NSE0) |

> -                                (1 << ARMMMUIdx_S2NS));

> -        } else {

> -            tlb_flush_by_mmuidx(other_cs,

> -                                (1 << ARMMMUIdx_S12NSE1) |

> -                                (1 << ARMMMUIdx_S12NSE0));

> -        }

> +

> +    if (sec) {

> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                            (1 << ARMMMUIdx_S1SE1) |

> +                                            (1 << ARMMMUIdx_S1SE0));

> +    } else if (has_el2) {

> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                            (1 << ARMMMUIdx_S12NSE1) |

> +                                            (1 << ARMMMUIdx_S12NSE0) |

> +                                            (1 << ARMMMUIdx_S2NS));

> +    } else {

> +          tlb_flush_by_mmuidx_all_cpus_synced(cs,

> +                                              (1 << ARMMMUIdx_S12NSE1) |

> +                                              (1 << ARMMMUIdx_S12NSE0));

>      }

>  }

>  

>  static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                      uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));

> -    }

> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));

>  }

>  

>  static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                      uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E3));

> -    }

> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E3));

>  }

>  

>  static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -3086,43 +3063,40 @@ static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                     uint64_t value)

>  {

> +    ARMCPU *cpu = arm_env_get_cpu(env);

> +    CPUState *cs = CPU(cpu);

>      bool sec = arm_is_secure_below_el3(env);

> -    CPUState *other_cs;

>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>  

> -    CPU_FOREACH(other_cs) {

> -        if (sec) {

> -            tlb_flush_page_by_mmuidx(other_cs, pageaddr,

> -                                     (1 << ARMMMUIdx_S1SE1) |

> -                                     (1 << ARMMMUIdx_S1SE0));

> -        } else {

> -            tlb_flush_page_by_mmuidx(other_cs, pageaddr,

> -                                     (1 << ARMMMUIdx_S12NSE1) |

> -                                     (1 << ARMMMUIdx_S12NSE0));

> -        }

> +    if (sec) {

> +        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                                 (1 << ARMMMUIdx_S1SE1) |

> +                                                 (1 << ARMMMUIdx_S1SE0));

> +    } else {

> +        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                                 (1 << ARMMMUIdx_S12NSE1) |

> +                                                 (1 << ARMMMUIdx_S12NSE0));

>      }

>  }

>  

>  static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                     uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));

> -    }

> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                             (1 << ARMMMUIdx_S1E2));

>  }

>  

>  static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                     uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E3));

> -    }

> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                             (1 << ARMMMUIdx_S1E3));

>  }

>  

>  static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,

> @@ -3150,7 +3124,7 @@ static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>                                        uint64_t value)

>  {

> -    CPUState *other_cs;

> +    CPUState *cs = ENV_GET_CPU(env);

>      uint64_t pageaddr;

>  

>      if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {

> @@ -3159,9 +3133,8 @@ static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>  

>      pageaddr = sextract64(value << 12, 0, 48);

>  

> -    CPU_FOREACH(other_cs) {

> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));

> -    }

> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

> +                                             (1 << ARMMMUIdx_S2NS));

>  }

>  

>  static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri,

> 



-- 
Dmitry
Alex Bennée Sept. 17, 2017, 1:22 p.m. UTC | #2
Dmitry Osipenko <digetx@gmail.com> writes:

> On 24.02.2017 14:21, Alex Bennée wrote:

>> Previously flushes on other vCPUs would only get serviced when they

>> exited their TranslationBlocks. While this isn't overly problematic it

>> violates the semantics of TLB flush from the point of view of source

>> vCPU.

>>

>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>> the flushes which ensures all flushes are completed by the time the

>> vCPU next schedules its own work. As the TLB instructions are modelled

>> as CP writes the TB ends at this point meaning cpu->exit_request will

>> be checked before the next instruction is executed.

>>

>> Deferring the work until the architectural sync point is a possible

>> future optimisation.

>>

>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>> ---

>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>

>

> Hello,

>

> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

> single-threaded TCG is affected. Git bisection lead to this patch, any

> ideas?


It shouldn't cause a problem but can you obtain a backtrace of the
system when hung?

>

> Example:

>

> qemu-system-arm -M vexpress-a9 -smp cpus=2 -accel accel=tcg,thread=single

> -kernel arch/arm/boot/zImage -dtb arch/arm/boot/dts/vexpress-v2p-ca9.dtb -serial

> stdio -net nic,model=lan9118 -net user -d in_asm,out_asm -D /tmp/qemulog

>

> Last TB from the log:

> ----------------

> IN:

> 0xc011a450:  ee080f73      mcr	15, 0, r0, cr8, cr3, {3}

>

> OUT: [size=68]

> 0x7f32d8b93f80:  mov    -0x18(%r14),%ebp

> 0x7f32d8b93f84:  test   %ebp,%ebp

> 0x7f32d8b93f86:  jne    0x7f32d8b93fb8

> 0x7f32d8b93f8c:  mov    %r14,%rdi

> 0x7f32d8b93f8f:  mov    $0x5620f2aea5d0,%rsi

> 0x7f32d8b93f99:  mov    (%r14),%edx

> 0x7f32d8b93f9c:  mov    $0x5620f18107ca,%r10

> 0x7f32d8b93fa6:  callq  *%r10

> 0x7f32d8b93fa9:  movl   $0xc011a454,0x3c(%r14)

> 0x7f32d8b93fb1:  xor    %eax,%eax

> 0x7f32d8b93fb3:  jmpq   0x7f32d7a4e016

> 0x7f32d8b93fb8:  lea    -0x14aa07c(%rip),%rax        # 0x7f32d76e9f43

> 0x7f32d8b93fbf:  jmpq   0x7f32d7a4e016

>

>

>> diff --git a/target/arm/helper.c b/target/arm/helper.c

>> index b41d0494d1..bcedb4a808 100644

>> --- a/target/arm/helper.c

>> +++ b/target/arm/helper.c

>> @@ -536,41 +536,33 @@ static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                               uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush(other_cs);

>> -    }

>> +    tlb_flush_all_cpus_synced(cs);

>>  }

>>

>>  static void tlbiasid_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                               uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush(other_cs);

>> -    }

>> +    tlb_flush_all_cpus_synced(cs);

>>  }

>>

>>  static void tlbimva_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                               uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);

>> -    }

>> +    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);

>>  }

>>

>>  static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                               uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);

>> -    }

>> +    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);

>>  }

>>

>>  static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -587,14 +579,12 @@ static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbiall_nsnh_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                    uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_by_mmuidx(other_cs,

>> -                            (1 << ARMMMUIdx_S12NSE1) |

>> -                            (1 << ARMMMUIdx_S12NSE0) |

>> -                            (1 << ARMMMUIdx_S2NS));

>> -    }

>> +    tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                        (1 << ARMMMUIdx_S12NSE1) |

>> +                                        (1 << ARMMMUIdx_S12NSE0) |

>> +                                        (1 << ARMMMUIdx_S2NS));

>>  }

>>

>>  static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -621,7 +611,7 @@ static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                 uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      uint64_t pageaddr;

>>

>>      if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {

>> @@ -630,9 +620,8 @@ static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>

>>      pageaddr = sextract64(value << 12, 0, 40);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));

>> -    }

>> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                             (1 << ARMMMUIdx_S2NS));

>>  }

>>

>>  static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -646,11 +635,9 @@ static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                   uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));

>> -    }

>> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));

>>  }

>>

>>  static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -665,12 +652,11 @@ static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                   uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));

>> -    }

>> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                             (1 << ARMMMUIdx_S1E2));

>>  }

>>

>>  static const ARMCPRegInfo cp_reginfo[] = {

>> @@ -2904,8 +2890,7 @@ static CPAccessResult aa64_cacheop_access(CPUARMState *env,

>>  static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                      uint64_t value)

>>  {

>> -    ARMCPU *cpu = arm_env_get_cpu(env);

>> -    CPUState *cs = CPU(cpu);

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>>      if (arm_is_secure_below_el3(env)) {

>>          tlb_flush_by_mmuidx(cs,

>> @@ -2921,19 +2906,17 @@ static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                        uint64_t value)

>>  {

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      bool sec = arm_is_secure_below_el3(env);

>> -    CPUState *other_cs;

>>

>> -    CPU_FOREACH(other_cs) {

>> -        if (sec) {

>> -            tlb_flush_by_mmuidx(other_cs,

>> -                                (1 << ARMMMUIdx_S1SE1) |

>> -                                (1 << ARMMMUIdx_S1SE0));

>> -        } else {

>> -            tlb_flush_by_mmuidx(other_cs,

>> -                                (1 << ARMMMUIdx_S12NSE1) |

>> -                                (1 << ARMMMUIdx_S12NSE0));

>> -        }

>> +    if (sec) {

>> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                            (1 << ARMMMUIdx_S1SE1) |

>> +                                            (1 << ARMMMUIdx_S1SE0));

>> +    } else {

>> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                            (1 << ARMMMUIdx_S12NSE1) |

>> +                                            (1 << ARMMMUIdx_S12NSE0));

>>      }

>>  }

>>

>> @@ -2990,46 +2973,40 @@ static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>       * stage 2 translations, whereas most other scopes only invalidate

>>       * stage 1 translations.

>>       */

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      bool sec = arm_is_secure_below_el3(env);

>>      bool has_el2 = arm_feature(env, ARM_FEATURE_EL2);

>> -    CPUState *other_cs;

>> -

>> -    CPU_FOREACH(other_cs) {

>> -        if (sec) {

>> -            tlb_flush_by_mmuidx(other_cs,

>> -                                (1 << ARMMMUIdx_S1SE1) |

>> -                                (1 << ARMMMUIdx_S1SE0));

>> -        } else if (has_el2) {

>> -            tlb_flush_by_mmuidx(other_cs,

>> -                                (1 << ARMMMUIdx_S12NSE1) |

>> -                                (1 << ARMMMUIdx_S12NSE0) |

>> -                                (1 << ARMMMUIdx_S2NS));

>> -        } else {

>> -            tlb_flush_by_mmuidx(other_cs,

>> -                                (1 << ARMMMUIdx_S12NSE1) |

>> -                                (1 << ARMMMUIdx_S12NSE0));

>> -        }

>> +

>> +    if (sec) {

>> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                            (1 << ARMMMUIdx_S1SE1) |

>> +                                            (1 << ARMMMUIdx_S1SE0));

>> +    } else if (has_el2) {

>> +        tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                            (1 << ARMMMUIdx_S12NSE1) |

>> +                                            (1 << ARMMMUIdx_S12NSE0) |

>> +                                            (1 << ARMMMUIdx_S2NS));

>> +    } else {

>> +          tlb_flush_by_mmuidx_all_cpus_synced(cs,

>> +                                              (1 << ARMMMUIdx_S12NSE1) |

>> +                                              (1 << ARMMMUIdx_S12NSE0));

>>      }

>>  }

>>

>>  static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                      uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));

>> -    }

>> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));

>>  }

>>

>>  static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                      uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E3));

>> -    }

>> +    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E3));

>>  }

>>

>>  static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -3086,43 +3063,40 @@ static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                     uint64_t value)

>>  {

>> +    ARMCPU *cpu = arm_env_get_cpu(env);

>> +    CPUState *cs = CPU(cpu);

>>      bool sec = arm_is_secure_below_el3(env);

>> -    CPUState *other_cs;

>>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        if (sec) {

>> -            tlb_flush_page_by_mmuidx(other_cs, pageaddr,

>> -                                     (1 << ARMMMUIdx_S1SE1) |

>> -                                     (1 << ARMMMUIdx_S1SE0));

>> -        } else {

>> -            tlb_flush_page_by_mmuidx(other_cs, pageaddr,

>> -                                     (1 << ARMMMUIdx_S12NSE1) |

>> -                                     (1 << ARMMMUIdx_S12NSE0));

>> -        }

>> +    if (sec) {

>> +        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                                 (1 << ARMMMUIdx_S1SE1) |

>> +                                                 (1 << ARMMMUIdx_S1SE0));

>> +    } else {

>> +        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                                 (1 << ARMMMUIdx_S12NSE1) |

>> +                                                 (1 << ARMMMUIdx_S12NSE0));

>>      }

>>  }

>>

>>  static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                     uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));

>> -    }

>> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                             (1 << ARMMMUIdx_S1E2));

>>  }

>>

>>  static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                     uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      uint64_t pageaddr = sextract64(value << 12, 0, 56);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E3));

>> -    }

>> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                             (1 << ARMMMUIdx_S1E3));

>>  }

>>

>>  static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>> @@ -3150,7 +3124,7 @@ static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>  static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>                                        uint64_t value)

>>  {

>> -    CPUState *other_cs;

>> +    CPUState *cs = ENV_GET_CPU(env);

>>      uint64_t pageaddr;

>>

>>      if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {

>> @@ -3159,9 +3133,8 @@ static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,

>>

>>      pageaddr = sextract64(value << 12, 0, 48);

>>

>> -    CPU_FOREACH(other_cs) {

>> -        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));

>> -    }

>> +    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,

>> +                                             (1 << ARMMMUIdx_S2NS));

>>  }

>>

>>  static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri,

>>



--
Alex Bennée
Dmitry Osipenko Sept. 17, 2017, 1:46 p.m. UTC | #3
On 17.09.2017 16:22, Alex Bennée wrote:
> 

> Dmitry Osipenko <digetx@gmail.com> writes:

> 

>> On 24.02.2017 14:21, Alex Bennée wrote:

>>> Previously flushes on other vCPUs would only get serviced when they

>>> exited their TranslationBlocks. While this isn't overly problematic it

>>> violates the semantics of TLB flush from the point of view of source

>>> vCPU.

>>>

>>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>>> the flushes which ensures all flushes are completed by the time the

>>> vCPU next schedules its own work. As the TLB instructions are modelled

>>> as CP writes the TB ends at this point meaning cpu->exit_request will

>>> be checked before the next instruction is executed.

>>>

>>> Deferring the work until the architectural sync point is a possible

>>> future optimisation.

>>>

>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>>> ---

>>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>>

>>

>> Hello,

>>

>> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

>> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

>> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

>> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

>> single-threaded TCG is affected. Git bisection lead to this patch, any

>> ideas?

> 

> It shouldn't cause a problem but can you obtain a backtrace of the

> system when hung?

> 


Actually, it looks like TCG enters infinite loop. Do you mean backtrace of QEMU
by 'backtrace of the system'? If so, here it is:

Thread 4 (Thread 0x7ffa37f10700 (LWP 20716)):

#0  0x00007ffa601888bd in poll () at ../sysdeps/unix/syscall-template.S:84

#1  0x00007ffa5e3aa561 in poll (__timeout=-1, __nfds=2, __fds=0x7ffa30006dc0) at
/usr/include/bits/poll2.h:46
#2  poll_func (ufds=0x7ffa30006dc0, nfds=2, timeout=-1, userdata=0x557bd603eae0)
at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:69
#3  0x00007ffa5e39bbb1 in pa_mainloop_poll (m=m@entry=0x557bd60401f0) at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:844
#4  0x00007ffa5e39c24e in pa_mainloop_iterate (m=0x557bd60401f0,
block=<optimized out>, retval=0x0) at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:926
#5  0x00007ffa5e39c300 in pa_mainloop_run (m=0x557bd60401f0,
retval=retval@entry=0x0) at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:944

#6  0x00007ffa5e3aa4a9 in thread (userdata=0x557bd60400f0) at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:100

#7  0x00007ffa599eea38 in internal_thread_func (userdata=0x557bd603e090) at
/var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulsecore/thread-posix.c:81

#8  0x00007ffa60453657 in start_thread (arg=0x7ffa37f10700) at
pthread_create.c:456

#9  0x00007ffa60193c5f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97





Thread 3 (Thread 0x7ffa4adff700 (LWP 20715)):


#0  0x00007ffa53e51caf in code_gen_buffer ()


#1  0x0000557bd2fa7f17 in cpu_tb_exec (cpu=0x557bd56160a0, itb=0x7ffa53e51b80
<code_gen_buffer+15481686>) at /home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:166

#2  0x0000557bd2fa8e0f in cpu_loop_exec_tb (cpu=0x557bd56160a0,
tb=0x7ffa53e51b80 <code_gen_buffer+15481686>, last_tb=0x7ffa4adfea68,
tb_exit=0x7ffa4adfea64) at /home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:613
#3  0x0000557bd2fa90ff in cpu_exec (cpu=0x557bd56160a0) at
/home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:711

#4  0x0000557bd2f6dcba in tcg_cpu_exec (cpu=0x557bd56160a0) at
/home/dima/vl/qemu-tests/cpus.c:1270

#5  0x0000557bd2f6dee1 in qemu_tcg_rr_cpu_thread_fn (arg=0x557bd5598e20) at
/home/dima/vl/qemu-tests/cpus.c:1365

#6  0x00007ffa60453657 in start_thread (arg=0x7ffa4adff700) at
pthread_create.c:456

#7  0x00007ffa60193c5f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97







Thread 2 (Thread 0x7ffa561bf700 (LWP 20714)):



#0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38



#1  0x0000557bd34e1eaa in qemu_futex_wait (f=0x557bd4031798
<rcu_call_ready_event>, val=4294967295) at
/home/dima/vl/qemu-tests/include/qemu/futex.h:26


#2  0x0000557bd34e2071 in qemu_event_wait (ev=0x557bd4031798
<rcu_call_ready_event>) at util/qemu-thread-posix.c:442


#3  0x0000557bd34f9b1f in call_rcu_thread (opaque=0x0) at util/rcu.c:249



#4  0x00007ffa60453657 in start_thread (arg=0x7ffa561bf700) at
pthread_create.c:456


#5  0x00007ffa60193c5f in clone () at
../sysdeps/unix/sysv/linux/x86_64/clone.S:97







Thread 1 (Thread 0x7ffa67502600 (LWP 20713)):



#0  0x00007ffa601889ab in __GI_ppoll (fds=0x557bd5bbf160, nfds=11,
timeout=<optimized out>, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:39


#1  0x0000557bd34dc460 in qemu_poll_ns (fds=0x557bd5bbf160, nfds=11,
timeout=29841115) at util/qemu-timer.c:334


#2  0x0000557bd34dd488 in os_host_main_loop_wait (timeout=29841115) at
util/main-loop.c:255


#3  0x0000557bd34dd557 in main_loop_wait (nonblocking=0) at util/main-loop.c:515



#4  0x0000557bd3120f0e in main_loop () at vl.c:1999



#5  0x0000557bd3128d4a in main (argc=17, argv=0x7ffe7de2a248,
envp=0x7ffe7de2a2d8) at vl.c:4877

>>

>> Example:

>>

>> qemu-system-arm -M vexpress-a9 -smp cpus=2 -accel accel=tcg,thread=single

>> -kernel arch/arm/boot/zImage -dtb arch/arm/boot/dts/vexpress-v2p-ca9.dtb -serial

>> stdio -net nic,model=lan9118 -net user -d in_asm,out_asm -D /tmp/qemulog

>>

>> Last TB from the log:

>> ----------------

>> IN:

>> 0xc011a450:  ee080f73      mcr	15, 0, r0, cr8, cr3, {3}

>>

>> OUT: [size=68]

>> 0x7f32d8b93f80:  mov    -0x18(%r14),%ebp

>> 0x7f32d8b93f84:  test   %ebp,%ebp

>> 0x7f32d8b93f86:  jne    0x7f32d8b93fb8

>> 0x7f32d8b93f8c:  mov    %r14,%rdi

>> 0x7f32d8b93f8f:  mov    $0x5620f2aea5d0,%rsi

>> 0x7f32d8b93f99:  mov    (%r14),%edx

>> 0x7f32d8b93f9c:  mov    $0x5620f18107ca,%r10

>> 0x7f32d8b93fa6:  callq  *%r10

>> 0x7f32d8b93fa9:  movl   $0xc011a454,0x3c(%r14)

>> 0x7f32d8b93fb1:  xor    %eax,%eax

>> 0x7f32d8b93fb3:  jmpq   0x7f32d7a4e016

>> 0x7f32d8b93fb8:  lea    -0x14aa07c(%rip),%rax        # 0x7f32d76e9f43

>> 0x7f32d8b93fbf:  jmpq   0x7f32d7a4e016

-- 
Dmitry
Alex Bennée Sept. 18, 2017, 10:10 a.m. UTC | #4
Dmitry Osipenko <digetx@gmail.com> writes:

> On 17.09.2017 16:22, Alex Bennée wrote:

>>

>> Dmitry Osipenko <digetx@gmail.com> writes:

>>

>>> On 24.02.2017 14:21, Alex Bennée wrote:

>>>> Previously flushes on other vCPUs would only get serviced when they

>>>> exited their TranslationBlocks. While this isn't overly problematic it

>>>> violates the semantics of TLB flush from the point of view of source

>>>> vCPU.

>>>>

>>>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>>>> the flushes which ensures all flushes are completed by the time the

>>>> vCPU next schedules its own work. As the TLB instructions are modelled

>>>> as CP writes the TB ends at this point meaning cpu->exit_request will

>>>> be checked before the next instruction is executed.

>>>>

>>>> Deferring the work until the architectural sync point is a possible

>>>> future optimisation.

>>>>

>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>>>> ---

>>>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>>>

>>>

>>> Hello,

>>>

>>> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

>>> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

>>> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

>>> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

>>> single-threaded TCG is affected. Git bisection lead to this patch, any

>>> ideas?

>>

>> It shouldn't cause a problem but can you obtain a backtrace of the

>> system when hung?

>>

>

> Actually, it looks like TCG enters infinite loop. Do you mean backtrace of QEMU

> by 'backtrace of the system'? If so, here it is:

>

> Thread 4 (Thread 0x7ffa37f10700 (LWP 20716)):

>

> #0  0x00007ffa601888bd in poll () at ../sysdeps/unix/syscall-template.S:84

>

> #1  0x00007ffa5e3aa561 in poll (__timeout=-1, __nfds=2, __fds=0x7ffa30006dc0) at

> /usr/include/bits/poll2.h:46

> #2  poll_func (ufds=0x7ffa30006dc0, nfds=2, timeout=-1, userdata=0x557bd603eae0)

> at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:69

> #3  0x00007ffa5e39bbb1 in pa_mainloop_poll (m=m@entry=0x557bd60401f0) at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:844

> #4  0x00007ffa5e39c24e in pa_mainloop_iterate (m=0x557bd60401f0,

> block=<optimized out>, retval=0x0) at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:926

> #5  0x00007ffa5e39c300 in pa_mainloop_run (m=0x557bd60401f0,

> retval=retval@entry=0x0) at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:944

>

> #6  0x00007ffa5e3aa4a9 in thread (userdata=0x557bd60400f0) at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:100

>

> #7  0x00007ffa599eea38 in internal_thread_func (userdata=0x557bd603e090) at

> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulsecore/thread-posix.c:81

>

> #8  0x00007ffa60453657 in start_thread (arg=0x7ffa37f10700) at

> pthread_create.c:456

>

> #9  0x00007ffa60193c5f in clone () at

> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>

>

>

>

>

> Thread 3 (Thread 0x7ffa4adff700 (LWP 20715)):

>

>

> #0  0x00007ffa53e51caf in code_gen_buffer ()

>


Well it's not locked up in servicing any flush tasks as it's executing
code. Maybe the guest code is spinning on something?

In the monitor:

  info registers

Will show you where things are, see if the ip is moving each time. Also
you can do a disassemble dump from there to see what code it is stuck
on.

>

> #1  0x0000557bd2fa7f17 in cpu_tb_exec (cpu=0x557bd56160a0, itb=0x7ffa53e51b80

> <code_gen_buffer+15481686>) at /home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:166

>

> #2  0x0000557bd2fa8e0f in cpu_loop_exec_tb (cpu=0x557bd56160a0,

> tb=0x7ffa53e51b80 <code_gen_buffer+15481686>, last_tb=0x7ffa4adfea68,

> tb_exit=0x7ffa4adfea64) at /home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:613

> #3  0x0000557bd2fa90ff in cpu_exec (cpu=0x557bd56160a0) at

> /home/dima/vl/qemu-tests/accel/tcg/cpu-exec.c:711

>

> #4  0x0000557bd2f6dcba in tcg_cpu_exec (cpu=0x557bd56160a0) at

> /home/dima/vl/qemu-tests/cpus.c:1270

>

> #5  0x0000557bd2f6dee1 in qemu_tcg_rr_cpu_thread_fn (arg=0x557bd5598e20) at

> /home/dima/vl/qemu-tests/cpus.c:1365

>

> #6  0x00007ffa60453657 in start_thread (arg=0x7ffa4adff700) at

> pthread_create.c:456

>

> #7  0x00007ffa60193c5f in clone () at

> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>

>

>

>

>

>

>

> Thread 2 (Thread 0x7ffa561bf700 (LWP 20714)):

>

>

>

> #0  syscall () at ../sysdeps/unix/sysv/linux/x86_64/syscall.S:38

>

>

>

> #1  0x0000557bd34e1eaa in qemu_futex_wait (f=0x557bd4031798

> <rcu_call_ready_event>, val=4294967295) at

> /home/dima/vl/qemu-tests/include/qemu/futex.h:26

>

>

> #2  0x0000557bd34e2071 in qemu_event_wait (ev=0x557bd4031798

> <rcu_call_ready_event>) at util/qemu-thread-posix.c:442

>

>

> #3  0x0000557bd34f9b1f in call_rcu_thread (opaque=0x0) at util/rcu.c:249

>

>

>

> #4  0x00007ffa60453657 in start_thread (arg=0x7ffa561bf700) at

> pthread_create.c:456

>

>

> #5  0x00007ffa60193c5f in clone () at

> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>

>

>

>

>

>

>

> Thread 1 (Thread 0x7ffa67502600 (LWP 20713)):

>

>

>

> #0  0x00007ffa601889ab in __GI_ppoll (fds=0x557bd5bbf160, nfds=11,

> timeout=<optimized out>, sigmask=0x0) at ../sysdeps/unix/sysv/linux/ppoll.c:39

>

>

> #1  0x0000557bd34dc460 in qemu_poll_ns (fds=0x557bd5bbf160, nfds=11,

> timeout=29841115) at util/qemu-timer.c:334

>

>

> #2  0x0000557bd34dd488 in os_host_main_loop_wait (timeout=29841115) at

> util/main-loop.c:255

>

>

> #3  0x0000557bd34dd557 in main_loop_wait (nonblocking=0) at util/main-loop.c:515

>

>

>

> #4  0x0000557bd3120f0e in main_loop () at vl.c:1999

>

>

>

> #5  0x0000557bd3128d4a in main (argc=17, argv=0x7ffe7de2a248,

> envp=0x7ffe7de2a2d8) at vl.c:4877

>

>>>

>>> Example:

>>>

>>> qemu-system-arm -M vexpress-a9 -smp cpus=2 -accel accel=tcg,thread=single

>>> -kernel arch/arm/boot/zImage -dtb arch/arm/boot/dts/vexpress-v2p-ca9.dtb -serial

>>> stdio -net nic,model=lan9118 -net user -d in_asm,out_asm -D /tmp/qemulog

>>>

>>> Last TB from the log:

>>> ----------------

>>> IN:

>>> 0xc011a450:  ee080f73      mcr	15, 0, r0, cr8, cr3, {3}

>>>

>>> OUT: [size=68]

>>> 0x7f32d8b93f80:  mov    -0x18(%r14),%ebp

>>> 0x7f32d8b93f84:  test   %ebp,%ebp

>>> 0x7f32d8b93f86:  jne    0x7f32d8b93fb8

>>> 0x7f32d8b93f8c:  mov    %r14,%rdi

>>> 0x7f32d8b93f8f:  mov    $0x5620f2aea5d0,%rsi

>>> 0x7f32d8b93f99:  mov    (%r14),%edx

>>> 0x7f32d8b93f9c:  mov    $0x5620f18107ca,%r10

>>> 0x7f32d8b93fa6:  callq  *%r10

>>> 0x7f32d8b93fa9:  movl   $0xc011a454,0x3c(%r14)

>>> 0x7f32d8b93fb1:  xor    %eax,%eax

>>> 0x7f32d8b93fb3:  jmpq   0x7f32d7a4e016

>>> 0x7f32d8b93fb8:  lea    -0x14aa07c(%rip),%rax        # 0x7f32d76e9f43

>>> 0x7f32d8b93fbf:  jmpq   0x7f32d7a4e016



--
Alex Bennée
Dmitry Osipenko Sept. 18, 2017, 12:23 p.m. UTC | #5
On 18.09.2017 13:10, Alex Bennée wrote:
> 

> Dmitry Osipenko <digetx@gmail.com> writes:

> 

>> On 17.09.2017 16:22, Alex Bennée wrote:

>>>

>>> Dmitry Osipenko <digetx@gmail.com> writes:

>>>

>>>> On 24.02.2017 14:21, Alex Bennée wrote:

>>>>> Previously flushes on other vCPUs would only get serviced when they

>>>>> exited their TranslationBlocks. While this isn't overly problematic it

>>>>> violates the semantics of TLB flush from the point of view of source

>>>>> vCPU.

>>>>>

>>>>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>>>>> the flushes which ensures all flushes are completed by the time the

>>>>> vCPU next schedules its own work. As the TLB instructions are modelled

>>>>> as CP writes the TB ends at this point meaning cpu->exit_request will

>>>>> be checked before the next instruction is executed.

>>>>>

>>>>> Deferring the work until the architectural sync point is a possible

>>>>> future optimisation.

>>>>>

>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>>>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>>>>> ---

>>>>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>>>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>>>>

>>>>

>>>> Hello,

>>>>

>>>> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

>>>> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

>>>> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

>>>> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

>>>> single-threaded TCG is affected. Git bisection lead to this patch, any

>>>> ideas?

>>>

>>> It shouldn't cause a problem but can you obtain a backtrace of the

>>> system when hung?

>>>

>>

>> Actually, it looks like TCG enters infinite loop. Do you mean backtrace of QEMU

>> by 'backtrace of the system'? If so, here it is:

>>

>> Thread 4 (Thread 0x7ffa37f10700 (LWP 20716)):

>>

>> #0  0x00007ffa601888bd in poll () at ../sysdeps/unix/syscall-template.S:84

>>

>> #1  0x00007ffa5e3aa561 in poll (__timeout=-1, __nfds=2, __fds=0x7ffa30006dc0) at

>> /usr/include/bits/poll2.h:46

>> #2  poll_func (ufds=0x7ffa30006dc0, nfds=2, timeout=-1, userdata=0x557bd603eae0)

>> at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:69

>> #3  0x00007ffa5e39bbb1 in pa_mainloop_poll (m=m@entry=0x557bd60401f0) at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:844

>> #4  0x00007ffa5e39c24e in pa_mainloop_iterate (m=0x557bd60401f0,

>> block=<optimized out>, retval=0x0) at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:926

>> #5  0x00007ffa5e39c300 in pa_mainloop_run (m=0x557bd60401f0,

>> retval=retval@entry=0x0) at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:944

>>

>> #6  0x00007ffa5e3aa4a9 in thread (userdata=0x557bd60400f0) at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:100

>>

>> #7  0x00007ffa599eea38 in internal_thread_func (userdata=0x557bd603e090) at

>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulsecore/thread-posix.c:81

>>

>> #8  0x00007ffa60453657 in start_thread (arg=0x7ffa37f10700) at

>> pthread_create.c:456

>>

>> #9  0x00007ffa60193c5f in clone () at

>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>>

>>

>>

>>

>>

>> Thread 3 (Thread 0x7ffa4adff700 (LWP 20715)):

>>

>>

>> #0  0x00007ffa53e51caf in code_gen_buffer ()

>>

> 

> Well it's not locked up in servicing any flush tasks as it's executing

> code. Maybe the guest code is spinning on something?

> 


Indeed, I should have used 'exec' instead of 'in_asm'.

> In the monitor:

> 

>   info registers

> 

> Will show you where things are, see if the ip is moving each time. Also

> you can do a disassemble dump from there to see what code it is stuck

> on.

> 


I've attached with GDB to QEMU to see where it got stuck. Turned out it is
caused by CONFIG_STRICT_KERNEL_RWX=y of the Linux kernel. Upon boot completion
kernel changes memory permissions and that changing is executed on a dedicated
CPU, while other CPUs are 'stopped' in a busy loop.

This patch just introduced a noticeable performance regression for a
single-threaded TCG, which is probably fine since MTTCG is the default now.
Thank you very much for the suggestions and all your work on MTTCG!

-- 
Dmitry
Alex Bennée Sept. 18, 2017, 2 p.m. UTC | #6
Dmitry Osipenko <digetx@gmail.com> writes:

> On 18.09.2017 13:10, Alex Bennée wrote:

>>

>> Dmitry Osipenko <digetx@gmail.com> writes:

>>

>>> On 17.09.2017 16:22, Alex Bennée wrote:

>>>>

>>>> Dmitry Osipenko <digetx@gmail.com> writes:

>>>>

>>>>> On 24.02.2017 14:21, Alex Bennée wrote:

>>>>>> Previously flushes on other vCPUs would only get serviced when they

>>>>>> exited their TranslationBlocks. While this isn't overly problematic it

>>>>>> violates the semantics of TLB flush from the point of view of source

>>>>>> vCPU.

>>>>>>

>>>>>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>>>>>> the flushes which ensures all flushes are completed by the time the

>>>>>> vCPU next schedules its own work. As the TLB instructions are modelled

>>>>>> as CP writes the TB ends at this point meaning cpu->exit_request will

>>>>>> be checked before the next instruction is executed.

>>>>>>

>>>>>> Deferring the work until the architectural sync point is a possible

>>>>>> future optimisation.

>>>>>>

>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>>>>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>>>>>> ---

>>>>>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>>>>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>>>>>

>>>>>

>>>>> Hello,

>>>>>

>>>>> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

>>>>> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

>>>>> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

>>>>> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

>>>>> single-threaded TCG is affected. Git bisection lead to this patch, any

>>>>> ideas?

>>>>

>>>> It shouldn't cause a problem but can you obtain a backtrace of the

>>>> system when hung?

>>>>

>>>

>>> Actually, it looks like TCG enters infinite loop. Do you mean backtrace of QEMU

>>> by 'backtrace of the system'? If so, here it is:

>>>

>>> Thread 4 (Thread 0x7ffa37f10700 (LWP 20716)):

>>>

>>> #0  0x00007ffa601888bd in poll () at ../sysdeps/unix/syscall-template.S:84

>>>

>>> #1  0x00007ffa5e3aa561 in poll (__timeout=-1, __nfds=2, __fds=0x7ffa30006dc0) at

>>> /usr/include/bits/poll2.h:46

>>> #2  poll_func (ufds=0x7ffa30006dc0, nfds=2, timeout=-1, userdata=0x557bd603eae0)

>>> at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:69

>>> #3  0x00007ffa5e39bbb1 in pa_mainloop_poll (m=m@entry=0x557bd60401f0) at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:844

>>> #4  0x00007ffa5e39c24e in pa_mainloop_iterate (m=0x557bd60401f0,

>>> block=<optimized out>, retval=0x0) at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:926

>>> #5  0x00007ffa5e39c300 in pa_mainloop_run (m=0x557bd60401f0,

>>> retval=retval@entry=0x0) at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:944

>>>

>>> #6  0x00007ffa5e3aa4a9 in thread (userdata=0x557bd60400f0) at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:100

>>>

>>> #7  0x00007ffa599eea38 in internal_thread_func (userdata=0x557bd603e090) at

>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulsecore/thread-posix.c:81

>>>

>>> #8  0x00007ffa60453657 in start_thread (arg=0x7ffa37f10700) at

>>> pthread_create.c:456

>>>

>>> #9  0x00007ffa60193c5f in clone () at

>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>>>

>>>

>>>

>>>

>>>

>>> Thread 3 (Thread 0x7ffa4adff700 (LWP 20715)):

>>>

>>>

>>> #0  0x00007ffa53e51caf in code_gen_buffer ()

>>>

>>

>> Well it's not locked up in servicing any flush tasks as it's executing

>> code. Maybe the guest code is spinning on something?

>>

>

> Indeed, I should have used 'exec' instead of 'in_asm'.

>

>> In the monitor:

>>

>>   info registers

>>

>> Will show you where things are, see if the ip is moving each time. Also

>> you can do a disassemble dump from there to see what code it is stuck

>> on.

>>

>

> I've attached with GDB to QEMU to see where it got stuck. Turned out it is

> caused by CONFIG_STRICT_KERNEL_RWX=y of the Linux kernel. Upon boot completion

> kernel changes memory permissions and that changing is executed on a dedicated

> CPU, while other CPUs are 'stopped' in a busy loop.

>

> This patch just introduced a noticeable performance regression for a

> single-threaded TCG, which is probably fine since MTTCG is the default now.

> Thank you very much for the suggestions and all your work on MTTCG!


Hmm well it would be nice to know the exact mechanism for that failure.
If we just end up with a very long list of tasks in
cpu->queued_work_first then I guess that explains it but it would be
nice to quantify the problem.

I had trouble seeing where this loop is in the kernel code, got a pointer?

--
Alex Bennée
Dmitry Osipenko Sept. 18, 2017, 3:32 p.m. UTC | #7
On 18.09.2017 17:00, Alex Bennée wrote:
> 

> Dmitry Osipenko <digetx@gmail.com> writes:

> 

>> On 18.09.2017 13:10, Alex Bennée wrote:

>>>

>>> Dmitry Osipenko <digetx@gmail.com> writes:

>>>

>>>> On 17.09.2017 16:22, Alex Bennée wrote:

>>>>>

>>>>> Dmitry Osipenko <digetx@gmail.com> writes:

>>>>>

>>>>>> On 24.02.2017 14:21, Alex Bennée wrote:

>>>>>>> Previously flushes on other vCPUs would only get serviced when they

>>>>>>> exited their TranslationBlocks. While this isn't overly problematic it

>>>>>>> violates the semantics of TLB flush from the point of view of source

>>>>>>> vCPU.

>>>>>>>

>>>>>>> To solve this we call the cputlb *_all_cpus_synced() functions to do

>>>>>>> the flushes which ensures all flushes are completed by the time the

>>>>>>> vCPU next schedules its own work. As the TLB instructions are modelled

>>>>>>> as CP writes the TB ends at this point meaning cpu->exit_request will

>>>>>>> be checked before the next instruction is executed.

>>>>>>>

>>>>>>> Deferring the work until the architectural sync point is a possible

>>>>>>> future optimisation.

>>>>>>>

>>>>>>> Signed-off-by: Alex Bennée <alex.bennee@linaro.org>

>>>>>>> Reviewed-by: Richard Henderson <rth@twiddle.net>

>>>>>>> Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

>>>>>>> ---

>>>>>>>  target/arm/helper.c | 165 ++++++++++++++++++++++------------------------------

>>>>>>>  1 file changed, 69 insertions(+), 96 deletions(-)

>>>>>>>

>>>>>>

>>>>>> Hello,

>>>>>>

>>>>>> I have an issue with Linux kernel stopping to boot on a SMP 32bit ARM (haven't

>>>>>> checked 64bit) in a single-threaded TCG mode. Kernel reaches point where it

>>>>>> should mount rootfs over NFS and vCPUs stop. This issue is reproducible with any

>>>>>> 32bit ARM machine type. Kernel boots fine with a MTTCG accel, only

>>>>>> single-threaded TCG is affected. Git bisection lead to this patch, any

>>>>>> ideas?

>>>>>

>>>>> It shouldn't cause a problem but can you obtain a backtrace of the

>>>>> system when hung?

>>>>>

>>>>

>>>> Actually, it looks like TCG enters infinite loop. Do you mean backtrace of QEMU

>>>> by 'backtrace of the system'? If so, here it is:

>>>>

>>>> Thread 4 (Thread 0x7ffa37f10700 (LWP 20716)):

>>>>

>>>> #0  0x00007ffa601888bd in poll () at ../sysdeps/unix/syscall-template.S:84

>>>>

>>>> #1  0x00007ffa5e3aa561 in poll (__timeout=-1, __nfds=2, __fds=0x7ffa30006dc0) at

>>>> /usr/include/bits/poll2.h:46

>>>> #2  poll_func (ufds=0x7ffa30006dc0, nfds=2, timeout=-1, userdata=0x557bd603eae0)

>>>> at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:69

>>>> #3  0x00007ffa5e39bbb1 in pa_mainloop_poll (m=m@entry=0x557bd60401f0) at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:844

>>>> #4  0x00007ffa5e39c24e in pa_mainloop_iterate (m=0x557bd60401f0,

>>>> block=<optimized out>, retval=0x0) at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:926

>>>> #5  0x00007ffa5e39c300 in pa_mainloop_run (m=0x557bd60401f0,

>>>> retval=retval@entry=0x0) at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/mainloop.c:944

>>>>

>>>> #6  0x00007ffa5e3aa4a9 in thread (userdata=0x557bd60400f0) at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulse/thread-mainloop.c:100

>>>>

>>>> #7  0x00007ffa599eea38 in internal_thread_func (userdata=0x557bd603e090) at

>>>> /var/tmp/portage/media-sound/pulseaudio-10.0/work/pulseaudio-10.0/src/pulsecore/thread-posix.c:81

>>>>

>>>> #8  0x00007ffa60453657 in start_thread (arg=0x7ffa37f10700) at

>>>> pthread_create.c:456

>>>>

>>>> #9  0x00007ffa60193c5f in clone () at

>>>> ../sysdeps/unix/sysv/linux/x86_64/clone.S:97

>>>>

>>>>

>>>>

>>>>

>>>>

>>>> Thread 3 (Thread 0x7ffa4adff700 (LWP 20715)):

>>>>

>>>>

>>>> #0  0x00007ffa53e51caf in code_gen_buffer ()

>>>>

>>>

>>> Well it's not locked up in servicing any flush tasks as it's executing

>>> code. Maybe the guest code is spinning on something?

>>>

>>

>> Indeed, I should have used 'exec' instead of 'in_asm'.

>>

>>> In the monitor:

>>>

>>>   info registers

>>>

>>> Will show you where things are, see if the ip is moving each time. Also

>>> you can do a disassemble dump from there to see what code it is stuck

>>> on.

>>>

>>

>> I've attached with GDB to QEMU to see where it got stuck. Turned out it is

>> caused by CONFIG_STRICT_KERNEL_RWX=y of the Linux kernel. Upon boot completion

>> kernel changes memory permissions and that changing is executed on a dedicated

>> CPU, while other CPUs are 'stopped' in a busy loop.

>>

>> This patch just introduced a noticeable performance regression for a

>> single-threaded TCG, which is probably fine since MTTCG is the default now.

>> Thank you very much for the suggestions and all your work on MTTCG!

> 

> Hmm well it would be nice to know the exact mechanism for that failure.

> If we just end up with a very long list of tasks in

> cpu->queued_work_first then I guess that explains it but it would be

> nice to quantify the problem.

> 

> I had trouble seeing where this loop is in the kernel code, got a pointer?

> 

The memory permissions changing starts here:

http://elixir.free-electrons.com/linux/v4.14-rc1/source/arch/arm/mm/init.c#L739

The busy loop is here:

http://elixir.free-electrons.com/linux/v4.14-rc1/source/kernel/stop_machine.c#L195

Interestingly, I tried to attach to a 'hanged' QEMU another time and got into
other code. That code has the same pattern, one CPU flushes cache a lot in
shmem_rename2()->need_update()->memchr_inv() and the other is executing something.

So seems busy loop isn't the problem, it's just the TLB flushing is very-very
expensive in TCG. On the other hand I don't see such a problem with MTTCG, so
not sure what's going on with a single-threaded TCG.

-- 
Dmitry
diff mbox series

Patch

diff --git a/target/arm/helper.c b/target/arm/helper.c
index b41d0494d1..bcedb4a808 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -536,41 +536,33 @@  static void tlbimvaa_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbiall_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                              uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush(other_cs);
-    }
+    tlb_flush_all_cpus_synced(cs);
 }
 
 static void tlbiasid_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                              uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush(other_cs);
-    }
+    tlb_flush_all_cpus_synced(cs);
 }
 
 static void tlbimva_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                              uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);
-    }
+    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);
 }
 
 static void tlbimvaa_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                              uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page(other_cs, value & TARGET_PAGE_MASK);
-    }
+    tlb_flush_page_all_cpus_synced(cs, value & TARGET_PAGE_MASK);
 }
 
 static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -587,14 +579,12 @@  static void tlbiall_nsnh_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbiall_nsnh_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                   uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs,
-                            (1 << ARMMMUIdx_S12NSE1) |
-                            (1 << ARMMMUIdx_S12NSE0) |
-                            (1 << ARMMMUIdx_S2NS));
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                        (1 << ARMMMUIdx_S12NSE1) |
+                                        (1 << ARMMMUIdx_S12NSE0) |
+                                        (1 << ARMMMUIdx_S2NS));
 }
 
 static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -621,7 +611,7 @@  static void tlbiipas2_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
     uint64_t pageaddr;
 
     if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
@@ -630,9 +620,8 @@  static void tlbiipas2_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
 
     pageaddr = sextract64(value << 12, 0, 40);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S2NS));
 }
 
 static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -646,11 +635,9 @@  static void tlbiall_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbiall_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                  uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));
 }
 
 static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -665,12 +652,11 @@  static void tlbimva_hyp_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbimva_hyp_is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                  uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
     uint64_t pageaddr = value & ~MAKE_64BIT_MASK(0, 12);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E2));
 }
 
 static const ARMCPRegInfo cp_reginfo[] = {
@@ -2904,8 +2890,7 @@  static CPAccessResult aa64_cacheop_access(CPUARMState *env,
 static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                     uint64_t value)
 {
-    ARMCPU *cpu = arm_env_get_cpu(env);
-    CPUState *cs = CPU(cpu);
+    CPUState *cs = ENV_GET_CPU(env);
 
     if (arm_is_secure_below_el3(env)) {
         tlb_flush_by_mmuidx(cs,
@@ -2921,19 +2906,17 @@  static void tlbi_aa64_vmalle1_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbi_aa64_vmalle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                       uint64_t value)
 {
+    CPUState *cs = ENV_GET_CPU(env);
     bool sec = arm_is_secure_below_el3(env);
-    CPUState *other_cs;
 
-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_by_mmuidx(other_cs,
-                                (1 << ARMMMUIdx_S1SE1) |
-                                (1 << ARMMMUIdx_S1SE0));
-        } else {
-            tlb_flush_by_mmuidx(other_cs,
-                                (1 << ARMMMUIdx_S12NSE1) |
-                                (1 << ARMMMUIdx_S12NSE0));
-        }
+    if (sec) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S1SE1) |
+                                            (1 << ARMMMUIdx_S1SE0));
+    } else {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S12NSE1) |
+                                            (1 << ARMMMUIdx_S12NSE0));
     }
 }
 
@@ -2990,46 +2973,40 @@  static void tlbi_aa64_alle1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
      * stage 2 translations, whereas most other scopes only invalidate
      * stage 1 translations.
      */
+    CPUState *cs = ENV_GET_CPU(env);
     bool sec = arm_is_secure_below_el3(env);
     bool has_el2 = arm_feature(env, ARM_FEATURE_EL2);
-    CPUState *other_cs;
-
-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_by_mmuidx(other_cs,
-                                (1 << ARMMMUIdx_S1SE1) |
-                                (1 << ARMMMUIdx_S1SE0));
-        } else if (has_el2) {
-            tlb_flush_by_mmuidx(other_cs,
-                                (1 << ARMMMUIdx_S12NSE1) |
-                                (1 << ARMMMUIdx_S12NSE0) |
-                                (1 << ARMMMUIdx_S2NS));
-        } else {
-            tlb_flush_by_mmuidx(other_cs,
-                                (1 << ARMMMUIdx_S12NSE1) |
-                                (1 << ARMMMUIdx_S12NSE0));
-        }
+
+    if (sec) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S1SE1) |
+                                            (1 << ARMMMUIdx_S1SE0));
+    } else if (has_el2) {
+        tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                            (1 << ARMMMUIdx_S12NSE1) |
+                                            (1 << ARMMMUIdx_S12NSE0) |
+                                            (1 << ARMMMUIdx_S2NS));
+    } else {
+          tlb_flush_by_mmuidx_all_cpus_synced(cs,
+                                              (1 << ARMMMUIdx_S12NSE1) |
+                                              (1 << ARMMMUIdx_S12NSE0));
     }
 }
 
 static void tlbi_aa64_alle2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                     uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E2));
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E2));
 }
 
 static void tlbi_aa64_alle3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                     uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_by_mmuidx(other_cs, (1 << ARMMMUIdx_S1E3));
-    }
+    tlb_flush_by_mmuidx_all_cpus_synced(cs, (1 << ARMMMUIdx_S1E3));
 }
 
 static void tlbi_aa64_vae1_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -3086,43 +3063,40 @@  static void tlbi_aa64_vae3_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbi_aa64_vae1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
+    ARMCPU *cpu = arm_env_get_cpu(env);
+    CPUState *cs = CPU(cpu);
     bool sec = arm_is_secure_below_el3(env);
-    CPUState *other_cs;
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
-    CPU_FOREACH(other_cs) {
-        if (sec) {
-            tlb_flush_page_by_mmuidx(other_cs, pageaddr,
-                                     (1 << ARMMMUIdx_S1SE1) |
-                                     (1 << ARMMMUIdx_S1SE0));
-        } else {
-            tlb_flush_page_by_mmuidx(other_cs, pageaddr,
-                                     (1 << ARMMMUIdx_S12NSE1) |
-                                     (1 << ARMMMUIdx_S12NSE0));
-        }
+    if (sec) {
+        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                                 (1 << ARMMMUIdx_S1SE1) |
+                                                 (1 << ARMMMUIdx_S1SE0));
+    } else {
+        tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                                 (1 << ARMMMUIdx_S12NSE1) |
+                                                 (1 << ARMMMUIdx_S12NSE0));
     }
 }
 
 static void tlbi_aa64_vae2is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E2));
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E2));
 }
 
 static void tlbi_aa64_vae3is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                    uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
     uint64_t pageaddr = sextract64(value << 12, 0, 56);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S1E3));
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S1E3));
 }
 
 static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,
@@ -3150,7 +3124,7 @@  static void tlbi_aa64_ipas2e1_write(CPUARMState *env, const ARMCPRegInfo *ri,
 static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
                                       uint64_t value)
 {
-    CPUState *other_cs;
+    CPUState *cs = ENV_GET_CPU(env);
     uint64_t pageaddr;
 
     if (!arm_feature(env, ARM_FEATURE_EL2) || !(env->cp15.scr_el3 & SCR_NS)) {
@@ -3159,9 +3133,8 @@  static void tlbi_aa64_ipas2e1is_write(CPUARMState *env, const ARMCPRegInfo *ri,
 
     pageaddr = sextract64(value << 12, 0, 48);
 
-    CPU_FOREACH(other_cs) {
-        tlb_flush_page_by_mmuidx(other_cs, pageaddr, (1 << ARMMMUIdx_S2NS));
-    }
+    tlb_flush_page_by_mmuidx_all_cpus_synced(cs, pageaddr,
+                                             (1 << ARMMMUIdx_S2NS));
 }
 
 static CPAccessResult aa64_zva_access(CPUARMState *env, const ARMCPRegInfo *ri,