diff mbox series

[4/4] target/arm: Rely on hflags correct in cpu_get_tb_cpu_state

Message ID 20190214040652.4811-5-richard.henderson@linaro.org
State New
Headers show
Series target/arm: Reduce overhead of cpu_get_tb_cpu_state | expand

Commit Message

Richard Henderson Feb. 14, 2019, 4:06 a.m. UTC
This is the payoff.

From perf record -g data of ubuntu 18 boot and shutdown:

BEFORE:

-   23.02%     2.82%  qemu-system-aar  [.] helper_lookup_tb_ptr
   - 20.22% helper_lookup_tb_ptr
      + 10.05% tb_htable_lookup
      - 9.13% cpu_get_tb_cpu_state
           3.20% aa64_va_parameters_both
           0.55% fp_exception_el

-   11.66%     4.74%  qemu-system-aar  [.] cpu_get_tb_cpu_state
   - 6.96% cpu_get_tb_cpu_state
        3.63% aa64_va_parameters_both
        0.60% fp_exception_el
        0.53% sve_exception_el

AFTER:

-   16.40%     3.40%  qemu-system-aar  [.] helper_lookup_tb_ptr
   - 13.03% helper_lookup_tb_ptr
      + 11.19% tb_htable_lookup
        0.55% cpu_get_tb_cpu_state

     0.98%     0.71%  qemu-system-aar  [.] cpu_get_tb_cpu_state

     0.87%     0.24%  qemu-system-aar  [.] rebuild_hflags_a64

Before, helper_lookup_tb_ptr is the second hottest function in the
application, consuming almost a quarter of the runtime.  Within the
entire execution, cpu_get_tb_cpu_state consumes about 12%.

After, helper_lookup_tb_ptr has dropped to the fourth hottest function,
with consumption dropping to a sixth of the runtime.  Within the
entire execution, cpu_get_tb_cpu_state has dropped below 1%, and the
supporting function to rebuild hflags also consumes about 1%.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/helper.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

-- 
2.17.1

Comments

Alex Bennée Feb. 19, 2019, 8:17 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> This is the payoff.


\o/

Running my own silly pigz benchmark:

Before:
  Time (mean ± σ):     66.035 s ±  1.425 s

After:
  Time (mean ± σ):     57.191 s ±  0.675 s

>

> From perf record -g data of ubuntu 18 boot and shutdown:

>

> BEFORE:

>

> -   23.02%     2.82%  qemu-system-aar  [.] helper_lookup_tb_ptr

>    - 20.22% helper_lookup_tb_ptr

>       + 10.05% tb_htable_lookup

>       - 9.13% cpu_get_tb_cpu_state

>            3.20% aa64_va_parameters_both

>            0.55% fp_exception_el

>

> -   11.66%     4.74%  qemu-system-aar  [.] cpu_get_tb_cpu_state

>    - 6.96% cpu_get_tb_cpu_state

>         3.63% aa64_va_parameters_both

>         0.60% fp_exception_el

>         0.53% sve_exception_el

>

> AFTER:

>

> -   16.40%     3.40%  qemu-system-aar  [.] helper_lookup_tb_ptr

>    - 13.03% helper_lookup_tb_ptr

>       + 11.19% tb_htable_lookup

>         0.55% cpu_get_tb_cpu_state

>

>      0.98%     0.71%  qemu-system-aar  [.] cpu_get_tb_cpu_state

>

>      0.87%     0.24%  qemu-system-aar  [.] rebuild_hflags_a64

>

> Before, helper_lookup_tb_ptr is the second hottest function in the

> application, consuming almost a quarter of the runtime.  Within the

> entire execution, cpu_get_tb_cpu_state consumes about 12%.

>

> After, helper_lookup_tb_ptr has dropped to the fourth hottest function,

> with consumption dropping to a sixth of the runtime.  Within the

> entire execution, cpu_get_tb_cpu_state has dropped below 1%, and the

> supporting function to rebuild hflags also consumes about 1%.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/helper.c | 9 ++-------

>  1 file changed, 2 insertions(+), 7 deletions(-)

>

> diff --git a/target/arm/helper.c b/target/arm/helper.c

> index 3c8724883d..1bdb87267e 100644

> --- a/target/arm/helper.c

> +++ b/target/arm/helper.c

> @@ -13894,21 +13894,16 @@ void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)

>  void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,

>                            target_ulong *cs_base, uint32_t *pflags)

>  {

> -    int current_el = arm_current_el(env);

> -    uint32_t flags;

> +    uint32_t flags = env->hflags;

>      uint32_t pstate_for_ss;

>

>      *cs_base = 0;

> -    if (is_a64(env)) {

> +    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {

>          *pc = env->pc;

> -        flags = rebuild_hflags_a64(env, current_el);

> -        assert(flags == env->hflags);


While debugging I came up with this monstrosity:

    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
#ifdef CONFIG_DEBUG_TCG
        static uint32_t tb_state = 0;
        uint32_t recalc_flags = rebuild_hflags_a64(env, arm_current_el(env));
        tb_state++;
        if (flags != recalc_flags) {
            fprintf(stderr, "%s: flags %#x, should be %#x (%#x/%d)\n", __func__,
                    flags, recalc_flags, flags ^ recalc_flags, tb_state);
            abort();
        }
#endif
        *pc = env->pc;
        flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
        pstate_for_ss = env->pstate;
    } else {

I suspect that, given cached flags are common across the translator code,
it might be worth coming up with a nicer macro to encapsulate the check
for DEBUG_TCG builds.

>          flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);

>          pstate_for_ss = env->pstate;

>      } else {

>          *pc = env->regs[15];

> -        flags = rebuild_hflags_a32(env, current_el);

> -        assert(flags == env->hflags);

>          flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);

>          flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);

>          pstate_for_ss = env->uncached_cpsr;


--
Alex Bennée
Richard Henderson Feb. 19, 2019, 10:40 p.m. UTC | #2
On 2/19/19 12:17 PM, Alex Bennée wrote:
> While debugging I came up with this monstrosity:

> 

>     if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {

> #ifdef CONFIG_DEBUG_TCG

>         static uint32_t tb_state = 0;

>         uint32_t recalc_flags = rebuild_hflags_a64(env, arm_current_el(env));

>         tb_state++;

>         if (flags != recalc_flags) {

>             fprintf(stderr, "%s: flags %#x, should be %#x (%#x/%d)\n", __func__,

>                     flags, recalc_flags, flags ^ recalc_flags, tb_state);

>             abort();

>         }

> #endif

>         *pc = env->pc;

>         flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);

>         pstate_for_ss = env->pstate;

>     } else {


I have now included:

+#ifdef CONFIG_DEBUG_TCG
+    {
+        int el = arm_current_el(env);
+        uint32_t check_flags;
+        if (is_a64(env)) {
+            check_flags = rebuild_hflags_a64(env, el);
+        } else {
+            check_flags = rebuild_hflags_a32(env, el);
+        }
+        g_assert_cmphex(flags, ==, check_flags);
+    }
+#endif


r~
diff mbox series

Patch

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 3c8724883d..1bdb87267e 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -13894,21 +13894,16 @@  void HELPER(rebuild_hflags_a64)(CPUARMState *env, uint32_t el)
 void cpu_get_tb_cpu_state(CPUARMState *env, target_ulong *pc,
                           target_ulong *cs_base, uint32_t *pflags)
 {
-    int current_el = arm_current_el(env);
-    uint32_t flags;
+    uint32_t flags = env->hflags;
     uint32_t pstate_for_ss;
 
     *cs_base = 0;
-    if (is_a64(env)) {
+    if (FIELD_EX32(flags, TBFLAG_ANY, AARCH64_STATE)) {
         *pc = env->pc;
-        flags = rebuild_hflags_a64(env, current_el);
-        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A64, BTYPE, env->btype);
         pstate_for_ss = env->pstate;
     } else {
         *pc = env->regs[15];
-        flags = rebuild_hflags_a32(env, current_el);
-        assert(flags == env->hflags);
         flags = FIELD_DP32(flags, TBFLAG_A32, THUMB, env->thumb);
         flags = FIELD_DP32(flags, TBFLAG_A32, CONDEXEC, env->condexec_bits);
         pstate_for_ss = env->uncached_cpsr;