diff mbox series

[for-7.2,14/21] accel/tcg: Hoist get_page_addr_code out of tb_lookup

Message ID 20220812180806.2128593-15-richard.henderson@linaro.org
State New
Headers show
Series accel/tcg: minimize tlb lookups during translate + user-only PROT_EXEC fixes | expand

Commit Message

Richard Henderson Aug. 12, 2022, 6:07 p.m. UTC
We will want to re-use the result of get_page_addr_code
beyond the scope of tb_lookup.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 accel/tcg/cpu-exec.c | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

Comments

Ilya Leoshkevich Aug. 16, 2022, 11:43 p.m. UTC | #1
On Fri, 2022-08-12 at 11:07 -0700, Richard Henderson wrote:
> We will want to re-use the result of get_page_addr_code
> beyond the scope of tb_lookup.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  accel/tcg/cpu-exec.c | 34 ++++++++++++++++++++++++----------
>  1 file changed, 24 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index a9b7053274..889355b341 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -209,13 +209,12 @@ static bool tb_lookup_cmp(const void *p, const
> void *d)
>  }
>  
>  /* Might cause an exception, so have a longjmp destination ready */
> -static TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
> -                                   target_ulong cs_base,
> +static TranslationBlock *tb_lookup(CPUState *cpu, tb_page_addr_t
> phys_pc,
> +                                   target_ulong pc, target_ulong
> cs_base,
>                                     uint32_t flags, uint32_t cflags)
>  {
>      CPUArchState *env = cpu->env_ptr;
>      TranslationBlock *tb;
> -    tb_page_addr_t phys_pc;
>      struct tb_desc desc;
>      uint32_t jmp_hash, tb_hash;
>  
> @@ -240,11 +239,8 @@ static TranslationBlock *tb_lookup(CPUState
> *cpu, target_ulong pc,
>      desc.cflags = cflags;
>      desc.trace_vcpu_dstate = *cpu->trace_dstate;
>      desc.pc = pc;
> -    phys_pc = get_page_addr_code(desc.env, pc);
> -    if (phys_pc == -1) {
> -        return NULL;
> -    }
>      desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
> +
>      tb_hash = tb_hash_func(phys_pc, pc, flags, cflags, *cpu-
> >trace_dstate);
>      tb = qht_lookup_custom(&tb_ctx.htable, &desc, tb_hash,
> tb_lookup_cmp);
>      if (tb == NULL) {
> @@ -371,6 +367,7 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState
> *env)
>      TranslationBlock *tb;
>      target_ulong cs_base, pc;
>      uint32_t flags, cflags;
> +    tb_page_addr_t phys_pc;
>  
>      cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>  
> @@ -379,7 +376,12 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState
> *env)
>          cpu_loop_exit(cpu);
>      }
>  
> -    tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> +    phys_pc = get_page_addr_code(env, pc);
> +    if (phys_pc == -1) {
> +        return tcg_code_gen_epilogue;
> +    }
> +
> +    tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
>      if (tb == NULL) {
>          return tcg_code_gen_epilogue;
>      }
> @@ -482,6 +484,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
>      TranslationBlock *tb;
>      target_ulong cs_base, pc;
>      uint32_t flags, cflags;
> +    tb_page_addr_t phys_pc;
>      int tb_exit;
>  
>      if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> @@ -504,7 +507,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>           * Any breakpoint for this insn will have been recognized
> earlier.
>           */
>  
> -        tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> +        phys_pc = get_page_addr_code(env, pc);
> +        if (phys_pc == -1) {
> +            tb = NULL;
> +        } else {
> +            tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
> cflags);
> +        }
>          if (tb == NULL) {
>              mmap_lock();
>              tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> @@ -949,6 +957,7 @@ int cpu_exec(CPUState *cpu)
>              TranslationBlock *tb;
>              target_ulong cs_base, pc;
>              uint32_t flags, cflags;
> +            tb_page_addr_t phys_pc;
>  
>              cpu_get_tb_cpu_state(cpu->env_ptr, &pc, &cs_base,
> &flags);
>  
> @@ -970,7 +979,12 @@ int cpu_exec(CPUState *cpu)
>                  break;
>              }
>  
> -            tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> +            phys_pc = get_page_addr_code(cpu->env_ptr, pc);
> +            if (phys_pc == -1) {
> +                tb = NULL;
> +            } else {
> +                tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
> cflags);
> +            }
>              if (tb == NULL) {
>                  mmap_lock();
>                  tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);

This patch did not make it into v2, but having get_page_addr_code()
before tb_lookup() in helper_lookup_tb_ptr() helped raise the exception
when trying to execute a no-longer-executable TB.

Was it dropped for performance reasons?
Richard Henderson Aug. 17, 2022, 1:42 a.m. UTC | #2
On 8/16/22 18:43, Ilya Leoshkevich wrote:
> On Fri, 2022-08-12 at 11:07 -0700, Richard Henderson wrote:
>> We will want to re-use the result of get_page_addr_code
>> beyond the scope of tb_lookup.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
>> ---
>>   accel/tcg/cpu-exec.c | 34 ++++++++++++++++++++++++----------
>>   1 file changed, 24 insertions(+), 10 deletions(-)
>>
>> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
>> index a9b7053274..889355b341 100644
>> --- a/accel/tcg/cpu-exec.c
>> +++ b/accel/tcg/cpu-exec.c
>> @@ -209,13 +209,12 @@ static bool tb_lookup_cmp(const void *p, const
>> void *d)
>>   }
>>   
>>   /* Might cause an exception, so have a longjmp destination ready */
>> -static TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
>> -                                   target_ulong cs_base,
>> +static TranslationBlock *tb_lookup(CPUState *cpu, tb_page_addr_t
>> phys_pc,
>> +                                   target_ulong pc, target_ulong
>> cs_base,
>>                                      uint32_t flags, uint32_t cflags)
>>   {
>>       CPUArchState *env = cpu->env_ptr;
>>       TranslationBlock *tb;
>> -    tb_page_addr_t phys_pc;
>>       struct tb_desc desc;
>>       uint32_t jmp_hash, tb_hash;
>>   
>> @@ -240,11 +239,8 @@ static TranslationBlock *tb_lookup(CPUState
>> *cpu, target_ulong pc,
>>       desc.cflags = cflags;
>>       desc.trace_vcpu_dstate = *cpu->trace_dstate;
>>       desc.pc = pc;
>> -    phys_pc = get_page_addr_code(desc.env, pc);
>> -    if (phys_pc == -1) {
>> -        return NULL;
>> -    }
>>       desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
>> +
>>       tb_hash = tb_hash_func(phys_pc, pc, flags, cflags, *cpu-
>>> trace_dstate);
>>       tb = qht_lookup_custom(&tb_ctx.htable, &desc, tb_hash,
>> tb_lookup_cmp);
>>       if (tb == NULL) {
>> @@ -371,6 +367,7 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState
>> *env)
>>       TranslationBlock *tb;
>>       target_ulong cs_base, pc;
>>       uint32_t flags, cflags;
>> +    tb_page_addr_t phys_pc;
>>   
>>       cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
>>   
>> @@ -379,7 +376,12 @@ const void *HELPER(lookup_tb_ptr)(CPUArchState
>> *env)
>>           cpu_loop_exit(cpu);
>>       }
>>   
>> -    tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
>> +    phys_pc = get_page_addr_code(env, pc);
>> +    if (phys_pc == -1) {
>> +        return tcg_code_gen_epilogue;
>> +    }
>> +
>> +    tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
>>       if (tb == NULL) {
>>           return tcg_code_gen_epilogue;
>>       }
>> @@ -482,6 +484,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
>>       TranslationBlock *tb;
>>       target_ulong cs_base, pc;
>>       uint32_t flags, cflags;
>> +    tb_page_addr_t phys_pc;
>>       int tb_exit;
>>   
>>       if (sigsetjmp(cpu->jmp_env, 0) == 0) {
>> @@ -504,7 +507,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
>>            * Any breakpoint for this insn will have been recognized
>> earlier.
>>            */
>>   
>> -        tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
>> +        phys_pc = get_page_addr_code(env, pc);
>> +        if (phys_pc == -1) {
>> +            tb = NULL;
>> +        } else {
>> +            tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
>> cflags);
>> +        }
>>           if (tb == NULL) {
>>               mmap_lock();
>>               tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
>> @@ -949,6 +957,7 @@ int cpu_exec(CPUState *cpu)
>>               TranslationBlock *tb;
>>               target_ulong cs_base, pc;
>>               uint32_t flags, cflags;
>> +            tb_page_addr_t phys_pc;
>>   
>>               cpu_get_tb_cpu_state(cpu->env_ptr, &pc, &cs_base,
>> &flags);
>>   
>> @@ -970,7 +979,12 @@ int cpu_exec(CPUState *cpu)
>>                   break;
>>               }
>>   
>> -            tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
>> +            phys_pc = get_page_addr_code(cpu->env_ptr, pc);
>> +            if (phys_pc == -1) {
>> +                tb = NULL;
>> +            } else {
>> +                tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
>> cflags);
>> +            }
>>               if (tb == NULL) {
>>                   mmap_lock();
>>                   tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> 
> This patch did not make it into v2, but having get_page_addr_code()
> before tb_lookup() in helper_lookup_tb_ptr() helped raise the exception
> when trying to execute a no-longer-executable TB.
> 
> Was it dropped for performance reasons?

Ah, yes.  I dropped it because I ran into some regression, and started minimizing the 
tree.  Because of the extra lock that needed to be held (next patch, also dropped), I 
couldn't prove this actually helped.

I think the bit that's causing your user-only failure at the moment is the jump cache. 
This patch hoisted the page table check before the jmp_cache.  For system, cputlb.c takes 
care of flushing the jump cache with page table changes; we still don't have anything in 
user-only that takes care of that.


r~
Ilya Leoshkevich Aug. 17, 2022, 11:08 a.m. UTC | #3
On Tue, 2022-08-16 at 20:42 -0500, Richard Henderson wrote:
> On 8/16/22 18:43, Ilya Leoshkevich wrote:
> > On Fri, 2022-08-12 at 11:07 -0700, Richard Henderson wrote:
> > > We will want to re-use the result of get_page_addr_code
> > > beyond the scope of tb_lookup.
> > > 
> > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> > > ---
> > >   accel/tcg/cpu-exec.c | 34 ++++++++++++++++++++++++----------
> > >   1 file changed, 24 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> > > index a9b7053274..889355b341 100644
> > > --- a/accel/tcg/cpu-exec.c
> > > +++ b/accel/tcg/cpu-exec.c
> > > @@ -209,13 +209,12 @@ static bool tb_lookup_cmp(const void *p,
> > > const
> > > void *d)
> > >   }
> > >   
> > >   /* Might cause an exception, so have a longjmp destination
> > > ready */
> > > -static TranslationBlock *tb_lookup(CPUState *cpu, target_ulong
> > > pc,
> > > -                                   target_ulong cs_base,
> > > +static TranslationBlock *tb_lookup(CPUState *cpu, tb_page_addr_t
> > > phys_pc,
> > > +                                   target_ulong pc, target_ulong
> > > cs_base,
> > >                                      uint32_t flags, uint32_t
> > > cflags)
> > >   {
> > >       CPUArchState *env = cpu->env_ptr;
> > >       TranslationBlock *tb;
> > > -    tb_page_addr_t phys_pc;
> > >       struct tb_desc desc;
> > >       uint32_t jmp_hash, tb_hash;
> > >   
> > > @@ -240,11 +239,8 @@ static TranslationBlock *tb_lookup(CPUState
> > > *cpu, target_ulong pc,
> > >       desc.cflags = cflags;
> > >       desc.trace_vcpu_dstate = *cpu->trace_dstate;
> > >       desc.pc = pc;
> > > -    phys_pc = get_page_addr_code(desc.env, pc);
> > > -    if (phys_pc == -1) {
> > > -        return NULL;
> > > -    }
> > >       desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
> > > +
> > >       tb_hash = tb_hash_func(phys_pc, pc, flags, cflags, *cpu-
> > > > trace_dstate);
> > >       tb = qht_lookup_custom(&tb_ctx.htable, &desc, tb_hash,
> > > tb_lookup_cmp);
> > >       if (tb == NULL) {
> > > @@ -371,6 +367,7 @@ const void
> > > *HELPER(lookup_tb_ptr)(CPUArchState
> > > *env)
> > >       TranslationBlock *tb;
> > >       target_ulong cs_base, pc;
> > >       uint32_t flags, cflags;
> > > +    tb_page_addr_t phys_pc;
> > >   
> > >       cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > >   
> > > @@ -379,7 +376,12 @@ const void
> > > *HELPER(lookup_tb_ptr)(CPUArchState
> > > *env)
> > >           cpu_loop_exit(cpu);
> > >       }
> > >   
> > > -    tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> > > +    phys_pc = get_page_addr_code(env, pc);
> > > +    if (phys_pc == -1) {
> > > +        return tcg_code_gen_epilogue;
> > > +    }
> > > +
> > > +    tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
> > >       if (tb == NULL) {
> > >           return tcg_code_gen_epilogue;
> > >       }
> > > @@ -482,6 +484,7 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > >       TranslationBlock *tb;
> > >       target_ulong cs_base, pc;
> > >       uint32_t flags, cflags;
> > > +    tb_page_addr_t phys_pc;
> > >       int tb_exit;
> > >   
> > >       if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> > > @@ -504,7 +507,12 @@ void cpu_exec_step_atomic(CPUState *cpu)
> > >            * Any breakpoint for this insn will have been
> > > recognized
> > > earlier.
> > >            */
> > >   
> > > -        tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> > > +        phys_pc = get_page_addr_code(env, pc);
> > > +        if (phys_pc == -1) {
> > > +            tb = NULL;
> > > +        } else {
> > > +            tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
> > > cflags);
> > > +        }
> > >           if (tb == NULL) {
> > >               mmap_lock();
> > >               tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> > > @@ -949,6 +957,7 @@ int cpu_exec(CPUState *cpu)
> > >               TranslationBlock *tb;
> > >               target_ulong cs_base, pc;
> > >               uint32_t flags, cflags;
> > > +            tb_page_addr_t phys_pc;
> > >   
> > >               cpu_get_tb_cpu_state(cpu->env_ptr, &pc, &cs_base,
> > > &flags);
> > >   
> > > @@ -970,7 +979,12 @@ int cpu_exec(CPUState *cpu)
> > >                   break;
> > >               }
> > >   
> > > -            tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
> > > +            phys_pc = get_page_addr_code(cpu->env_ptr, pc);
> > > +            if (phys_pc == -1) {
> > > +                tb = NULL;
> > > +            } else {
> > > +                tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags,
> > > cflags);
> > > +            }
> > >               if (tb == NULL) {
> > >                   mmap_lock();
> > >                   tb = tb_gen_code(cpu, pc, cs_base, flags,
> > > cflags);
> > 
> > This patch did not make it into v2, but having get_page_addr_code()
> > before tb_lookup() in helper_lookup_tb_ptr() helped raise the
> > exception
> > when trying to execute a no-longer-executable TB.
> > 
> > Was it dropped for performance reasons?
> 
> Ah, yes.  I dropped it because I ran into some regression, and
> started minimizing the 
> tree.  Because of the extra lock that needed to be held (next patch,
> also dropped), I 
> couldn't prove this actually helped.
> 
> I think the bit that's causing your user-only failure at the moment
> is the jump cache. 
> This patch hoisted the page table check before the jmp_cache.  For
> system, cputlb.c takes 
> care of flushing the jump cache with page table changes; we still
> don't have anything in 
> user-only that takes care of that.
> 
> 
> r~
> 

Would something like this be okay?

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 27435b97dbd..9421c84d991 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1152,6 +1152,27 @@ static inline void
tb_jmp_unlink(TranslationBlock *dest)
     qemu_spin_unlock(&dest->jmp_lock);
 }
 
+static void cpu_tb_jmp_cache_remove(TranslationBlock *tb)
+{
+    CPUState *cpu;
+    uint32_t h;
+
+    /* remove the TB from the hash list */
+    if (TARGET_TB_PCREL) {
+        /* Any TB may be at any virtual address */
+        CPU_FOREACH(cpu) {
+            cpu_tb_jmp_cache_clear(cpu);
+        }
+    } else {
+        h = tb_jmp_cache_hash_func(tb_pc(tb));
+        CPU_FOREACH(cpu) {
+            if (qatomic_read(&cpu->tb_jmp_cache[h].tb) == tb) {
+                qatomic_set(&cpu->tb_jmp_cache[h].tb, NULL);
+            }
+        }
+    }
+}
+
 /*
  * In user-mode, call with mmap_lock held.
  * In !user-mode, if @rm_from_page_list is set, call with the TB's
pages'
@@ -1159,7 +1180,6 @@ static inline void tb_jmp_unlink(TranslationBlock
*dest)
  */
 static void do_tb_phys_invalidate(TranslationBlock *tb, bool
rm_from_page_list)
 {
-    CPUState *cpu;
     PageDesc *p;
     uint32_t h;
     tb_page_addr_t phys_pc;
@@ -1190,20 +1210,7 @@ static void
do_tb_phys_invalidate(TranslationBlock *tb, bool rm_from_page_list)
         }
     }
 
-    /* remove the TB from the hash list */
-    if (TARGET_TB_PCREL) {
-        /* Any TB may be at any virtual address */
-        CPU_FOREACH(cpu) {
-            cpu_tb_jmp_cache_clear(cpu);
-        }
-    } else {
-        h = tb_jmp_cache_hash_func(tb_pc(tb));
-        CPU_FOREACH(cpu) {
-            if (qatomic_read(&cpu->tb_jmp_cache[h].tb) == tb) {
-                qatomic_set(&cpu->tb_jmp_cache[h].tb, NULL);
-            }
-        }
-    }
+    cpu_tb_jmp_cache_remove(tb);
 
     /* suppress this TB from the two jump lists */
     tb_remove_from_jmp_list(tb, 0);
@@ -2243,6 +2250,13 @@ void page_set_flags(target_ulong start,
target_ulong end, int flags)
             (flags & PAGE_WRITE) &&
             p->first_tb) {
             tb_invalidate_phys_page(addr, 0);
+        } else {
+            TranslationBlock *tb;
+            int n;
+
+            PAGE_FOR_EACH_TB(p, tb, n) {
+                cpu_tb_jmp_cache_remove(tb);
+            }
         }
         if (reset_target_data) {
             g_free(p->target_data);
Richard Henderson Aug. 17, 2022, 1:15 p.m. UTC | #4
On 8/17/22 06:08, Ilya Leoshkevich wrote:
> @@ -2243,6 +2250,13 @@ void page_set_flags(target_ulong start,
> target_ulong end, int flags)
>               (flags & PAGE_WRITE) &&
>               p->first_tb) {
>               tb_invalidate_phys_page(addr, 0);
> +        } else {
> +            TranslationBlock *tb;
> +            int n;
> +
> +            PAGE_FOR_EACH_TB(p, tb, n) {
> +                cpu_tb_jmp_cache_remove(tb);
> +            }
>           }

Here you would use tb_jmp_cache_clear_page(), which should be moved out of cputlb.c.


r~
Ilya Leoshkevich Aug. 17, 2022, 1:27 p.m. UTC | #5
On Wed, 2022-08-17 at 08:15 -0500, Richard Henderson wrote:
> On 8/17/22 06:08, Ilya Leoshkevich wrote:
> > @@ -2243,6 +2250,13 @@ void page_set_flags(target_ulong start,
> > target_ulong end, int flags)
> >               (flags & PAGE_WRITE) &&
> >               p->first_tb) {
> >               tb_invalidate_phys_page(addr, 0);
> > +        } else {
> > +            TranslationBlock *tb;
> > +            int n;
> > +
> > +            PAGE_FOR_EACH_TB(p, tb, n) {
> > +                cpu_tb_jmp_cache_remove(tb);
> > +            }
> >           }
> 
> Here you would use tb_jmp_cache_clear_page(), which should be moved
> out of cputlb.c.

That was actually the first thing I tried.

Unfortunately tb_jmp_cache_clear_page() relies on
tb_jmp_cache_hash_func() returning the same top bits for addresses on
the same page.  This is not the case for qemu-user: there this property
was traded for better hashing with quite impressive performance
improvements (6f1653180f570).
Richard Henderson Aug. 17, 2022, 1:38 p.m. UTC | #6
On 8/17/22 08:27, Ilya Leoshkevich wrote:
> On Wed, 2022-08-17 at 08:15 -0500, Richard Henderson wrote:
>> On 8/17/22 06:08, Ilya Leoshkevich wrote:
>>> @@ -2243,6 +2250,13 @@ void page_set_flags(target_ulong start,
>>> target_ulong end, int flags)
>>>                (flags & PAGE_WRITE) &&
>>>                p->first_tb) {
>>>                tb_invalidate_phys_page(addr, 0);
>>> +        } else {
>>> +            TranslationBlock *tb;
>>> +            int n;
>>> +
>>> +            PAGE_FOR_EACH_TB(p, tb, n) {
>>> +                cpu_tb_jmp_cache_remove(tb);
>>> +            }
>>>            }
>>
>> Here you would use tb_jmp_cache_clear_page(), which should be moved
>> out of cputlb.c.
> 
> That was actually the first thing I tried.
> 
> Unfortunately tb_jmp_cache_clear_page() relies on
> tb_jmp_cache_hash_func() returning the same top bits for addresses on
> the same page.  This is not the case for qemu-user: there this property
> was traded for better hashing with quite impressive performance
> improvements (6f1653180f570).

Oh my.  Well, we could

(1) revert that patch because the premise is wrong,
(2) go with your per-tb clearing,
(3) clear the whole thing with cpu_tb_jmp_cache_clear

Ideally we'd have some benchmark numbers to inform the choice...


r~
Richard Henderson Aug. 17, 2022, 1:42 p.m. UTC | #7
On 8/17/22 06:08, Ilya Leoshkevich wrote:
> +static void cpu_tb_jmp_cache_remove(TranslationBlock *tb)
> +{
> +    CPUState *cpu;
> +    uint32_t h;
> +
> +    /* remove the TB from the hash list */
> +    if (TARGET_TB_PCREL) {
> +        /* Any TB may be at any virtual address */
> +        CPU_FOREACH(cpu) {
> +            cpu_tb_jmp_cache_clear(cpu);
> +        }

This comment is not currently true for user-only.  Although there's an outstanding bug 
report about our failure to manage virtual aliasing in user-only...

> +            PAGE_FOR_EACH_TB(p, tb, n) {
> +                cpu_tb_jmp_cache_remove(tb);
> +            }

You wouldn't want to call cpu_tb_jmp_cache_clear() 99 times for the 99 tb's on the page.

For user-only, I think mprotect is rare enough that just clearing the whole cache once is 
sufficient.


r~
Ilya Leoshkevich Aug. 17, 2022, 2:07 p.m. UTC | #8
On Wed, 2022-08-17 at 08:38 -0500, Richard Henderson wrote:
> On 8/17/22 08:27, Ilya Leoshkevich wrote:
> > On Wed, 2022-08-17 at 08:15 -0500, Richard Henderson wrote:
> > > On 8/17/22 06:08, Ilya Leoshkevich wrote:
> > > > @@ -2243,6 +2250,13 @@ void page_set_flags(target_ulong start,
> > > > target_ulong end, int flags)
> > > >                (flags & PAGE_WRITE) &&
> > > >                p->first_tb) {
> > > >                tb_invalidate_phys_page(addr, 0);
> > > > +        } else {
> > > > +            TranslationBlock *tb;
> > > > +            int n;
> > > > +
> > > > +            PAGE_FOR_EACH_TB(p, tb, n) {
> > > > +                cpu_tb_jmp_cache_remove(tb);
> > > > +            }
> > > >            }
> > > 
> > > Here you would use tb_jmp_cache_clear_page(), which should be
> > > moved
> > > out of cputlb.c.
> > 
> > That was actually the first thing I tried.
> > 
> > Unfortunately tb_jmp_cache_clear_page() relies on
> > tb_jmp_cache_hash_func() returning the same top bits for addresses
> > on
> > the same page.  This is not the case for qemu-user: there this
> > property
> > was traded for better hashing with quite impressive performance
> > improvements (6f1653180f570).
> 
> Oh my.  Well, we could
> 
> (1) revert that patch because the premise is wrong,
> (2) go with your per-tb clearing,
> (3) clear the whole thing with cpu_tb_jmp_cache_clear
> 
> Ideally we'd have some benchmark numbers to inform the choice...

FWIW 6f1653180f570 still looks useful.
Reverting it caused 620.omnetpp_s to take ~4% more time.
I ran the benchmark with reduced values in omnetpp.ini so as not to
wait forever, therefore the real figures might be closer to what the
commit message says. In any case this still shows that the patch has
measurable impact.
Richard Henderson Aug. 17, 2022, 4:07 p.m. UTC | #9
On 8/17/22 09:07, Ilya Leoshkevich wrote:
>> Oh my.  Well, we could
>>
>> (1) revert that patch because the premise is wrong,
>> (2) go with your per-tb clearing,
>> (3) clear the whole thing with cpu_tb_jmp_cache_clear
>>
>> Ideally we'd have some benchmark numbers to inform the choice...
> 
> FWIW 6f1653180f570 still looks useful.
> Reverting it caused 620.omnetpp_s to take ~4% more time.
> I ran the benchmark with reduced values in omnetpp.ini so as not to
> wait forever, therefore the real figures might be closer to what the
> commit message says. In any case this still shows that the patch has
> measurable impact.

Thanks for the testing.

I think option (3) will be best for user-only, because mprotect/munmap of existing code 
pages is rare -- usually only at process startup.


r~
diff mbox series

Patch

diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index a9b7053274..889355b341 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -209,13 +209,12 @@  static bool tb_lookup_cmp(const void *p, const void *d)
 }
 
 /* Might cause an exception, so have a longjmp destination ready */
-static TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
-                                   target_ulong cs_base,
+static TranslationBlock *tb_lookup(CPUState *cpu, tb_page_addr_t phys_pc,
+                                   target_ulong pc, target_ulong cs_base,
                                    uint32_t flags, uint32_t cflags)
 {
     CPUArchState *env = cpu->env_ptr;
     TranslationBlock *tb;
-    tb_page_addr_t phys_pc;
     struct tb_desc desc;
     uint32_t jmp_hash, tb_hash;
 
@@ -240,11 +239,8 @@  static TranslationBlock *tb_lookup(CPUState *cpu, target_ulong pc,
     desc.cflags = cflags;
     desc.trace_vcpu_dstate = *cpu->trace_dstate;
     desc.pc = pc;
-    phys_pc = get_page_addr_code(desc.env, pc);
-    if (phys_pc == -1) {
-        return NULL;
-    }
     desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
+
     tb_hash = tb_hash_func(phys_pc, pc, flags, cflags, *cpu->trace_dstate);
     tb = qht_lookup_custom(&tb_ctx.htable, &desc, tb_hash, tb_lookup_cmp);
     if (tb == NULL) {
@@ -371,6 +367,7 @@  const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags, cflags;
+    tb_page_addr_t phys_pc;
 
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 
@@ -379,7 +376,12 @@  const void *HELPER(lookup_tb_ptr)(CPUArchState *env)
         cpu_loop_exit(cpu);
     }
 
-    tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
+    phys_pc = get_page_addr_code(env, pc);
+    if (phys_pc == -1) {
+        return tcg_code_gen_epilogue;
+    }
+
+    tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
     if (tb == NULL) {
         return tcg_code_gen_epilogue;
     }
@@ -482,6 +484,7 @@  void cpu_exec_step_atomic(CPUState *cpu)
     TranslationBlock *tb;
     target_ulong cs_base, pc;
     uint32_t flags, cflags;
+    tb_page_addr_t phys_pc;
     int tb_exit;
 
     if (sigsetjmp(cpu->jmp_env, 0) == 0) {
@@ -504,7 +507,12 @@  void cpu_exec_step_atomic(CPUState *cpu)
          * Any breakpoint for this insn will have been recognized earlier.
          */
 
-        tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
+        phys_pc = get_page_addr_code(env, pc);
+        if (phys_pc == -1) {
+            tb = NULL;
+        } else {
+            tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
+        }
         if (tb == NULL) {
             mmap_lock();
             tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
@@ -949,6 +957,7 @@  int cpu_exec(CPUState *cpu)
             TranslationBlock *tb;
             target_ulong cs_base, pc;
             uint32_t flags, cflags;
+            tb_page_addr_t phys_pc;
 
             cpu_get_tb_cpu_state(cpu->env_ptr, &pc, &cs_base, &flags);
 
@@ -970,7 +979,12 @@  int cpu_exec(CPUState *cpu)
                 break;
             }
 
-            tb = tb_lookup(cpu, pc, cs_base, flags, cflags);
+            phys_pc = get_page_addr_code(cpu->env_ptr, pc);
+            if (phys_pc == -1) {
+                tb = NULL;
+            } else {
+                tb = tb_lookup(cpu, phys_pc, pc, cs_base, flags, cflags);
+            }
             if (tb == NULL) {
                 mmap_lock();
                 tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);