[2/2] cpu-exec: remove tb_lock from the hot-path

Message ID 1467389770-9738-3-git-send-email-alex.bennee@linaro.org
State Superseded

Commit Message

Alex Bennée July 1, 2016, 4:16 p.m. UTC
Lock contention in the hot path of moving between existing patched
TranslationBlocks is the main drag in multithreaded performance. This
patch pushes the tb_lock() usage down to the two places that really need
it:

  - code generation (tb_gen_code)
  - jump patching (tb_add_jump)

The rest of the code doesn't really need to hold a lock as it is either
using per-CPU structures, atomically updated or designed to be used in
concurrent read situations (qht_lookup).

To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
locks become NOPs anyway until the MTTCG work is completed.
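
In outline this is double-checked lookup: try the lock-free paths first,
and only when code generation is unavoidable take the locks and re-check,
since another thread may have translated the same block in the meantime.
A self-contained sketch of the pattern (the Block type, shared_block and
generate_block() are illustrative stand-ins, not QEMU APIs):

#include <pthread.h>
#include <stdlib.h>

typedef struct Block { int dummy; } Block;

static Block *shared_block;                 /* published block, or NULL */
static pthread_mutex_t gen_lock = PTHREAD_MUTEX_INITIALIZER;

static Block *generate_block(void)
{
    return calloc(1, sizeof(Block));
}

Block *find_or_generate(void)
{
    /* 1. Lock-free fast path, like the tb_jmp_cache/qht_lookup reads. */
    Block *b = __atomic_load_n(&shared_block, __ATOMIC_ACQUIRE);

    if (!b) {
        pthread_mutex_lock(&gen_lock);
        /* 2. Re-check under the lock: another thread may have generated
         *    the block while we were waiting for the mutex. */
        b = __atomic_load_n(&shared_block, __ATOMIC_ACQUIRE);
        if (!b) {
            /* 3. Generate, then publish with release semantics so the
             *    initialised contents are visible before the pointer. */
            b = generate_block();
            __atomic_store_n(&shared_block, b, __ATOMIC_RELEASE);
        }
        pthread_mutex_unlock(&gen_lock);
    }
    return b;
}

The release/acquire pairing on the published pointer is what makes the
lock-free read on the fast path safe.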

Signed-off-by: Alex Bennée <alex.bennee@linaro.org>


---
v3
  - fix merge conflicts with Sergey's patch
v4
  - revert name tweaking
  - drop test jmp_list_next outside lock
  - mention lock NOPs in comments
---
 cpu-exec.c | 49 ++++++++++++++++++++++---------------------------
 1 file changed, 22 insertions(+), 27 deletions(-)

-- 
2.7.4

Comments

Alex Bennée July 4, 2016, 11:45 a.m. UTC | #1
Emilio G. Cota <cota@braap.org> writes:

> On Fri, Jul 01, 2016 at 17:16:10 +0100, Alex Bennée wrote:
>> Lock contention in the hot path of moving between existing patched
>> TranslationBlocks is the main drag in multithreaded performance. This
>> patch pushes the tb_lock() usage down to the two places that really need
>> it:
>>
>>   - code generation (tb_gen_code)
>>   - jump patching (tb_add_jump)
>>
>> The rest of the code doesn't really need to hold a lock as it is either
>> using per-CPU structures, atomically updated or designed to be used in
>> concurrent read situations (qht_lookup).
>>
>> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> locks become NOPs anyway until the MTTCG work is completed.
>
> From a scalability point of view it would be better to have a single
> critical section.

You mean merge the critical region for patching and code-generation?

>
> From a correctness point of view, we're reading tb->page_addr[1]
> without holding a lock. This field is set after qht_insert(tb),
> so we might read a yet-uninitialized value.
>
> I propose to just extend the critical section, like we used to
> do with tcg_lock_reset.
>
> 		Emilio

--
Alex Bennée
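
Emilio's correctness point above is a publication-ordering bug in
miniature: if a field is written only after the object is inserted into
the shared hash table, a lock-free reader can observe the object before
that field is initialised. A reduced illustration (not the QEMU code;
published_tb stands in for the QHT entry):

#include <stdint.h>

typedef struct TB { uintptr_t page_addr[2]; } TB;

static TB *published_tb;

void writer(TB *tb)
{
    tb->page_addr[0] = 0x1000;
    /* "qht_insert": the TB is now visible to lock-free readers... */
    __atomic_store_n(&published_tb, tb, __ATOMIC_RELEASE);
    /* ...so this assignment comes too late. */
    tb->page_addr[1] = 0x2000;
}

uintptr_t reader(void)
{
    /* "qht_lookup" */
    TB *tb = __atomic_load_n(&published_tb, __ATOMIC_ACQUIRE);
    /* Even with correct acquire/release pairing, page_addr[1] was
     * written after publication, so this read can return garbage. */
    return tb ? tb->page_addr[1] : 0;
}
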
Alex Bennée July 5, 2016, 11:14 a.m. UTC | #2
Emilio G. Cota <cota@braap.org> writes:

> On Mon, Jul 04, 2016 at 12:45:52 +0100, Alex Bennée wrote:
>>
>> Emilio G. Cota <cota@braap.org> writes:
>>
>> > On Fri, Jul 01, 2016 at 17:16:10 +0100, Alex Bennée wrote:
>> >> Lock contention in the hot path of moving between existing patched
>> >> TranslationBlocks is the main drag in multithreaded performance. This
>> >> patch pushes the tb_lock() usage down to the two places that really need
>> >> it:
>> >>
>> >>   - code generation (tb_gen_code)
>> >>   - jump patching (tb_add_jump)
>> >>
>> >> The rest of the code doesn't really need to hold a lock as it is either
>> >> using per-CPU structures, atomically updated or designed to be used in
>> >> concurrent read situations (qht_lookup).
>> >>
>> >> To keep things simple I removed the #ifdef CONFIG_USER_ONLY stuff as the
>> >> locks become NOPs anyway until the MTTCG work is completed.
>> >
>> > From a scalability point of view it would be better to have a single
>> > critical section.
>>
>> You mean merge the critical region for patching and code-generation?
>
> Yes, I'd keep the lock held and drop it (if it was held) after the patching
> is done, like IIRC we used to do:
> (snip)
>> > I propose to just extend the critical section, like we used to
>> > do with tcg_lock_reset.

Hmm, so I came up with this:

/*
 * Patch the last TB with a jump to the current TB.
 *
 * Modification of the TB has to be protected with tb_lock which can
 * either be already held or taken here.
 */
static inline void maybe_patch_last_tb(CPUState *cpu,
                                       TranslationBlock *tb,
                                       TranslationBlock **last_tb,
                                       int tb_exit,
                                       bool locked)
{
    if (cpu->tb_flushed) {
        /* Ensure that no TB jump will be modified as the
         * translation buffer has been flushed.
         */
        *last_tb = NULL;
        cpu->tb_flushed = false;
    }
#ifndef CONFIG_USER_ONLY
    /* We don't take care of direct jumps when address mapping changes in
     * system emulation. So it's not safe to make a direct jump to a TB
     * spanning two pages because the mapping for the second page can change.
     */
    if (tb->page_addr[1] != -1) {
        *last_tb = NULL;
    }
#endif
    /* See if we can patch the calling TB. */
    if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
        if (!locked) {
            tb_lock();
        }
        tb_add_jump(*last_tb, tb_exit, tb);
        if (!locked) {
            tb_unlock();
        }
    }
}

/*
 * tb_find - find next TB, possibly generating it
 *
 * There is a multi-level lookup for finding the next TB which avoids
 * locks unless generation is required.
 *
 * 1. Lookup via the per-vcpu tb_jmp_cache
 * 2. Lookup via tb_find_physical (using QHT)
 *
 * If both of those fail then we need to grab the mmap_lock and
 * tb_lock and do code generation.
 *
 * As the jump patching of code also needs to be protected by locks we
 * have multiple paths into maybe_patch_last_tb taking advantage of
 * the fact we may already have locks held for code generation.
 */
static TranslationBlock *tb_find(CPUState *cpu,
                                 TranslationBlock **last_tb,
                                 int tb_exit)
{
    CPUArchState *env = (CPUArchState *)cpu->env_ptr;
    TranslationBlock *tb;
    target_ulong cs_base, pc;
    unsigned int h;
    uint32_t flags;

    /* we record a subset of the CPU state. It will
       always be the same before a given translated block
       is executed. */
    cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
    h = tb_jmp_cache_hash_func(pc);
    tb = atomic_read(&cpu->tb_jmp_cache[h]);

    if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                 tb->flags != flags)) {

        /* Ensure that we won't find a TB in the shared hash table
         * if it is being invalidated by some other thread.
         * Otherwise we'd put it back to CPU's local cache.
         * Pairs with smp_wmb() in tb_phys_invalidate(). */
        smp_rmb();
        tb = tb_find_physical(cpu, pc, cs_base, flags);

        if (!tb) {
            /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
             * taken outside tb_lock. As system emulation is currently
             * single threaded the locks are NOPs.
             */
            mmap_lock();
            tb_lock();

            /* There's a chance that our desired tb has been translated while
             * taking the locks so we check again inside the lock.
             */
            tb = tb_find_physical(cpu, pc, cs_base, flags);
            if (!tb) {
                /* if no translated code available, then translate it now */
                tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
            }
            maybe_patch_last_tb(cpu, tb, last_tb, tb_exit, true);

            tb_unlock();
            mmap_unlock();
        } else {
            maybe_patch_last_tb(cpu, tb, last_tb, tb_exit, false);
        }

        /* We update the TB in the virtual pc hash table for the fast lookup */
        atomic_set(&cpu->tb_jmp_cache[h], tb);
    } else {
        maybe_patch_last_tb(cpu, tb, last_tb, tb_exit, false);
    }

    return tb;
}

But it doesn't seem to make much difference to the microbenchmark, and I
think it makes the code a lot trickier to follow.

Is it worth it?

--
Alex Bennée
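
The smp_rmb() in tb_find above pairs with an smp_wmb() in
tb_phys_invalidate(). Reduced to its general shape, the pairing is the
classic message-passing pattern (a generic illustration using GCC
builtins rather than QEMU's barrier macros):

#include <stdbool.h>

static int payload;
static bool ready;

void producer(void)
{
    payload = 42;
    __atomic_thread_fence(__ATOMIC_RELEASE);           /* smp_wmb() */
    __atomic_store_n(&ready, true, __ATOMIC_RELAXED);
}

int consumer(void)
{
    if (__atomic_load_n(&ready, __ATOMIC_RELAXED)) {
        __atomic_thread_fence(__ATOMIC_ACQUIRE);       /* smp_rmb() */
        return payload;       /* guaranteed to observe 42 */
    }
    return -1;                /* flag not seen yet */
}
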
Alex Bennée July 5, 2016, 1:11 p.m. UTC | #3
Paolo Bonzini <pbonzini@redhat.com> writes:

> On 05/07/2016 13:14, Alex Bennée wrote:
>> /*
>>  * Patch the last TB with a jump to the current TB.
>>  *
>>  * Modification of the TB has to be protected with tb_lock which can
>>  * either be already held or taken here.
>>  */
>> static inline void maybe_patch_last_tb(CPUState *cpu,
>>                                        TranslationBlock *tb,
>>                                        TranslationBlock **last_tb,
>>                                        int tb_exit,
>>                                        bool locked)
>> {
>>     if (cpu->tb_flushed) {
>>         /* Ensure that no TB jump will be modified as the
>>          * translation buffer has been flushed.
>>          */
>>         *last_tb = NULL;
>>         cpu->tb_flushed = false;
>>     }
>> #ifndef CONFIG_USER_ONLY
>>     /* We don't take care of direct jumps when address mapping changes in
>>      * system emulation. So it's not safe to make a direct jump to a TB
>>      * spanning two pages because the mapping for the second page can change.
>>      */
>>     if (tb->page_addr[1] != -1) {
>>         *last_tb = NULL;
>>     }
>> #endif
>>     /* See if we can patch the calling TB. */
>>     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>         if (!locked) {
>>             tb_lock();
>>         }
>>         tb_add_jump(*last_tb, tb_exit, tb);
>>         if (!locked) {
>>             tb_unlock();
>>         }
>>     }
>> }
>
> Why not add tb_lock_recursive() and tb_lock_reset()?

I thought we didn't like having recursive locking? I agree it would make
things a little neater though.

--
Alex Bennée
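
For reference, tb_lock_reset() can be built from a per-thread "held"
flag, in the style of the user-mode tb_lock implementation of the time.
A simplified sketch (a plain pthread mutex stands in for the real
tb_ctx lock):

#include <pthread.h>
#include <stdbool.h>
#include <assert.h>

static pthread_mutex_t tb_mutex = PTHREAD_MUTEX_INITIALIZER;
static __thread bool have_tb_lock;

void tb_lock(void)
{
    assert(!have_tb_lock);          /* not recursive: catch double-lock */
    pthread_mutex_lock(&tb_mutex);
    have_tb_lock = true;
}

void tb_unlock(void)
{
    assert(have_tb_lock);
    have_tb_lock = false;
    pthread_mutex_unlock(&tb_mutex);
}

/* Drop the lock if and only if this thread holds it. Exit paths can
 * call this unconditionally; a tb_lock_recursive() variant would
 * instead skip the lock operation when the flag is already set. */
void tb_lock_reset(void)
{
    if (have_tb_lock) {
        have_tb_lock = false;
        pthread_mutex_unlock(&tb_mutex);
    }
}
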
Alex Bennée July 5, 2016, 4 p.m. UTC | #4
Sergey Fedorov <serge.fdrv@gmail.com> writes:

> On 05/07/16 16:42, Paolo Bonzini wrote:
>>
>> On 05/07/2016 15:11, Alex Bennée wrote:
>>> Paolo Bonzini <pbonzini@redhat.com> writes:
>>>
>>>> On 05/07/2016 13:14, Alex Bennée wrote:
>>>>> /*
>>>>>  * Patch the last TB with a jump to the current TB.
>>>>>  *
>>>>>  * Modification of the TB has to be protected with tb_lock which can
>>>>>  * either be already held or taken here.
>>>>>  */
>>>>> static inline void maybe_patch_last_tb(CPUState *cpu,
>>>>>                                        TranslationBlock *tb,
>>>>>                                        TranslationBlock **last_tb,
>>>>>                                        int tb_exit,
>>>>>                                        bool locked)
>>>>> {
>>>>>     if (cpu->tb_flushed) {
>>>>>         /* Ensure that no TB jump will be modified as the
>>>>>          * translation buffer has been flushed.
>>>>>          */
>>>>>         *last_tb = NULL;
>>>>>         cpu->tb_flushed = false;
>>>>>     }
>>>>> #ifndef CONFIG_USER_ONLY
>>>>>     /* We don't take care of direct jumps when address mapping changes in
>>>>>      * system emulation. So it's not safe to make a direct jump to a TB
>>>>>      * spanning two pages because the mapping for the second page can change.
>>>>>      */
>>>>>     if (tb->page_addr[1] != -1) {
>>>>>         *last_tb = NULL;
>>>>>     }
>>>>> #endif
>>>>>     /* See if we can patch the calling TB. */
>>>>>     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
>>>>>         if (!locked) {
>>>>>             tb_lock();
>>>>>         }
>>>>>         tb_add_jump(*last_tb, tb_exit, tb);
>>>>>         if (!locked) {
>>>>>             tb_unlock();
>>>>>         }
>>>>>     }
>>>>> }
>>>> Why not add tb_lock_recursive() and tb_lock_reset()?
>>> I thought we didn't like having recursive locking? I agree it would make
>>> things a little neater though.
>> I didn't like having recursive mutexes (because recursive mutexes
>> encourage you to be sloppy).  Explicitly tagging some tb_lock()s as
>> recursive is fine, though.  The explicit tag tells you to be careful.
>
> We could also introduce mmap_lock_reset() similar to tb_lock_reset().
> Then we can remove tb_unlock() and mmap_unlock() from tb_find_slow(),
> call tb_lock() and mmap_lock() right before tb_add_jump(), and then call
> tb_lock_reset() and mmap_lock_reset() at the end of tb_find_fast(). This
> would render tb_lock() pretty useless though, since it would be always
> held under mmap_lock().

I'm about to post v2. I've split all this aggressive extension of the
critical section into additional patches, as I'm not sure it is really
worth it.

The big win is doing the tb_jmp_cache and first tb_find_physical lookups
outside the lock. Trying to avoid bouncing the tb_lock() when patching
doesn't seem to make any difference in my microbenchmarks.

>
> Kind regards,
> Sergey

--
Alex Bennée

Patch

diff --git a/cpu-exec.c b/cpu-exec.c
index 10ce1cb..b731774 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -291,35 +291,29 @@  static TranslationBlock *tb_find_slow(CPUState *cpu,
      * Pairs with smp_wmb() in tb_phys_invalidate(). */
     smp_rmb();
     tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        goto found;
-    }
+    if (!tb) {
 
-#ifdef CONFIG_USER_ONLY
-    /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
-     * taken outside tb_lock.  Since we're momentarily dropping
-     * tb_lock, there's a chance that our desired tb has been
-     * translated.
-     */
-    tb_unlock();
-    mmap_lock();
-    tb_lock();
-    tb = tb_find_physical(cpu, pc, cs_base, flags);
-    if (tb) {
-        mmap_unlock();
-        goto found;
-    }
-#endif
+        /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
+         * taken outside tb_lock. As system emulation is currently
+         * single threaded the locks are NOPs.
+         */
+        mmap_lock();
+        tb_lock();
 
-    /* if no translated code available, then translate it now */
-    tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        /* There's a chance that our desired tb has been translated while
+         * taking the locks so we check again inside the lock.
+         */
+        tb = tb_find_physical(cpu, pc, cs_base, flags);
+        if (!tb) {
+            /* if no translated code available, then translate it now */
+            tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+        }
 
-#ifdef CONFIG_USER_ONLY
-    mmap_unlock();
-#endif
+        tb_unlock();
+        mmap_unlock();
+    }
 
-found:
-    /* we add the TB in the virtual pc hash table */
+    /* We add the TB in the virtual pc hash table for the fast lookup */
     atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
     return tb;
 }
@@ -337,7 +331,6 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
        always be the same before a given translated block
        is executed. */
     cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
-    tb_lock();
     tb = atomic_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
     if (unlikely(!tb || tb->pc != pc || tb->cs_base != cs_base ||
                  tb->flags != flags)) {
@@ -359,11 +352,13 @@  static inline TranslationBlock *tb_find_fast(CPUState *cpu,
         *last_tb = NULL;
     }
 #endif
+
     /* See if we can patch the calling TB. */
     if (*last_tb && !qemu_loglevel_mask(CPU_LOG_TB_NOCHAIN)) {
+        tb_lock();
         tb_add_jump(*last_tb, tb_exit, tb);
+        tb_unlock();
     }
-    tb_unlock();
     return tb;
 }