diff mbox series

[for-4.0,v2,09/37] tcg/i386: Use TCG_TARGET_NEED_LDST_OOL_LABELS

Message ID 20181123144558.5048-10-richard.henderson@linaro.org
State New
Headers show
Series tcg: Assorted cleanups | expand

Commit Message

Richard Henderson Nov. 23, 2018, 2:45 p.m. UTC
Move the entire memory operation out of line.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 tcg/i386/tcg-target.h     |   2 +-
 tcg/i386/tcg-target.inc.c | 391 ++++++++++++++++----------------------
 2 files changed, 162 insertions(+), 231 deletions(-)

-- 
2.17.2

Comments

Alex Bennée Nov. 30, 2018, 5:22 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> Move the entire memory operation out of line.


Given Emilio's numbers is it likely we will want to support both options
given the variability on x86?

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  tcg/i386/tcg-target.h     |   2 +-

>  tcg/i386/tcg-target.inc.c | 391 ++++++++++++++++----------------------

>  2 files changed, 162 insertions(+), 231 deletions(-)

>

> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h

> index 2441658865..1b2d4e1b0d 100644

> --- a/tcg/i386/tcg-target.h

> +++ b/tcg/i386/tcg-target.h

> @@ -220,7 +220,7 @@ static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,

>  #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)

>

>  #ifdef CONFIG_SOFTMMU

> -#define TCG_TARGET_NEED_LDST_LABELS

> +#define TCG_TARGET_NEED_LDST_OOL_LABELS

>  #endif

>  #define TCG_TARGET_NEED_POOL_LABELS

>

> diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c

> index 50e5dc31b3..5c68cbd43d 100644

> --- a/tcg/i386/tcg-target.inc.c

> +++ b/tcg/i386/tcg-target.inc.c

> @@ -1643,7 +1643,7 @@ static void tcg_out_nopn(TCGContext *s, int n)

>  }

>

>  #if defined(CONFIG_SOFTMMU)

> -#include "tcg-ldst.inc.c"

> +#include "tcg-ldst-ool.inc.c"

>

>  /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,

>   *                                     int mmu_idx, uintptr_t ra)

> @@ -1656,6 +1656,14 @@ static void * const qemu_ld_helpers[16] = {

>      [MO_BEUW] = helper_be_lduw_mmu,

>      [MO_BEUL] = helper_be_ldul_mmu,

>      [MO_BEQ]  = helper_be_ldq_mmu,

> +

> +    [MO_SB]   = helper_ret_ldsb_mmu,

> +    [MO_LESW] = helper_le_ldsw_mmu,

> +    [MO_BESW] = helper_be_ldsw_mmu,

> +#if TCG_TARGET_REG_BITS == 64

> +    [MO_LESL] = helper_le_ldsl_mmu,

> +    [MO_BESL] = helper_be_ldsl_mmu,

> +#endif


Can we mention why these are added in the commit message please?

<stsquad> rth: why has qemu_ld_helpers been filled out? Did those loads not
    happen before?
<rth> stsquad, previously we performed sign-extensions inline after
    returning from the helper; with the change to a tail call we can't
    do that anymore.
<stsquad> rth: maybe that could go in the commit message then...


>  };

>

>  /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,

> @@ -1765,18 +1773,18 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,

>      }

>

>      /* jne slow_path */

> -    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);

> +    tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);

>      label_ptr[0] = s->code_ptr;

> -    s->code_ptr += 4;

> +    s->code_ptr += 1;

>

>      if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

>          /* cmp 4(r0), addrhi */

>          tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);

>

>          /* jne slow_path */

> -        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);

> +        tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);

>          label_ptr[1] = s->code_ptr;

> -        s->code_ptr += 4;

> +        s->code_ptr += 1;

>      }

>

>      /* TLB Hit.  */

> @@ -1788,181 +1796,6 @@ static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,

>      return base;

>  }

>

> -/*

> - * Record the context of a call to the out of line helper code for the slow path

> - * for a load or store, so that we can later generate the correct helper code

> - */

> -static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,

> -                                TCGReg datalo, TCGReg datahi,

> -                                TCGReg addrlo, TCGReg addrhi,

> -                                tcg_insn_unit *raddr,

> -                                tcg_insn_unit **label_ptr)

> -{

> -    TCGLabelQemuLdst *label = new_ldst_label(s);

> -

> -    label->is_ld = is_ld;

> -    label->oi = oi;

> -    label->datalo_reg = datalo;

> -    label->datahi_reg = datahi;

> -    label->addrlo_reg = addrlo;

> -    label->addrhi_reg = addrhi;

> -    label->raddr = raddr;

> -    label->label_ptr[0] = label_ptr[0];

> -    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> -        label->label_ptr[1] = label_ptr[1];

> -    }

> -}

> -

> -/*

> - * Generate code for the slow path for a load at the end of block

> - */

> -static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)

> -{

> -    TCGMemOpIdx oi = l->oi;

> -    TCGMemOp opc = get_memop(oi);

> -    TCGReg data_reg;

> -    tcg_insn_unit **label_ptr = &l->label_ptr[0];

> -

> -    /* resolve label address */

> -    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);

> -    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> -        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);

> -    }

> -

> -    if (TCG_TARGET_REG_BITS == 32) {

> -        int ofs = 0;

> -

> -        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        if (TARGET_LONG_BITS == 64) {

> -            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);

> -            ofs += 4;

> -        }

> -

> -        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);

> -    } else {

> -        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

> -        /* The second argument is already loaded with addrlo.  */

> -        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);

> -        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],

> -                     (uintptr_t)l->raddr);

> -    }

> -

> -    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);

> -

> -    data_reg = l->datalo_reg;

> -    switch (opc & MO_SSIZE) {

> -    case MO_SB:

> -        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);

> -        break;

> -    case MO_SW:

> -        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);

> -        break;

> -#if TCG_TARGET_REG_BITS == 64

> -    case MO_SL:

> -        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);

> -        break;

> -#endif

> -    case MO_UB:

> -    case MO_UW:

> -        /* Note that the helpers have zero-extended to tcg_target_long.  */

> -    case MO_UL:

> -        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);

> -        break;

> -    case MO_Q:

> -        if (TCG_TARGET_REG_BITS == 64) {

> -            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);

> -        } else if (data_reg == TCG_REG_EDX) {

> -            /* xchg %edx, %eax */

> -            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);

> -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);

> -        } else {

> -            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);

> -            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);

> -        }

> -        break;

> -    default:

> -        tcg_abort();

> -    }

> -

> -    /* Jump to the code corresponding to next IR of qemu_st */

> -    tcg_out_jmp(s, l->raddr);

> -}

> -

> -/*

> - * Generate code for the slow path for a store at the end of block

> - */

> -static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)

> -{

> -    TCGMemOpIdx oi = l->oi;

> -    TCGMemOp opc = get_memop(oi);

> -    TCGMemOp s_bits = opc & MO_SIZE;

> -    tcg_insn_unit **label_ptr = &l->label_ptr[0];

> -    TCGReg retaddr;

> -

> -    /* resolve label address */

> -    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);

> -    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> -        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);

> -    }

> -

> -    if (TCG_TARGET_REG_BITS == 32) {

> -        int ofs = 0;

> -

> -        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        if (TARGET_LONG_BITS == 64) {

> -            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);

> -            ofs += 4;

> -        }

> -

> -        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        if (s_bits == MO_64) {

> -            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);

> -            ofs += 4;

> -        }

> -

> -        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);

> -        ofs += 4;

> -

> -        retaddr = TCG_REG_EAX;

> -        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);

> -        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);

> -    } else {

> -        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

> -        /* The second argument is already loaded with addrlo.  */

> -        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),

> -                    tcg_target_call_iarg_regs[2], l->datalo_reg);

> -        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);

> -

> -        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {

> -            retaddr = tcg_target_call_iarg_regs[4];

> -            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);

> -        } else {

> -            retaddr = TCG_REG_RAX;

> -            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);

> -            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,

> -                       TCG_TARGET_CALL_STACK_OFFSET);

> -        }

> -    }

> -

> -    /* "Tail call" to the helper, with the return address back inline.  */

> -    tcg_out_push(s, retaddr);

> -    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);

> -}

>  #elif defined(__x86_64__) && defined(__linux__)

>  # include <asm/prctl.h>

>  # include <sys/prctl.h>

> @@ -2091,7 +1924,6 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)

>      TCGReg datahi __attribute__((unused)) = -1;

>      TCGReg addrhi __attribute__((unused)) = -1;

>      TCGMemOpIdx oi;

> -    TCGMemOp opc;

>      int i = -1;

>

>      datalo = args[++i];

> @@ -2103,35 +1935,25 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)

>          addrhi = args[++i];

>      }

>      oi = args[++i];

> -    opc = get_memop(oi);

>

>  #if defined(CONFIG_SOFTMMU)

> -    {

> -        int mem_index = get_mmuidx(oi);

> -        tcg_insn_unit *label_ptr[2];

> -        TCGReg base;

> -

> -        tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));

> -        if (TCG_TARGET_REG_BITS == 32 && is64) {

> -            tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));

> -        }

> -        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));

> -        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> -            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));

> -        }

> -

> -        base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,

> -                                label_ptr, offsetof(CPUTLBEntry, addr_read));

> -

> -        /* TLB Hit.  */

> -        tcg_out_qemu_ld_direct(s, datalo, datahi, base, -1, 0, 0, opc);

> -

> -        /* Record the current context of a load into ldst label */

> -        add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,

> -                            s->code_ptr, label_ptr);

> +    /* Assert that we've set up the constraints properly.  */

> +    tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));

> +    if (TCG_TARGET_REG_BITS == 32 && is64) {

> +        tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));

>      }

> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));

> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));

> +    }

> +

> +    /* Call to thunk.  */

> +    tcg_out8(s, OPC_CALL_Jz);

> +    add_ldst_ool_label(s, true, is64, oi, R_386_PC32, -4);

> +    s->code_ptr += 4;

>  #else

>      {

> +        TCGMemOp opc = get_memop(oi);

>          int32_t offset = guest_base;

>          TCGReg base = addrlo;

>          int index = -1;

> @@ -2246,7 +2068,6 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)

>      TCGReg datahi __attribute__((unused)) = -1;

>      TCGReg addrhi __attribute__((unused)) = -1;

>      TCGMemOpIdx oi;

> -    TCGMemOp opc;

>      int i = -1;

>

>      datalo = args[++i];

> @@ -2258,35 +2079,25 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)

>          addrhi = args[++i];

>      }

>      oi = args[++i];

> -    opc = get_memop(oi);

>

>  #if defined(CONFIG_SOFTMMU)

> -    {

> -        int mem_index = get_mmuidx(oi);

> -        tcg_insn_unit *label_ptr[2];

> -        TCGReg base;

> -

> -        tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));

> -        if (TCG_TARGET_REG_BITS == 32 && is64) {

> -            tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));

> -        }

> -        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));

> -        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> -            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));

> -        }

> -

> -        base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,

> -                                label_ptr, offsetof(CPUTLBEntry, addr_write));

> -

> -        /* TLB Hit.  */

> -        tcg_out_qemu_st_direct(s, datalo, datahi, base, 0, 0, opc);

> -

> -        /* Record the current context of a store into ldst label */

> -        add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,

> -                            s->code_ptr, label_ptr);

> +    /* Assert that we've set up the constraints properly.  */

> +    tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));

> +    if (TCG_TARGET_REG_BITS == 32 && is64) {

> +        tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));

>      }

> +    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));

> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> +        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));

> +    }

> +

> +    /* Call to thunk.  */

> +    tcg_out8(s, OPC_CALL_Jz);

> +    add_ldst_ool_label(s, false, is64, oi, R_386_PC32, -4);

> +    s->code_ptr += 4;

>  #else

>      {

> +        TCGMemOp opc = get_memop(oi);

>          int32_t offset = guest_base;

>          TCGReg base = addrlo;

>          int seg = 0;

> @@ -2321,6 +2132,126 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)

>  #endif

>  }

>

> +#if defined(CONFIG_SOFTMMU)

> +/*

> + * Generate code for an out-of-line thunk performing a load.

> + */

> +static tcg_insn_unit *tcg_out_qemu_ldst_ool(TCGContext *s, bool is_ld,

> +                                            bool is_64, TCGMemOpIdx oi)

> +{

> +    TCGMemOp opc = get_memop(oi);

> +    int mem_index = get_mmuidx(oi);

> +    tcg_insn_unit *label_ptr[2], *thunk;

> +    TCGReg datalo, addrlo, base;

> +    TCGReg datahi __attribute__((unused)) = -1;

> +    TCGReg addrhi __attribute__((unused)) = -1;

> +    int i;

> +

> +    /* Since we're amortizing the cost, align the thunk.  */

> +    thunk = QEMU_ALIGN_PTR_UP(s->code_ptr, 16);

> +    if (thunk != s->code_ptr) {

> +        memset(s->code_ptr, 0x90, thunk - s->code_ptr);

> +        s->code_ptr = thunk;

> +    }

> +

> +    /* Discover where the inputs are held.  */

> +    addrlo = softmmu_arg(ARG_ADDR, 0, 0);

> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> +        addrhi = softmmu_arg(ARG_ADDR, 0, 1);

> +    }

> +    datalo = softmmu_arg(is_ld ? ARG_LDVAL : ARG_STVAL, is_64, 0);

> +    if (TCG_TARGET_REG_BITS == 32 && is_64) {

> +        datahi = softmmu_arg(is_ld ? ARG_LDVAL : ARG_STVAL, is_64, 1);

> +    }

> +

> +    base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, label_ptr,

> +                            is_ld ? offsetof(CPUTLBEntry, addr_read)

> +                            : offsetof(CPUTLBEntry, addr_write));

> +

> +    /* TLB Hit.  */

> +    if (is_ld) {

> +        tcg_out_qemu_ld_direct(s, datalo, datahi, base, -1, 0, 0, opc);

> +    } else {

> +        tcg_out_qemu_st_direct(s, datalo, datahi, base, 0, 0, opc);

> +    }

> +    tcg_out_opc(s, OPC_RET, 0, 0, 0);

> +

> +    /* TLB Miss.  */

> +

> +    /* resolve label address */

> +    tcg_patch8(label_ptr[0], s->code_ptr - label_ptr[0] - 1);

> +    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {

> +        tcg_patch8(label_ptr[1], s->code_ptr - label_ptr[1] - 1);

> +    }

> +

> +    if (TCG_TARGET_REG_BITS == 32) {

> +        /* Copy the return address into a temporary.  */

> +        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_ESP, 0);

> +        i = 4;

> +

> +        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, i);

> +        i += 4;

> +

> +        tcg_out_st(s, TCG_TYPE_I32, addrlo, TCG_REG_ESP, i);

> +        i += 4;

> +

> +        if (TARGET_LONG_BITS == 64) {

> +            tcg_out_st(s, TCG_TYPE_I32, addrhi, TCG_REG_ESP, i);

> +            i += 4;

> +        }

> +

> +        if (!is_ld) {

> +            tcg_out_st(s, TCG_TYPE_I32, datalo, TCG_REG_ESP, i);

> +            i += 4;

> +

> +            if (is_64) {

> +                tcg_out_st(s, TCG_TYPE_I32, datahi, TCG_REG_ESP, i);

> +                i += 4;

> +            }

> +        }

> +

> +        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, i);

> +        i += 4;

> +

> +        tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_ESP, i);

> +    } else {

> +        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);

> +

> +        /* The address and data values have been placed by constraints.  */

> +        tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);

> +        if (is_ld) {

> +            i = 2;

> +        } else {

> +            tcg_debug_assert(datalo == tcg_target_call_iarg_regs[2]);

> +            i = 3;

> +        }

> +

> +        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[i++], oi);

> +

> +        /* Copy the return address from the stack to the rvalue argument.

> +         * WIN64 runs out of argument registers for stores.

> +         */

> +        if (i < (int)ARRAY_SIZE(tcg_target_call_iarg_regs)) {

> +            tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[i],

> +                       TCG_REG_ESP, 0);

> +        } else {

> +            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0);

> +            tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP,

> +                       TCG_TARGET_CALL_STACK_OFFSET + 8);

> +        }

> +    }

> +

> +    /* Tail call to the helper.  */

> +    if (is_ld) {

> +        tcg_out_jmp(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);

> +    } else {

> +        tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);

> +    }

> +

> +    return thunk;

> +}

> +#endif

> +

>  static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,

>                                const TCGArg *args, const int *const_args)

>  {


Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>



--
Alex Bennée
Richard Henderson Nov. 30, 2018, 5:37 p.m. UTC | #2
On 11/30/18 9:22 AM, Alex Bennée wrote:
> 

> Richard Henderson <richard.henderson@linaro.org> writes:

> 

>> Move the entire memory operation out of line.

> 

> Given Emilio's numbers is it likely we will want to support both options

> given the variability on x86?


No, I don't want to support two methods in any one tcg backend.
Which is why I'm not really sure what to do about Emilio's results.


r~
Alex Bennée Nov. 30, 2018, 5:52 p.m. UTC | #3
Richard Henderson <richard.henderson@linaro.org> writes:

> On 11/30/18 9:22 AM, Alex Bennée wrote:

>>

>> Richard Henderson <richard.henderson@linaro.org> writes:

>>

>>> Move the entire memory operation out of line.

>>

>> Given Emilio's numbers is it likely we will want to support both options

>> given the variability on x86?

>

> No, I don't want to support two methods in any one tcg backend.

> Which is why I'm not really sure what to do about Emilio's results.


They at least seem pretty positive on aarch64 backends....

--
Alex Bennée
diff mbox series

Patch

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 2441658865..1b2d4e1b0d 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -220,7 +220,7 @@  static inline void tb_target_set_jmp_target(uintptr_t tc_ptr,
 #define TCG_TARGET_DEFAULT_MO (TCG_MO_ALL & ~TCG_MO_ST_LD)
 
 #ifdef CONFIG_SOFTMMU
-#define TCG_TARGET_NEED_LDST_LABELS
+#define TCG_TARGET_NEED_LDST_OOL_LABELS
 #endif
 #define TCG_TARGET_NEED_POOL_LABELS
 
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 50e5dc31b3..5c68cbd43d 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1643,7 +1643,7 @@  static void tcg_out_nopn(TCGContext *s, int n)
 }
 
 #if defined(CONFIG_SOFTMMU)
-#include "tcg-ldst.inc.c"
+#include "tcg-ldst-ool.inc.c"
 
 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
  *                                     int mmu_idx, uintptr_t ra)
@@ -1656,6 +1656,14 @@  static void * const qemu_ld_helpers[16] = {
     [MO_BEUW] = helper_be_lduw_mmu,
     [MO_BEUL] = helper_be_ldul_mmu,
     [MO_BEQ]  = helper_be_ldq_mmu,
+
+    [MO_SB]   = helper_ret_ldsb_mmu,
+    [MO_LESW] = helper_le_ldsw_mmu,
+    [MO_BESW] = helper_be_ldsw_mmu,
+#if TCG_TARGET_REG_BITS == 64
+    [MO_LESL] = helper_le_ldsl_mmu,
+    [MO_BESL] = helper_be_ldsl_mmu,
+#endif
 };
 
 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
@@ -1765,18 +1773,18 @@  static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     /* jne slow_path */
-    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+    tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);
     label_ptr[0] = s->code_ptr;
-    s->code_ptr += 4;
+    s->code_ptr += 1;
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r0), addrhi */
         tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
 
         /* jne slow_path */
-        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
+        tcg_out_opc(s, OPC_JCC_short + JCC_JNE, 0, 0, 0);
         label_ptr[1] = s->code_ptr;
-        s->code_ptr += 4;
+        s->code_ptr += 1;
     }
 
     /* TLB Hit.  */
@@ -1788,181 +1796,6 @@  static TCGReg tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     return base;
 }
 
-/*
- * Record the context of a call to the out of line helper code for the slow path
- * for a load or store, so that we can later generate the correct helper code
- */
-static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
-                                TCGReg datalo, TCGReg datahi,
-                                TCGReg addrlo, TCGReg addrhi,
-                                tcg_insn_unit *raddr,
-                                tcg_insn_unit **label_ptr)
-{
-    TCGLabelQemuLdst *label = new_ldst_label(s);
-
-    label->is_ld = is_ld;
-    label->oi = oi;
-    label->datalo_reg = datalo;
-    label->datahi_reg = datahi;
-    label->addrlo_reg = addrlo;
-    label->addrhi_reg = addrhi;
-    label->raddr = raddr;
-    label->label_ptr[0] = label_ptr[0];
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        label->label_ptr[1] = label_ptr[1];
-    }
-}
-
-/*
- * Generate code for the slow path for a load at the end of block
- */
-static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-    TCGMemOpIdx oi = l->oi;
-    TCGMemOp opc = get_memop(oi);
-    TCGReg data_reg;
-    tcg_insn_unit **label_ptr = &l->label_ptr[0];
-
-    /* resolve label address */
-    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
-    }
-
-    if (TCG_TARGET_REG_BITS == 32) {
-        int ofs = 0;
-
-        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (TARGET_LONG_BITS == 64) {
-            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-        /* The second argument is already loaded with addrlo.  */
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
-        tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
-                     (uintptr_t)l->raddr);
-    }
-
-    tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-
-    data_reg = l->datalo_reg;
-    switch (opc & MO_SSIZE) {
-    case MO_SB:
-        tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-    case MO_SW:
-        tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
-        break;
-#if TCG_TARGET_REG_BITS == 64
-    case MO_SL:
-        tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
-        break;
-#endif
-    case MO_UB:
-    case MO_UW:
-        /* Note that the helpers have zero-extended to tcg_target_long.  */
-    case MO_UL:
-        tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-        break;
-    case MO_Q:
-        if (TCG_TARGET_REG_BITS == 64) {
-            tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
-        } else if (data_reg == TCG_REG_EDX) {
-            /* xchg %edx, %eax */
-            tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
-        } else {
-            tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
-            tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
-        }
-        break;
-    default:
-        tcg_abort();
-    }
-
-    /* Jump to the code corresponding to next IR of qemu_st */
-    tcg_out_jmp(s, l->raddr);
-}
-
-/*
- * Generate code for the slow path for a store at the end of block
- */
-static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
-{
-    TCGMemOpIdx oi = l->oi;
-    TCGMemOp opc = get_memop(oi);
-    TCGMemOp s_bits = opc & MO_SIZE;
-    tcg_insn_unit **label_ptr = &l->label_ptr[0];
-    TCGReg retaddr;
-
-    /* resolve label address */
-    tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
-    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-        tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
-    }
-
-    if (TCG_TARGET_REG_BITS == 32) {
-        int ofs = 0;
-
-        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (TARGET_LONG_BITS == 64) {
-            tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        if (s_bits == MO_64) {
-            tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
-            ofs += 4;
-        }
-
-        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
-        ofs += 4;
-
-        retaddr = TCG_REG_EAX;
-        tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-        tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
-    } else {
-        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
-        /* The second argument is already loaded with addrlo.  */
-        tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
-                    tcg_target_call_iarg_regs[2], l->datalo_reg);
-        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
-
-        if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
-            retaddr = tcg_target_call_iarg_regs[4];
-            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-        } else {
-            retaddr = TCG_REG_RAX;
-            tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
-            tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
-                       TCG_TARGET_CALL_STACK_OFFSET);
-        }
-    }
-
-    /* "Tail call" to the helper, with the return address back inline.  */
-    tcg_out_push(s, retaddr);
-    tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
-}
 #elif defined(__x86_64__) && defined(__linux__)
 # include <asm/prctl.h>
 # include <sys/prctl.h>
@@ -2091,7 +1924,6 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
     TCGReg datahi __attribute__((unused)) = -1;
     TCGReg addrhi __attribute__((unused)) = -1;
     TCGMemOpIdx oi;
-    TCGMemOp opc;
     int i = -1;
 
     datalo = args[++i];
@@ -2103,35 +1935,25 @@  static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
         addrhi = args[++i];
     }
     oi = args[++i];
-    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    {
-        int mem_index = get_mmuidx(oi);
-        tcg_insn_unit *label_ptr[2];
-        TCGReg base;
-
-        tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
-        if (TCG_TARGET_REG_BITS == 32 && is64) {
-            tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
-        }
-        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
-        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
-        }
-
-        base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
-                                label_ptr, offsetof(CPUTLBEntry, addr_read));
-
-        /* TLB Hit.  */
-        tcg_out_qemu_ld_direct(s, datalo, datahi, base, -1, 0, 0, opc);
-
-        /* Record the current context of a load into ldst label */
-        add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
-                            s->code_ptr, label_ptr);
+    /* Assert that we've set up the constraints properly.  */
+    tcg_debug_assert(datalo == softmmu_arg(ARG_LDVAL, is64, 0));
+    if (TCG_TARGET_REG_BITS == 32 && is64) {
+        tcg_debug_assert(datahi == softmmu_arg(ARG_LDVAL, is64, 1));
     }
+    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
+    }
+
+    /* Call to thunk.  */
+    tcg_out8(s, OPC_CALL_Jz);
+    add_ldst_ool_label(s, true, is64, oi, R_386_PC32, -4);
+    s->code_ptr += 4;
 #else
     {
+        TCGMemOp opc = get_memop(oi);
         int32_t offset = guest_base;
         TCGReg base = addrlo;
         int index = -1;
@@ -2246,7 +2068,6 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
     TCGReg datahi __attribute__((unused)) = -1;
     TCGReg addrhi __attribute__((unused)) = -1;
     TCGMemOpIdx oi;
-    TCGMemOp opc;
     int i = -1;
 
     datalo = args[++i];
@@ -2258,35 +2079,25 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
         addrhi = args[++i];
     }
     oi = args[++i];
-    opc = get_memop(oi);
 
 #if defined(CONFIG_SOFTMMU)
-    {
-        int mem_index = get_mmuidx(oi);
-        tcg_insn_unit *label_ptr[2];
-        TCGReg base;
-
-        tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
-        if (TCG_TARGET_REG_BITS == 32 && is64) {
-            tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
-        }
-        tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
-        if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
-            tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
-        }
-
-        base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
-                                label_ptr, offsetof(CPUTLBEntry, addr_write));
-
-        /* TLB Hit.  */
-        tcg_out_qemu_st_direct(s, datalo, datahi, base, 0, 0, opc);
-
-        /* Record the current context of a store into ldst label */
-        add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
-                            s->code_ptr, label_ptr);
+    /* Assert that we've set up the constraints properly.  */
+    tcg_debug_assert(datalo == softmmu_arg(ARG_STVAL, is64, 0));
+    if (TCG_TARGET_REG_BITS == 32 && is64) {
+        tcg_debug_assert(datahi == softmmu_arg(ARG_STVAL, is64, 1));
     }
+    tcg_debug_assert(addrlo == softmmu_arg(ARG_ADDR, 0, 0));
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        tcg_debug_assert(addrhi == softmmu_arg(ARG_ADDR, 0, 1));
+    }
+
+    /* Call to thunk.  */
+    tcg_out8(s, OPC_CALL_Jz);
+    add_ldst_ool_label(s, false, is64, oi, R_386_PC32, -4);
+    s->code_ptr += 4;
 #else
     {
+        TCGMemOp opc = get_memop(oi);
         int32_t offset = guest_base;
         TCGReg base = addrlo;
         int seg = 0;
@@ -2321,6 +2132,126 @@  static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
 #endif
 }
 
+#if defined(CONFIG_SOFTMMU)
+/*
+ * Generate code for an out-of-line thunk performing a load or store.
+ */
+static tcg_insn_unit *tcg_out_qemu_ldst_ool(TCGContext *s, bool is_ld,
+                                            bool is_64, TCGMemOpIdx oi)
+{
+    TCGMemOp opc = get_memop(oi);
+    int mem_index = get_mmuidx(oi);
+    tcg_insn_unit *label_ptr[2], *thunk;
+    TCGReg datalo, addrlo, base;
+    TCGReg datahi __attribute__((unused)) = -1;
+    TCGReg addrhi __attribute__((unused)) = -1;
+    int i;
+
+    /* Since we're amortizing the cost, align the thunk.  */
+    thunk = QEMU_ALIGN_PTR_UP(s->code_ptr, 16);
+    if (thunk != s->code_ptr) {
+        memset(s->code_ptr, 0x90, thunk - s->code_ptr);
+        s->code_ptr = thunk;
+    }
+
+    /* Discover where the inputs are held.  */
+    addrlo = softmmu_arg(ARG_ADDR, 0, 0);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        addrhi = softmmu_arg(ARG_ADDR, 0, 1);
+    }
+    datalo = softmmu_arg(is_ld ? ARG_LDVAL : ARG_STVAL, is_64, 0);
+    if (TCG_TARGET_REG_BITS == 32 && is_64) {
+        datahi = softmmu_arg(is_ld ? ARG_LDVAL : ARG_STVAL, is_64, 1);
+    }
+
+    base = tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc, label_ptr,
+                            is_ld ? offsetof(CPUTLBEntry, addr_read)
+                            : offsetof(CPUTLBEntry, addr_write));
+
+    /* TLB Hit.  */
+    if (is_ld) {
+        tcg_out_qemu_ld_direct(s, datalo, datahi, base, -1, 0, 0, opc);
+    } else {
+        tcg_out_qemu_st_direct(s, datalo, datahi, base, 0, 0, opc);
+    }
+    tcg_out_opc(s, OPC_RET, 0, 0, 0);
+
+    /* TLB Miss.  */
+
+    /* resolve label address */
+    tcg_patch8(label_ptr[0], s->code_ptr - label_ptr[0] - 1);
+    if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
+        tcg_patch8(label_ptr[1], s->code_ptr - label_ptr[1] - 1);
+    }
+
+    if (TCG_TARGET_REG_BITS == 32) {
+        /* Copy the return address into a temporary.  */
+        tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_ESP, 0);
+        i = 4;
+
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, i);
+        i += 4;
+
+        tcg_out_st(s, TCG_TYPE_I32, addrlo, TCG_REG_ESP, i);
+        i += 4;
+
+        if (TARGET_LONG_BITS == 64) {
+            tcg_out_st(s, TCG_TYPE_I32, addrhi, TCG_REG_ESP, i);
+            i += 4;
+        }
+
+        if (!is_ld) {
+            tcg_out_st(s, TCG_TYPE_I32, datalo, TCG_REG_ESP, i);
+            i += 4;
+
+            if (is_64) {
+                tcg_out_st(s, TCG_TYPE_I32, datahi, TCG_REG_ESP, i);
+                i += 4;
+            }
+        }
+
+        tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, i);
+        i += 4;
+
+        tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_ESP, i);
+    } else {
+        tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
+
+        /* The address and data values have been placed by constraints.  */
+        tcg_debug_assert(addrlo == tcg_target_call_iarg_regs[1]);
+        if (is_ld) {
+            i = 2;
+        } else {
+            tcg_debug_assert(datalo == tcg_target_call_iarg_regs[2]);
+            i = 3;
+        }
+
+        tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[i++], oi);
+
+        /* Copy the return address from the stack to the helper's ra argument.
+         * WIN64 runs out of argument registers for stores.
+         */
+        if (i < (int)ARRAY_SIZE(tcg_target_call_iarg_regs)) {
+            tcg_out_ld(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[i],
+                       TCG_REG_ESP, 0);
+        } else {
+            tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP, 0);
+            tcg_out_st(s, TCG_TYPE_PTR, TCG_REG_RAX, TCG_REG_ESP,
+                       TCG_TARGET_CALL_STACK_OFFSET + 8);
+        }
+    }
+
+    /* Tail call to the helper.  */
+    if (is_ld) {
+        tcg_out_jmp(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SSIZE)]);
+    } else {
+        tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
+    }
+
+    return thunk;
+}
+#endif
+
 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
                               const TCGArg *args, const int *const_args)
 {