diff mbox series

[v4,52/57] tcg/i386: Honor 64-bit atomicity in 32-bit mode

Message ID 20230503070656.1746170-53-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: Improve atomicity support | expand

Commit Message

Richard Henderson May 3, 2023, 7:06 a.m. UTC
Use the fpu to perform 64-bit loads and stores.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.c.inc | 44 +++++++++++++++++++++++++++++++++------
 1 file changed, 38 insertions(+), 6 deletions(-)

Comments

Peter Maydell May 5, 2023, 1:27 p.m. UTC | #1
On Wed, 3 May 2023 at 08:18, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> Use the fpu to perform 64-bit loads and stores.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>


> @@ -2091,7 +2095,20 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>              datalo = datahi;
>              datahi = t;
>          }
> -        if (h.base == datalo || h.index == datalo) {
> +        if (h.atom == MO_64) {
> +            /*
> +             * Atomicity requires that we use a single 8-byte load.
> +             * For simplicity and code size, always use the FPU for this.
> +             * Similar insns using SSE/AVX are merely larger.

I'm surprised there's no performance penalty for throwing old-school
FPU insns into what is presumably otherwise code that's only
using modern SSE.

> +             * Load from memory in one go, then store back to the stack,
> +             * from whence we can load into the correct integer regs.
> +             */
> +            tcg_out_modrm_sib_offset(s, OPC_ESCDF + h.seg, ESCDF_FILD_m64,
> +                                     h.base, h.index, 0, h.ofs);
> +            tcg_out_modrm_offset(s, OPC_ESCDF, ESCDF_FISTP_m64, TCG_REG_ESP, 0);
> +            tcg_out_modrm_offset(s, movop, datalo, TCG_REG_ESP, 0);
> +            tcg_out_modrm_offset(s, movop, datahi, TCG_REG_ESP, 4);
> +        } else if (h.base == datalo || h.index == datalo) {
>              tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
>                                       h.base, h.index, 0, h.ofs);
>              tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);

I assume the caller has arranged that the top of the stack
is trashable at this point?

Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

-- PMM
Richard Henderson May 8, 2023, 4:15 p.m. UTC | #2
On 5/5/23 14:27, Peter Maydell wrote:
> On Wed, 3 May 2023 at 08:18, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> Use the fpu to perform 64-bit loads and stores.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> 
> 
>> @@ -2091,7 +2095,20 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
>>               datalo = datahi;
>>               datahi = t;
>>           }
>> -        if (h.base == datalo || h.index == datalo) {
>> +        if (h.atom == MO_64) {
>> +            /*
>> +             * Atomicity requires that we use a single 8-byte load.
>> +             * For simplicity and code size, always use the FPU for this.
>> +             * Similar insns using SSE/AVX are merely larger.
> 
> I'm surprised there's no performance penalty for throwing old-school
> FPU insns into what is presumably otherwise code that's only
> using modern SSE.

I have no idea about performance.  We don't require SSE for TCG at the moment.

> I assume the caller has arranged that the top of the stack
> is trashable at this point?

The entire fpu stack is call-clobbered.


r~
diff mbox series

Patch

diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index 3e21f067d6..5c6c64c48a 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -468,6 +468,10 @@  static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_GRP5        (0xff)
 #define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 
+#define OPC_ESCDF       (0xdf)
+#define ESCDF_FILD_m64  5
+#define ESCDF_FISTP_m64 7
+
 /* Group 1 opcode extensions for 0x80-0x83.
    These are also used as modifiers for OPC_ARITH.  */
 #define ARITH_ADD 0
@@ -2091,7 +2095,20 @@  static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
             datalo = datahi;
             datahi = t;
         }
-        if (h.base == datalo || h.index == datalo) {
+        if (h.atom == MO_64) {
+            /*
+             * Atomicity requires that we use a single 8-byte load.
+             * For simplicity and code size, always use the FPU for this.
+             * Similar insns using SSE/AVX are merely larger.
+             * Load from memory in one go, then store back to the stack,
+             * from whence we can load into the correct integer regs.
+             */
+            tcg_out_modrm_sib_offset(s, OPC_ESCDF + h.seg, ESCDF_FILD_m64,
+                                     h.base, h.index, 0, h.ofs);
+            tcg_out_modrm_offset(s, OPC_ESCDF, ESCDF_FISTP_m64, TCG_REG_ESP, 0);
+            tcg_out_modrm_offset(s, movop, datalo, TCG_REG_ESP, 0);
+            tcg_out_modrm_offset(s, movop, datahi, TCG_REG_ESP, 4);
+        } else if (h.base == datalo || h.index == datalo) {
             tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                      h.base, h.index, 0, h.ofs);
             tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
@@ -2161,12 +2178,27 @@  static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
         if (TCG_TARGET_REG_BITS == 64) {
             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                      h.base, h.index, 0, h.ofs);
+            break;
+        }
+        if (use_movbe) {
+            TCGReg t = datalo;
+            datalo = datahi;
+            datahi = t;
+        }
+        if (h.atom == MO_64) {
+            /*
+             * Atomicity requires that we use one 8-byte store.
+             * For simplicity and code size, always use the FPU for this.
+             * Similar insns using SSE/AVX are merely larger.
+             * Assemble the 8-byte quantity in required endianness
+             * on the stack, load to coproc unit, and store.
+             */
+            tcg_out_modrm_offset(s, movop, datalo, TCG_REG_ESP, 0);
+            tcg_out_modrm_offset(s, movop, datahi, TCG_REG_ESP, 4);
+            tcg_out_modrm_offset(s, OPC_ESCDF, ESCDF_FILD_m64, TCG_REG_ESP, 0);
+            tcg_out_modrm_sib_offset(s, OPC_ESCDF + h.seg, ESCDF_FISTP_m64,
+                                     h.base, h.index, 0, h.ofs);
         } else {
-            if (use_movbe) {
-                TCGReg t = datalo;
-                datalo = datahi;
-                datahi = t;
-            }
             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                      h.base, h.index, 0, h.ofs);
             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,