diff mbox series

[v5,36/54] tcg: Introduce tcg_out_movext3

Message ID 20230515143313.734053-37-richard.henderson@linaro.org
State Accepted
Commit 2462e30e99676c710624806febe5ce67a45f0521
Headers show
Series tcg: Improve atomicity support | expand

Commit Message

Richard Henderson May 15, 2023, 2:32 p.m. UTC
With x86_64 as host, we do not have any temporaries with which to
resolve cycles, but we do have xchg.   As a side bonus, the set of
graphs that can be made with 3 nodes and all nodes conflicting is
small: two.  We can solve the cycle with a single temp.

This is required for x86_64 to handle stores of i128: 1 address
register and 2 data registers.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/tcg.c | 138 ++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 107 insertions(+), 31 deletions(-)

Comments

Peter Maydell May 16, 2023, 10:03 a.m. UTC | #1
On Mon, 15 May 2023 at 15:43, Richard Henderson
<richard.henderson@linaro.org> wrote:
>
> With x86_64 as host, we do not have any temporaries with which to
> resolve cycles, but we do have xchg.   As a side bonus, the set of
> graphs that can be made with 3 nodes and all nodes conflicting is
> small: two.  We can solve the cycle with a single temp.
>
> This is required for x86_64 to handle stores of i128: 1 address
> register and 2 data registers.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>



>  static void tcg_out_helper_load_regs(TCGContext *s,
>                                       unsigned nmov, TCGMovExtend *mov,
> -                                     unsigned ntmp, const int *tmp)
> +                                     const TCGLdstHelperParam *parm)
>  {
> +    TCGReg dst3;
> +
>      switch (nmov) {
> -    default:
> +    case 4:
>          /* The backend must have provided enough temps for the worst case. */
> -        tcg_debug_assert(ntmp + 1 >= nmov);
> +        tcg_debug_assert(parm->ntmp >= 2);
>
> -        for (unsigned i = nmov - 1; i >= 2; --i) {
> -            TCGReg dst = mov[i].dst;
> -
> -            for (unsigned j = 0; j < i; ++j) {
> -                if (dst == mov[j].src) {
> -                    /*
> -                     * Conflict.
> -                     * Copy the source to a temporary, recurse for the
> -                     * remaining moves, perform the extension from our
> -                     * scratch on the way out.
> -                     */
> -                    TCGReg scratch = tmp[--ntmp];
> -                    tcg_out_mov(s, mov[i].src_type, scratch, mov[i].src);
> -                    mov[i].src = scratch;
> -
> -                    tcg_out_helper_load_regs(s, i, mov, ntmp, tmp);
> -                    tcg_out_movext1(s, &mov[i]);
> -                    return;
> -                }
> +        dst3 = mov[3].dst;
> +        for (unsigned j = 0; j < 3; ++j) {
> +            if (dst3 == mov[j].src) {
> +                /*
> +                 * Conflict. Copy the source to a temporary, perform the
> +                 * remaining moves, then the extension from our scratch
> +                 * on the way out.
> +                 */
> +                TCGReg scratch = parm->tmp[1];
> +                tcg_out_movext3(s, mov, mov + 1, mov + 2, parm->tmp[0]);
> +                tcg_out_movext1_new_src(s, &mov[3], scratch);

Isn't this missing the "copy the source to a temporary" part?
I was expecting an initial tcg_out_mov() like the old code has.

> +                break;
>              }
> -

Otherwise
Reviewed-by: Peter Maydell <peter.maydell@linaro.org>

-- PMM
Richard Henderson May 16, 2023, 1:56 p.m. UTC | #2
On 5/16/23 03:03, Peter Maydell wrote:
> On Mon, 15 May 2023 at 15:43, Richard Henderson
> <richard.henderson@linaro.org> wrote:
>>
>> With x86_64 as host, we do not have any temporaries with which to
>> resolve cycles, but we do have xchg.   As a side bonus, the set of
>> graphs that can be made with 3 nodes and all nodes conflicting is
>> small: two.  We can solve the cycle with a single temp.
>>
>> This is required for x86_64 to handle stores of i128: 1 address
>> register and 2 data registers.
>>
>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> 
> 
> 
>>   static void tcg_out_helper_load_regs(TCGContext *s,
>>                                        unsigned nmov, TCGMovExtend *mov,
>> -                                     unsigned ntmp, const int *tmp)
>> +                                     const TCGLdstHelperParam *parm)
>>   {
>> +    TCGReg dst3;
>> +
>>       switch (nmov) {
>> -    default:
>> +    case 4:
>>           /* The backend must have provided enough temps for the worst case. */
>> -        tcg_debug_assert(ntmp + 1 >= nmov);
>> +        tcg_debug_assert(parm->ntmp >= 2);
>>
>> -        for (unsigned i = nmov - 1; i >= 2; --i) {
>> -            TCGReg dst = mov[i].dst;
>> -
>> -            for (unsigned j = 0; j < i; ++j) {
>> -                if (dst == mov[j].src) {
>> -                    /*
>> -                     * Conflict.
>> -                     * Copy the source to a temporary, recurse for the
>> -                     * remaining moves, perform the extension from our
>> -                     * scratch on the way out.
>> -                     */
>> -                    TCGReg scratch = tmp[--ntmp];
>> -                    tcg_out_mov(s, mov[i].src_type, scratch, mov[i].src);
>> -                    mov[i].src = scratch;
>> -
>> -                    tcg_out_helper_load_regs(s, i, mov, ntmp, tmp);
>> -                    tcg_out_movext1(s, &mov[i]);
>> -                    return;
>> -                }
>> +        dst3 = mov[3].dst;
>> +        for (unsigned j = 0; j < 3; ++j) {
>> +            if (dst3 == mov[j].src) {
>> +                /*
>> +                 * Conflict. Copy the source to a temporary, perform the
>> +                 * remaining moves, then the extension from our scratch
>> +                 * on the way out.
>> +                 */
>> +                TCGReg scratch = parm->tmp[1];
>> +                tcg_out_movext3(s, mov, mov + 1, mov + 2, parm->tmp[0]);
>> +                tcg_out_movext1_new_src(s, &mov[3], scratch);
> 
> Isn't this missing the "copy the source to a temporary" part?
> I was expecting an initial tcg_out_mov() like the old code has.

It is.  Sloppy of me, and I haven't re-tested ppc32 this week.


r~
diff mbox series

Patch

diff --git a/tcg/tcg.c b/tcg/tcg.c
index aa0a6c3763..8688248284 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -532,6 +532,82 @@  static void tcg_out_movext2(TCGContext *s, const TCGMovExtend *i1,
     tcg_out_movext1_new_src(s, i1, src1);
 }
 
+/**
+ * tcg_out_movext3 -- move and extend three pair
+ * @s: tcg context
+ * @i1: first move description
+ * @i2: second move description
+ * @i3: third move description
+ * @scratch: temporary register, or -1 for none
+ *
+ * As tcg_out_movext, for all of @i1, @i2 and @i3, caring for overlap
+ * between the sources and destinations.
+ */
+
+static void tcg_out_movext3(TCGContext *s, const TCGMovExtend *i1,
+                            const TCGMovExtend *i2, const TCGMovExtend *i3,
+                            int scratch)
+{
+    TCGReg src1 = i1->src;
+    TCGReg src2 = i2->src;
+    TCGReg src3 = i3->src;
+
+    if (i1->dst != src2 && i1->dst != src3) {
+        tcg_out_movext1(s, i1);
+        tcg_out_movext2(s, i2, i3, scratch);
+        return;
+    }
+    if (i2->dst != src1 && i2->dst != src3) {
+        tcg_out_movext1(s, i2);
+        tcg_out_movext2(s, i1, i3, scratch);
+        return;
+    }
+    if (i3->dst != src1 && i3->dst != src2) {
+        tcg_out_movext1(s, i3);
+        tcg_out_movext2(s, i1, i2, scratch);
+        return;
+    }
+
+    /*
+     * There is a cycle.  Since there are only 3 nodes, the cycle is
+     * either "clockwise" or "anti-clockwise", and can be solved with
+     * a single scratch or two xchg.
+     */
+    if (i1->dst == src2 && i2->dst == src3 && i3->dst == src1) {
+        /* "Clockwise" */
+        if (tcg_out_xchg(s, MAX(i1->src_type, i2->src_type), src1, src2)) {
+            tcg_out_xchg(s, MAX(i2->src_type, i3->src_type), src2, src3);
+            /* The data is now in the correct registers, now extend. */
+            tcg_out_movext1_new_src(s, i1, i1->dst);
+            tcg_out_movext1_new_src(s, i2, i2->dst);
+            tcg_out_movext1_new_src(s, i3, i3->dst);
+        } else {
+            tcg_debug_assert(scratch >= 0);
+            tcg_out_mov(s, i1->src_type, scratch, src1);
+            tcg_out_movext1(s, i3);
+            tcg_out_movext1(s, i2);
+            tcg_out_movext1_new_src(s, i1, scratch);
+        }
+    } else if (i1->dst == src3 && i2->dst == src1 && i3->dst == src2) {
+        /* "Anti-clockwise" */
+        if (tcg_out_xchg(s, MAX(i2->src_type, i3->src_type), src2, src3)) {
+            tcg_out_xchg(s, MAX(i1->src_type, i2->src_type), src1, src2);
+            /* The data is now in the correct registers, now extend. */
+            tcg_out_movext1_new_src(s, i1, i1->dst);
+            tcg_out_movext1_new_src(s, i2, i2->dst);
+            tcg_out_movext1_new_src(s, i3, i3->dst);
+        } else {
+            tcg_debug_assert(scratch >= 0);
+            tcg_out_mov(s, i1->src_type, scratch, src1);
+            tcg_out_movext1(s, i2);
+            tcg_out_movext1(s, i3);
+            tcg_out_movext1_new_src(s, i1, scratch);
+        }
+    } else {
+        g_assert_not_reached();
+    }
+}
+
 #define C_PFX1(P, A)                    P##A
 #define C_PFX2(P, A, B)                 P##A##_##B
 #define C_PFX3(P, A, B, C)              P##A##_##B##_##C
@@ -5149,46 +5225,46 @@  static int tcg_out_helper_stk_ofs(TCGType type, unsigned slot)
 
 static void tcg_out_helper_load_regs(TCGContext *s,
                                      unsigned nmov, TCGMovExtend *mov,
-                                     unsigned ntmp, const int *tmp)
+                                     const TCGLdstHelperParam *parm)
 {
+    TCGReg dst3;
+
     switch (nmov) {
-    default:
+    case 4:
         /* The backend must have provided enough temps for the worst case. */
-        tcg_debug_assert(ntmp + 1 >= nmov);
+        tcg_debug_assert(parm->ntmp >= 2);
 
-        for (unsigned i = nmov - 1; i >= 2; --i) {
-            TCGReg dst = mov[i].dst;
-
-            for (unsigned j = 0; j < i; ++j) {
-                if (dst == mov[j].src) {
-                    /*
-                     * Conflict.
-                     * Copy the source to a temporary, recurse for the
-                     * remaining moves, perform the extension from our
-                     * scratch on the way out.
-                     */
-                    TCGReg scratch = tmp[--ntmp];
-                    tcg_out_mov(s, mov[i].src_type, scratch, mov[i].src);
-                    mov[i].src = scratch;
-
-                    tcg_out_helper_load_regs(s, i, mov, ntmp, tmp);
-                    tcg_out_movext1(s, &mov[i]);
-                    return;
-                }
+        dst3 = mov[3].dst;
+        for (unsigned j = 0; j < 3; ++j) {
+            if (dst3 == mov[j].src) {
+                /*
+                 * Conflict. Copy the source to a temporary, perform the
+                 * remaining moves, then the extension from our scratch
+                 * on the way out.
+                 */
+                TCGReg scratch = parm->tmp[1];
+                tcg_out_movext3(s, mov, mov + 1, mov + 2, parm->tmp[0]);
+                tcg_out_movext1_new_src(s, &mov[3], scratch);
+                break;
             }
-
-            /* No conflicts: perform this move and continue. */
-            tcg_out_movext1(s, &mov[i]);
         }
-        /* fall through for the final two moves */
 
+        /* No conflicts: perform this move and continue. */
+        tcg_out_movext1(s, &mov[3]);
+        /* fall through */
+
+    case 3:
+        tcg_out_movext3(s, mov, mov + 1, mov + 2,
+                        parm->ntmp ? parm->tmp[0] : -1);
+        break;
     case 2:
-        tcg_out_movext2(s, mov, mov + 1, ntmp ? tmp[0] : -1);
-        return;
+        tcg_out_movext2(s, mov, mov + 1,
+                        parm->ntmp ? parm->tmp[0] : -1);
+        break;
     case 1:
         tcg_out_movext1(s, mov);
-        return;
-    case 0:
+        break;
+    default:
         g_assert_not_reached();
     }
 }
@@ -5235,7 +5311,7 @@  static void tcg_out_helper_load_slots(TCGContext *s,
     for (i = 0; i < nmov; ++i) {
         mov[i].dst = tcg_target_call_iarg_regs[mov[i].dst];
     }
-    tcg_out_helper_load_regs(s, nmov, mov, parm->ntmp, parm->tmp);
+    tcg_out_helper_load_regs(s, nmov, mov, parm);
 }
 
 static void tcg_out_helper_load_imm(TCGContext *s, unsigned slot,