diff mbox series

[11/20] tcg/i386: Implement avx512 immediate rotate

Message ID 20211218194250.247633-12-richard.henderson@linaro.org
State Superseded
Headers show
Series tcg: vector improvements | expand

Commit Message

Richard Henderson Dec. 18, 2021, 7:42 p.m. UTC
AVX512VL has VPROLD and VPROLQ, layered onto the same
opcode as PSHIFTD, but requires EVEX encoding and W.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 tcg/i386/tcg-target.h     |  2 +-
 tcg/i386/tcg-target.c.inc | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

Comments

Alex Bennée Feb. 2, 2022, 2:05 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> AVX512VL has VPROLD and VPROLQ, layered onto the same
> opcode as PSHIFTD, but requires EVEX encoding and W.
>
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>  tcg/i386/tcg-target.h     |  2 +-
>  tcg/i386/tcg-target.c.inc | 15 +++++++++++++--
>  2 files changed, 14 insertions(+), 3 deletions(-)
>
> diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
> index 12d098ad6c..38c09fd66c 100644
> --- a/tcg/i386/tcg-target.h
> +++ b/tcg/i386/tcg-target.h
> @@ -195,7 +195,7 @@ extern bool have_movbe;
>  #define TCG_TARGET_HAS_not_vec          0
>  #define TCG_TARGET_HAS_neg_vec          0
>  #define TCG_TARGET_HAS_abs_vec          1
> -#define TCG_TARGET_HAS_roti_vec         0
> +#define TCG_TARGET_HAS_roti_vec         have_avx512vl
>  #define TCG_TARGET_HAS_rots_vec         0
>  #define TCG_TARGET_HAS_rotv_vec         0
>  #define TCG_TARGET_HAS_shi_vec          1
> diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
> index c4e6f2e5ea..5ab7c4c0fa 100644
> --- a/tcg/i386/tcg-target.c.inc
> +++ b/tcg/i386/tcg-target.c.inc
> @@ -361,7 +361,7 @@ static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
>  #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
>  #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
>  #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
> -#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
> +#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
>  #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
>  #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
>  #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
> @@ -2906,6 +2906,14 @@ static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
>              insn |= P_VEXW | P_EVEX;
>          }
>          sub = 4;
> +        goto gen_shift;
> +    case INDEX_op_rotli_vec:
> +        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
> +        if (vece == MO_64) {
> +            insn |= P_VEXW;
> +        }
> +        sub = 1;
> +        goto gen_shift;

This could just be a /* fall-through */ although given the large amount
of gotos the switch statement is gathering I'm not sure it makes too
much difference.

Is there any reason why gen_shift couldn't be pushed into a helper
function so we just had:

    static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) {
        tcg_debug_assert(vece != MO_8);
        if (type == TCG_TYPE_V256) {
            insn |= P_VEXL;
        }
        tcg_out_vex_modrm(s, insn, sub, a0, a1);
        tcg_out8(s, a2);
    }

    ...

    case INDEX_op_rotli_vec:
        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
        if (vece == MO_64) {
            insn |= P_VEXW;
        }
        tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2);
        break;

Surely the compiler would inline if needed (and even if it didn't it the
code generation that critical we care about a few cycles)?
Richard Henderson Feb. 3, 2022, 1:26 a.m. UTC | #2
On 2/3/22 01:05, Alex Bennée wrote:
> Is there any reason why gen_shift couldn't be pushed into a helper
> function so we just had:
> 
>      static void tcg_out_vec_shift(s, vece, insn, sub, a0, a1, a2) {
>          tcg_debug_assert(vece != MO_8);
>          if (type == TCG_TYPE_V256) {
>              insn |= P_VEXL;
>          }
>          tcg_out_vex_modrm(s, insn, sub, a0, a1);
>          tcg_out8(s, a2);
>      }
> 
>      ...
> 
>      case INDEX_op_rotli_vec:
>          insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
>          if (vece == MO_64) {
>              insn |= P_VEXW;
>          }
>          tcg_out_vec_shift(s, vece, insn, 1, a0, a1, a2);
>          break;
> 
> Surely the compiler would inline if needed (and even if it didn't it the
> code generation that critical we care about a few cycles)?

Yes, I suppose I could pull out a helper or two.
Just one of those things where something used to be cleaner, and then the code grew.

r~
diff mbox series

Patch

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 12d098ad6c..38c09fd66c 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -195,7 +195,7 @@  extern bool have_movbe;
 #define TCG_TARGET_HAS_not_vec          0
 #define TCG_TARGET_HAS_neg_vec          0
 #define TCG_TARGET_HAS_abs_vec          1
-#define TCG_TARGET_HAS_roti_vec         0
+#define TCG_TARGET_HAS_roti_vec         have_avx512vl
 #define TCG_TARGET_HAS_rots_vec         0
 #define TCG_TARGET_HAS_rotv_vec         0
 #define TCG_TARGET_HAS_shi_vec          1
diff --git a/tcg/i386/tcg-target.c.inc b/tcg/i386/tcg-target.c.inc
index c4e6f2e5ea..5ab7c4c0fa 100644
--- a/tcg/i386/tcg-target.c.inc
+++ b/tcg/i386/tcg-target.c.inc
@@ -361,7 +361,7 @@  static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 #define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
 #define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
 #define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
-#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
+#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
 #define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
 #define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
 #define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
@@ -2906,6 +2906,14 @@  static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
             insn |= P_VEXW | P_EVEX;
         }
         sub = 4;
+        goto gen_shift;
+    case INDEX_op_rotli_vec:
+        insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
+        if (vece == MO_64) {
+            insn |= P_VEXW;
+        }
+        sub = 1;
+        goto gen_shift;
     gen_shift:
         tcg_debug_assert(vece != MO_8);
         if (type == TCG_TYPE_V256) {
@@ -3195,6 +3203,7 @@  static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
     case INDEX_op_sari_vec:
+    case INDEX_op_rotli_vec:
     case INDEX_op_x86_psrldq_vec:
         return C_O1_I1(x, x);
 
@@ -3216,11 +3225,13 @@  int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
     case INDEX_op_xor_vec:
     case INDEX_op_andc_vec:
         return 1;
-    case INDEX_op_rotli_vec:
     case INDEX_op_cmp_vec:
     case INDEX_op_cmpsel_vec:
         return -1;
 
+    case INDEX_op_rotli_vec:
+        return have_avx512vl && vece >= MO_32 ? 1 : -1;
+
     case INDEX_op_shli_vec:
     case INDEX_op_shri_vec:
         /* We must expand the operation for MO_8.  */