diff mbox series

[PULL,32/44] target/mips/mxu: Add Q8MAC Q8MACSU instructions

Message ID 20230710222611.50978-33-philmd@linaro.org
State New
Headers show
Series [PULL,01/44] target/mips: Rework cp0_timer with clock API | expand

Commit Message

Philippe Mathieu-Daudé July 10, 2023, 10:25 p.m. UTC
From: Siarhei Volkau <lis8215@gmail.com>

These instructions do parallel quad 8-bit multiply and accumulate.
They are close to existing Q8MUL Q8MULSU so the generation
function modified to support all of them.
Also the patch fixes decoding of Q8MULSU according to tests on
hardware.

Signed-off-by: Siarhei Volkau <lis8215@gmail.com>
Message-Id: <20230608104222.1520143-30-lis8215@gmail.com>
Signed-off-by: Philippe Mathieu-Daudé <philmd@linaro.org>
---
 target/mips/tcg/mxu_translate.c | 128 +++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 42 deletions(-)
diff mbox series

Patch

diff --git a/target/mips/tcg/mxu_translate.c b/target/mips/tcg/mxu_translate.c
index f8c3fdb103..6a82849ef7 100644
--- a/target/mips/tcg/mxu_translate.c
+++ b/target/mips/tcg/mxu_translate.c
@@ -326,7 +326,7 @@ 
  *          ├─ 110111 ─ OPC_MXU_Q16SAR
  *          │                               23..22
  *          ├─ 111000 ─ OPC_MXU__POOL19 ─┬─ 00 ─ OPC_MXU_Q8MUL
- *          │                            └─ 01 ─ OPC_MXU_Q8MULSU
+ *          │                            └─ 10 ─ OPC_MXU_Q8MULSU
  *          │
  *          │                               20..18
  *          ├─ 111001 ─ OPC_MXU__POOL20 ─┬─ 000 ─ OPC_MXU_Q8MOVZ
@@ -404,6 +404,7 @@  enum {
     OPC_MXU_Q16SAR   = 0x37,
     OPC_MXU__POOL19  = 0x38,
     OPC_MXU__POOL20  = 0x39,
+    OPC_MXU__POOL21  = 0x3A,
 };
 
 
@@ -539,7 +540,7 @@  enum {
  */
 enum {
     OPC_MXU_Q8MUL    = 0x00,
-    OPC_MXU_Q8MULSU  = 0x01,
+    OPC_MXU_Q8MULSU  = 0x02,
 };
 
 /*
@@ -554,6 +555,15 @@  enum {
     OPC_MXU_S32MOVN  = 0x05,
 };
 
+/*
+ * MXU pool 21
+ */
+enum {
+    OPC_MXU_Q8MAC    = 0x00,
+    OPC_MXU_Q8MACSU  = 0x02,
+};
+
+
 /* MXU accumulate add/subtract 1-bit pattern 'aptn1' */
 #define MXU_APTN1_A    0
 #define MXU_APTN1_S    1
@@ -1328,13 +1338,17 @@  static void gen_mxu_s16mad(DisasContext *ctx)
 }
 
 /*
- * Q8MUL   XRa, XRb, XRc, XRd - Parallel unsigned 8 bit pattern multiply
- * Q8MULSU XRa, XRb, XRc, XRd - Parallel signed 8 bit pattern multiply
+ * Q8MUL   XRa, XRb, XRc, XRd - Parallel quad unsigned 8 bit multiply
+ * Q8MULSU XRa, XRb, XRc, XRd - Parallel quad signed 8 bit multiply
+ * Q8MAC   XRa, XRb, XRc, XRd - Parallel quad unsigned 8 bit multiply
+ *   and accumulate
+ * Q8MACSU XRa, XRb, XRc, XRd - Parallel quad signed 8 bit multiply
+ *   and accumulate
  */
-static void gen_mxu_q8mul_q8mulsu(DisasContext *ctx)
+static void gen_mxu_q8mul_mac(DisasContext *ctx, bool su, bool mac)
 {
     TCGv t0, t1, t2, t3, t4, t5, t6, t7;
-    uint32_t XRa, XRb, XRc, XRd, sel;
+    uint32_t XRa, XRb, XRc, XRd, aptn2;
 
     t0 = tcg_temp_new();
     t1 = tcg_temp_new();
@@ -1349,54 +1363,60 @@  static void gen_mxu_q8mul_q8mulsu(DisasContext *ctx)
     XRb = extract32(ctx->opcode, 10, 4);
     XRc = extract32(ctx->opcode, 14, 4);
     XRd = extract32(ctx->opcode, 18, 4);
-    sel = extract32(ctx->opcode, 22, 2);
+    aptn2 = extract32(ctx->opcode, 24, 2);
 
     gen_load_mxu_gpr(t3, XRb);
     gen_load_mxu_gpr(t7, XRc);
 
-    if (sel == 0x2) {
-        /* Q8MULSU */
-        tcg_gen_ext8s_tl(t0, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t1, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t2, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8s_tl(t3, t3);
+    if (su) {
+        /* Q8MULSU / Q8MACSU */
+        tcg_gen_sextract_tl(t0, t3,  0, 8);
+        tcg_gen_sextract_tl(t1, t3,  8, 8);
+        tcg_gen_sextract_tl(t2, t3, 16, 8);
+        tcg_gen_sextract_tl(t3, t3, 24, 8);
     } else {
-        /* Q8MUL */
-        tcg_gen_ext8u_tl(t0, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t1, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t2, t3);
-        tcg_gen_shri_tl(t3, t3, 8);
-        tcg_gen_ext8u_tl(t3, t3);
+        /* Q8MUL / Q8MAC */
+        tcg_gen_extract_tl(t0, t3,  0, 8);
+        tcg_gen_extract_tl(t1, t3,  8, 8);
+        tcg_gen_extract_tl(t2, t3, 16, 8);
+        tcg_gen_extract_tl(t3, t3, 24, 8);
     }
 
-    tcg_gen_ext8u_tl(t4, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t5, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t6, t7);
-    tcg_gen_shri_tl(t7, t7, 8);
-    tcg_gen_ext8u_tl(t7, t7);
+    tcg_gen_extract_tl(t4, t7,  0, 8);
+    tcg_gen_extract_tl(t5, t7,  8, 8);
+    tcg_gen_extract_tl(t6, t7, 16, 8);
+    tcg_gen_extract_tl(t7, t7, 24, 8);
 
     tcg_gen_mul_tl(t0, t0, t4);
     tcg_gen_mul_tl(t1, t1, t5);
     tcg_gen_mul_tl(t2, t2, t6);
     tcg_gen_mul_tl(t3, t3, t7);
 
-    tcg_gen_andi_tl(t0, t0, 0xFFFF);
-    tcg_gen_andi_tl(t1, t1, 0xFFFF);
-    tcg_gen_andi_tl(t2, t2, 0xFFFF);
-    tcg_gen_andi_tl(t3, t3, 0xFFFF);
+    if (mac) {
+        gen_load_mxu_gpr(t4, XRd);
+        gen_load_mxu_gpr(t5, XRa);
+        tcg_gen_extract_tl(t6, t4,  0, 16);
+        tcg_gen_extract_tl(t7, t4, 16, 16);
+        if (aptn2 & 1) {
+            tcg_gen_sub_tl(t0, t6, t0);
+            tcg_gen_sub_tl(t1, t7, t1);
+        } else {
+            tcg_gen_add_tl(t0, t6, t0);
+            tcg_gen_add_tl(t1, t7, t1);
+        }
+        tcg_gen_extract_tl(t6, t5,  0, 16);
+        tcg_gen_extract_tl(t7, t5, 16, 16);
+        if (aptn2 & 2) {
+            tcg_gen_sub_tl(t2, t6, t2);
+            tcg_gen_sub_tl(t3, t7, t3);
+        } else {
+            tcg_gen_add_tl(t2, t6, t2);
+            tcg_gen_add_tl(t3, t7, t3);
+        }
+    }
 
-    tcg_gen_shli_tl(t1, t1, 16);
-    tcg_gen_shli_tl(t3, t3, 16);
-
-    tcg_gen_or_tl(t0, t0, t1);
-    tcg_gen_or_tl(t1, t2, t3);
+    tcg_gen_deposit_tl(t0, t0, t1, 16, 16);
+    tcg_gen_deposit_tl(t1, t2, t3, 16, 16);
 
     gen_store_mxu_gpr(t0, XRd);
     gen_store_mxu_gpr(t1, XRa);
@@ -4548,12 +4568,14 @@  static void decode_opc_mxu__pool18(DisasContext *ctx)
 
 static void decode_opc_mxu__pool19(DisasContext *ctx)
 {
-    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+    uint32_t opcode = extract32(ctx->opcode, 22, 4);
 
     switch (opcode) {
     case OPC_MXU_Q8MUL:
+        gen_mxu_q8mul_mac(ctx, false, false);
+        break;
     case OPC_MXU_Q8MULSU:
-        gen_mxu_q8mul_q8mulsu(ctx);
+        gen_mxu_q8mul_mac(ctx, true, false);
         break;
     default:
         MIPS_INVAL("decode_opc_mxu");
@@ -4592,6 +4614,25 @@  static void decode_opc_mxu__pool20(DisasContext *ctx)
     }
 }
 
+static void decode_opc_mxu__pool21(DisasContext *ctx)
+{
+    uint32_t opcode = extract32(ctx->opcode, 22, 2);
+
+    switch (opcode) {
+    case OPC_MXU_Q8MAC:
+        gen_mxu_q8mul_mac(ctx, false, true);
+        break;
+    case OPC_MXU_Q8MACSU:
+        gen_mxu_q8mul_mac(ctx, true, true);
+        break;
+    default:
+        MIPS_INVAL("decode_opc_mxu");
+        gen_reserved_instruction(ctx);
+        break;
+    }
+}
+
+
 bool decode_ase_mxu(DisasContext *ctx, uint32_t insn)
 {
     uint32_t opcode = extract32(insn, 0, 6);
@@ -4755,6 +4796,9 @@  bool decode_ase_mxu(DisasContext *ctx, uint32_t insn)
         case OPC_MXU__POOL20:
             decode_opc_mxu__pool20(ctx);
             break;
+        case OPC_MXU__POOL21:
+            decode_opc_mxu__pool21(ctx);
+            break;
         default:
             return false;
         }