diff mbox series

[33/36] target/arm: Convert Neon fp VMUL, VMLA, VMLS 3-reg-same insns to decodetree

Message ID 20200430181003.21682-34-peter.maydell@linaro.org
State Superseded
Headers show
Series target/arm: Convert Neon to decodetree (part 1) | expand

Commit Message

Peter Maydell April 30, 2020, 6:10 p.m. UTC
Convert the Neon integer VMUL, VMLA, and VMLS 3-reg-same inssn to
decodetree.

Since VMLA and VMLS accumulate into the destination register, we add
a reads_vd parameter to do_3same_fp() which tells it to load the
old value into vd before calling the callback function, in the same
way that the translate-vfp.inc.c do_vfp_3op_sp() and do_vfp_3op_dp()
functions work.

This conversion fixes in passing an underdecoding for VMUL
(originally reported by Fredrik Strupe <fredrik@strupe.net>): bit 1
of the 'size' field must be 0.  The old decoder didn't enforce this,
but the decodetree pattern does.

The gen_VMLA_fp_reg() function performs the addition operation
with the operands in the opposite order to the old decoder:
since Neon sets 'default NaN mode' float32_add operations are
commutative so there is no behaviour difference, but putting
them this way around matches the Arm ARM pseudocode and the
required operation order for the subtraction in gen_VMLS_fp_reg().

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

---
 target/arm/translate-neon.inc.c | 49 +++++++++++++++++++++++++++------
 target/arm/translate.c          | 17 +-----------
 target/arm/neon-dp.decode       |  3 ++
 3 files changed, 44 insertions(+), 25 deletions(-)

-- 
2.20.1

Comments

Richard Henderson May 1, 2020, 4:07 a.m. UTC | #1
On 4/30/20 11:10 AM, Peter Maydell wrote:
> Convert the Neon integer VMUL, VMLA, and VMLS 3-reg-same inssn to

> decodetree.

> 

> Since VMLA and VMLS accumulate into the destination register, we add

> a reads_vd parameter to do_3same_fp() which tells it to load the

> old value into vd before calling the callback function, in the same

> way that the translate-vfp.inc.c do_vfp_3op_sp() and do_vfp_3op_dp()

> functions work.

> 

> This conversion fixes in passing an underdecoding for VMUL

> (originally reported by Fredrik Strupe <fredrik@strupe.net>): bit 1

> of the 'size' field must be 0.  The old decoder didn't enforce this,

> but the decodetree pattern does.

> 

> The gen_VMLA_fp_reg() function performs the addition operation

> with the operands in the opposite order to the old decoder:

> since Neon sets 'default NaN mode' float32_add operations are

> commutative so there is no behaviour difference, but putting

> them this way around matches the Arm ARM pseudocode and the

> required operation order for the subtraction in gen_VMLS_fp_reg().

> 

> Signed-off-by: Peter Maydell <peter.maydell@linaro.org>

> ---

>  target/arm/translate-neon.inc.c | 49 +++++++++++++++++++++++++++------

>  target/arm/translate.c          | 17 +-----------

>  target/arm/neon-dp.decode       |  3 ++

>  3 files changed, 44 insertions(+), 25 deletions(-)


Note that we do have helper_gvec_fmul_s, similar to fadd before, but currently
no mla.

Otherwise,

Reviewed-by: Richard Henderson <richard.henderson@linaro.org>


r~
diff mbox series

Patch

diff --git a/target/arm/translate-neon.inc.c b/target/arm/translate-neon.inc.c
index 30832309924..47879bbb6c9 100644
--- a/target/arm/translate-neon.inc.c
+++ b/target/arm/translate-neon.inc.c
@@ -1322,9 +1322,15 @@  static bool trans_VQRDMULH_3s(DisasContext *s, arg_3same *a)
     return do_3same_32(s, a, fns[a->size - 1]);
 }
 
-static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
+static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn,
+                        bool reads_vd)
 {
-    /* FP operations handled elementwise 32 bits at a time */
+    /*
+     * FP operations handled elementwise 32 bits at a time.
+     * If reads_vd is true then the old value of Vd will be
+     * loaded before calling the callback function. This is
+     * used for multiply-accumulate type operations.
+     */
     TCGv_i32 tmp, tmp2;
     int pass;
 
@@ -1350,9 +1356,16 @@  static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
     for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
         tmp = neon_load_reg(a->vn, pass);
         tmp2 = neon_load_reg(a->vm, pass);
-        fn(tmp, tmp, tmp2, fpstatus);
+        if (reads_vd) {
+            TCGv_i32 tmp_rd = neon_load_reg(a->vd, pass);
+            fn(tmp_rd, tmp, tmp2, fpstatus);
+            neon_store_reg(a->vd, pass, tmp_rd);
+            tcg_temp_free_i32(tmp);
+        } else {
+            fn(tmp, tmp, tmp2, fpstatus);
+            neon_store_reg(a->vd, pass, tmp);
+        }
         tcg_temp_free_i32(tmp2);
-        neon_store_reg(a->vd, pass, tmp);
     }
     tcg_temp_free_ptr(fpstatus);
     return true;
@@ -1362,19 +1375,37 @@  static bool do_3same_fp(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
  * For all the functions using this macro, size == 1 means fp16,
  * which is an architecture extension we don't implement yet.
  */
-#define DO_3S_FP(INSN,FUNC)                                         \
+#define DO_3S_FP(INSN,FUNC,READS_VD)                                \
     static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
     {                                                               \
         if (a->size != 0) {                                         \
             /* TODO fp16 support */                                 \
             return false;                                           \
         }                                                           \
-        return do_3same_fp(s, a, FUNC);                             \
+        return do_3same_fp(s, a, FUNC, READS_VD);                   \
     }
 
-DO_3S_FP(VADD, gen_helper_vfp_adds)
-DO_3S_FP(VSUB, gen_helper_vfp_subs)
-DO_3S_FP(VABD, gen_helper_neon_abd_f32)
+DO_3S_FP(VADD, gen_helper_vfp_adds, false)
+DO_3S_FP(VSUB, gen_helper_vfp_subs, false)
+DO_3S_FP(VABD, gen_helper_neon_abd_f32, false)
+DO_3S_FP(VMUL, gen_helper_vfp_muls, false)
+
+static void gen_VMLA_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
+                            TCGv_ptr fpstatus)
+{
+    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
+    gen_helper_vfp_adds(vd, vd, vn, fpstatus);
+}
+
+static void gen_VMLS_fp_3s(TCGv_i32 vd, TCGv_i32 vn, TCGv_i32 vm,
+                            TCGv_ptr fpstatus)
+{
+    gen_helper_vfp_muls(vn, vn, vm, fpstatus);
+    gen_helper_vfp_subs(vd, vd, vn, fpstatus);
+}
+
+DO_3S_FP(VMLA, gen_VMLA_fp_3s, true)
+DO_3S_FP(VMLS, gen_VMLS_fp_3s, true)
 
 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a, VFPGen3OpSPFn *fn)
 {
diff --git a/target/arm/translate.c b/target/arm/translate.c
index 5ae982ee253..57343daa10a 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4785,6 +4785,7 @@  static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
         case NEON_3R_VPADD_VQRDMLAH:
         case NEON_3R_VQDMULH_VQRDMULH:
         case NEON_3R_FLOAT_ARITH:
+        case NEON_3R_FLOAT_MULTIPLY:
             /* Already handled by decodetree */
             return 1;
         }
@@ -4831,22 +4832,6 @@  static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
         tmp = neon_load_reg(rn, pass);
         tmp2 = neon_load_reg(rm, pass);
         switch (op) {
-        case NEON_3R_FLOAT_MULTIPLY:
-        {
-            TCGv_ptr fpstatus = get_fpstatus_ptr(1);
-            gen_helper_vfp_muls(tmp, tmp, tmp2, fpstatus);
-            if (!u) {
-                tcg_temp_free_i32(tmp2);
-                tmp2 = neon_load_reg(rd, pass);
-                if (size == 0) {
-                    gen_helper_vfp_adds(tmp, tmp, tmp2, fpstatus);
-                } else {
-                    gen_helper_vfp_subs(tmp, tmp2, tmp, fpstatus);
-                }
-            }
-            tcg_temp_free_ptr(fpstatus);
-            break;
-        }
         case NEON_3R_FLOAT_CMP:
         {
             TCGv_ptr fpstatus = get_fpstatus_ptr(1);
diff --git a/target/arm/neon-dp.decode b/target/arm/neon-dp.decode
index 378c2dd5105..96866c03db4 100644
--- a/target/arm/neon-dp.decode
+++ b/target/arm/neon-dp.decode
@@ -165,5 +165,8 @@  VADD_fp_3s       1111 001 0 0 . 0 . .... .... 1101 ... 0 .... @3same_fp
 VSUB_fp_3s       1111 001 0 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
 VPADD_fp_3s      1111 001 1 0 . 0 . .... .... 1101 ... 0 .... @3same_fp_q0
 VABD_fp_3s       1111 001 1 0 . 1 . .... .... 1101 ... 0 .... @3same_fp
+VMLA_fp_3s       1111 001 0 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
+VMLS_fp_3s       1111 001 0 0 . 1 . .... .... 1101 ... 1 .... @3same_fp
+VMUL_fp_3s       1111 001 1 0 . 0 . .... .... 1101 ... 1 .... @3same_fp
 VPMAX_fp_3s      1111 001 1 0 . 0 . .... .... 1111 ... 0 .... @3same_fp_q0
 VPMIN_fp_3s      1111 001 1 0 . 1 . .... .... 1111 ... 0 .... @3same_fp_q0