diff mbox series

[8/9] target/arm: Implement FP data-processing (2 source) for fp16

Message ID 20180425012300.14698-9-richard.henderson@linaro.org
State Superseded
Headers show
Series target/arm: Fixups for ARM_FEATURE_V8_FP16 | expand

Commit Message

Richard Henderson April 25, 2018, 1:22 a.m. UTC
We missed all of the scalar fp16 binary operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/translate-a64.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

-- 
2.14.3

Comments

Alex Bennée May 1, 2018, 11:13 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> We missed all of the scalar fp16 binary operations.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/translate-a64.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++

>  1 file changed, 73 insertions(+)

>

> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c

> index 794ede7222..11b90b7eb0 100644

> --- a/target/arm/translate-a64.c

> +++ b/target/arm/translate-a64.c

> @@ -532,6 +532,14 @@ static TCGv_i32 read_fp_sreg(DisasContext *s, int reg)

>      return v;

>  }

>

> +static TCGv_i32 read_fp_hreg(DisasContext *s, int reg)

> +{

> +    TCGv_i32 v = tcg_temp_new_i32();

> +

> +    tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16));

> +    return v;

> +}

> +

>  /* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64).

>   * If SVE is not enabled, then there are only 128 bits in the vector.

>   */

> @@ -4968,6 +4976,61 @@ static void handle_fp_2src_double(DisasContext *s, int opcode,

>      tcg_temp_free_i64(tcg_res);

>  }

>

> +/* Floating-point data-processing (2 source) - half precision */

> +static void handle_fp_2src_half(DisasContext *s, int opcode,

> +                                int rd, int rn, int rm)

> +{

> +    TCGv_i32 tcg_op1;

> +    TCGv_i32 tcg_op2;

> +    TCGv_i32 tcg_res;

> +    TCGv_ptr fpst;

> +

> +    tcg_res = tcg_temp_new_i32();

> +    fpst = get_fpstatus_ptr(true);

> +    tcg_op1 = read_fp_hreg(s, rn);

> +    tcg_op2 = read_fp_hreg(s, rm);

> +

> +    switch (opcode) {

> +    case 0x0: /* FMUL */

> +        gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x1: /* FDIV */

> +        gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x2: /* FADD */

> +        gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x3: /* FSUB */

> +        gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x4: /* FMAX */

> +        gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x5: /* FMIN */

> +        gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x6: /* FMAXNM */

> +        gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x7: /* FMINNM */

> +        gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        break;

> +    case 0x8: /* FNMUL */

> +        gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);

> +        tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000);

> +        break;

> +    default:

> +        g_assert_not_reached();

> +    }

> +

> +    write_fp_sreg(s, rd, tcg_res);


If we are going to the trouble of adding a read_fp_hreg() we might as
well do the same for the write case. Then we can convert the various:

  read_vec_element_i32(s, tcg_vm, rm, 0, MO_16);

that we used before.

> +

> +    tcg_temp_free_ptr(fpst);

> +    tcg_temp_free_i32(tcg_op1);

> +    tcg_temp_free_i32(tcg_op2);

> +    tcg_temp_free_i32(tcg_res);

> +}

> +

>  /* Floating point data-processing (2 source)

>   *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0

>   * +---+---+---+-----------+------+---+------+--------+-----+------+------+

> @@ -5000,6 +5063,16 @@ static void disas_fp_2src(DisasContext *s, uint32_t insn)

>          }

>          handle_fp_2src_double(s, opcode, rd, rn, rm);

>          break;

> +    case 3:

> +        if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {

> +            unallocated_encoding(s);

> +            return;

> +        }

> +        if (!fp_access_check(s)) {

> +            return;

> +        }

> +        handle_fp_2src_half(s, opcode, rd, rn, rm);

> +        break;

>      default:

>          unallocated_encoding(s);

>      }


Otherwise:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>



--
Alex Bennée
Richard Henderson May 2, 2018, 6:28 p.m. UTC | #2
On 05/01/2018 04:13 AM, Alex Bennée wrote:
> If we are going to the trouble of adding a read_fp_hreg() we might as

> well do the same for the write case. Then we can convert the various:

> 

>   read_vec_element_i32(s, tcg_vm, rm, 0, MO_16);

> 

> that we used before.


Ok, I've split these new functions into two new patches.


r~
Richard Henderson May 2, 2018, 6:47 p.m. UTC | #3
On 05/01/2018 04:13 AM, Alex Bennée wrote:
> If we are going to the trouble of adding a read_fp_hreg() we might as

> well do the same for the write case.


The write case is much harder.

Almost all of the time we have a helper function that has already zero-extended
the result from 16 to 32-bits, and therefore write_fp_sreg is perfectly fine.
In more than a few cases, this writeback is shared by both single and half
precision paths and so adjusting the writeback is not appropriate.

There are in fact zero instances at this point in the tree that do require
extra zero-extension.  So I'm going to leave this alone.


r~
diff mbox series

Patch

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 794ede7222..11b90b7eb0 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -532,6 +532,14 @@  static TCGv_i32 read_fp_sreg(DisasContext *s, int reg)
     return v;
 }
 
+static TCGv_i32 read_fp_hreg(DisasContext *s, int reg)
+{
+    TCGv_i32 v = tcg_temp_new_i32();
+
+    tcg_gen_ld16u_i32(v, cpu_env, fp_reg_offset(s, reg, MO_16));
+    return v;
+}
+
 /* Clear the bits above an N-bit vector, for N = (is_q ? 128 : 64).
  * If SVE is not enabled, then there are only 128 bits in the vector.
  */
@@ -4968,6 +4976,61 @@  static void handle_fp_2src_double(DisasContext *s, int opcode,
     tcg_temp_free_i64(tcg_res);
 }
 
+/* Floating-point data-processing (2 source) - half precision */
+static void handle_fp_2src_half(DisasContext *s, int opcode,
+                                int rd, int rn, int rm)
+{
+    TCGv_i32 tcg_op1;
+    TCGv_i32 tcg_op2;
+    TCGv_i32 tcg_res;
+    TCGv_ptr fpst;
+
+    tcg_res = tcg_temp_new_i32();
+    fpst = get_fpstatus_ptr(true);
+    tcg_op1 = read_fp_hreg(s, rn);
+    tcg_op2 = read_fp_hreg(s, rm);
+
+    switch (opcode) {
+    case 0x0: /* FMUL */
+        gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x1: /* FDIV */
+        gen_helper_advsimd_divh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x2: /* FADD */
+        gen_helper_advsimd_addh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x3: /* FSUB */
+        gen_helper_advsimd_subh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x4: /* FMAX */
+        gen_helper_advsimd_maxh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x5: /* FMIN */
+        gen_helper_advsimd_minh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x6: /* FMAXNM */
+        gen_helper_advsimd_maxnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x7: /* FMINNM */
+        gen_helper_advsimd_minnumh(tcg_res, tcg_op1, tcg_op2, fpst);
+        break;
+    case 0x8: /* FNMUL */
+        gen_helper_advsimd_mulh(tcg_res, tcg_op1, tcg_op2, fpst);
+        tcg_gen_xori_i32(tcg_res, tcg_res, 0x8000);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+
+    write_fp_sreg(s, rd, tcg_res);
+
+    tcg_temp_free_ptr(fpst);
+    tcg_temp_free_i32(tcg_op1);
+    tcg_temp_free_i32(tcg_op2);
+    tcg_temp_free_i32(tcg_res);
+}
+
 /* Floating point data-processing (2 source)
  *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+---+------+--------+-----+------+------+
@@ -5000,6 +5063,16 @@  static void disas_fp_2src(DisasContext *s, uint32_t insn)
         }
         handle_fp_2src_double(s, opcode, rd, rn, rm);
         break;
+    case 3:
+        if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
+            unallocated_encoding(s);
+            return;
+        }
+        if (!fp_access_check(s)) {
+            return;
+        }
+        handle_fp_2src_half(s, opcode, rd, rn, rm);
+        break;
     default:
         unallocated_encoding(s);
     }