diff mbox series

[9/9] target/arm: Implement FP data-processing (3 source) for fp16

Message ID 20180425012300.14698-10-richard.henderson@linaro.org
State Superseded
Headers show
Series target/arm: Fixups for ARM_FEATURE_V8_FP16 | expand

Commit Message

Richard Henderson April 25, 2018, 1:23 a.m. UTC
We missed all of the scalar fp16 fma operations.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

-- 
2.14.3

Comments

Alex Bennée May 1, 2018, 11:21 a.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> We missed all of the scalar fp16 fma operations.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++

>  1 file changed, 48 insertions(+)

>

> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c

> index 11b90b7eb0..0cb1fc4d67 100644

> --- a/target/arm/translate-a64.c

> +++ b/target/arm/translate-a64.c

> @@ -5154,6 +5154,44 @@ static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1,

>      tcg_temp_free_i64(tcg_res);

>  }

>

> +/* Floating-point data-processing (3 source) - half precision */

> +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1,

> +                                int rd, int rn, int rm, int ra)

> +{

> +    TCGv_i32 tcg_op1, tcg_op2, tcg_op3;

> +    TCGv_i32 tcg_res = tcg_temp_new_i32();

> +    TCGv_ptr fpst = get_fpstatus_ptr(true);

> +

> +    tcg_op1 = read_fp_hreg(s, rn);

> +    tcg_op2 = read_fp_hreg(s, rm);

> +    tcg_op3 = read_fp_hreg(s, ra);

> +

> +    /* These are fused multiply-add, and must be done as one

> +     * floating point operation with no rounding between the

> +     * multiplication and addition steps.


I got confused first time reading this as we cover F[N]M[ADD|SUB].
Perhaps that is better enumerated at the top of the function?

Anyway:

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>


> +     * NB that doing the negations here as separate steps is

> +     * correct : an input NaN should come out with its sign bit

> +     * flipped if it is a negated-input.

> +     */

> +    if (o1 == true) {

> +        tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000);

> +    }

> +

> +    if (o0 != o1) {

> +        tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000);

> +    }

> +

> +    gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst);

> +

> +    write_fp_sreg(s, rd, tcg_res);

> +

> +    tcg_temp_free_ptr(fpst);

> +    tcg_temp_free_i32(tcg_op1);

> +    tcg_temp_free_i32(tcg_op2);

> +    tcg_temp_free_i32(tcg_op3);

> +    tcg_temp_free_i32(tcg_res);

> +}

> +

>  /* Floating point data-processing (3 source)

>   *   31  30  29 28       24 23  22  21  20  16  15  14  10 9    5 4    0

>   * +---+---+---+-----------+------+----+------+----+------+------+------+

> @@ -5183,6 +5221,16 @@ static void disas_fp_3src(DisasContext *s, uint32_t insn)

>          }

>          handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra);

>          break;

> +    case 3:

> +        if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {

> +            unallocated_encoding(s);

> +            return;

> +        }

> +        if (!fp_access_check(s)) {

> +            return;

> +        }

> +        handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra);

> +        break;

>      default:

>          unallocated_encoding(s);

>      }



--
Alex Bennée
Richard Henderson May 2, 2018, 6:49 p.m. UTC | #2
On 05/01/2018 04:21 AM, Alex Bennée wrote:
> 

> Richard Henderson <richard.henderson@linaro.org> writes:

> 

>> We missed all of the scalar fp16 fma operations.

>>

>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

>> ---

>>  target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++

>>  1 file changed, 48 insertions(+)

>>

>> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c

>> index 11b90b7eb0..0cb1fc4d67 100644

>> --- a/target/arm/translate-a64.c

>> +++ b/target/arm/translate-a64.c

>> @@ -5154,6 +5154,44 @@ static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1,

>>      tcg_temp_free_i64(tcg_res);

>>  }

>>

>> +/* Floating-point data-processing (3 source) - half precision */

>> +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1,

>> +                                int rd, int rn, int rm, int ra)

>> +{

>> +    TCGv_i32 tcg_op1, tcg_op2, tcg_op3;

>> +    TCGv_i32 tcg_res = tcg_temp_new_i32();

>> +    TCGv_ptr fpst = get_fpstatus_ptr(true);

>> +

>> +    tcg_op1 = read_fp_hreg(s, rn);

>> +    tcg_op2 = read_fp_hreg(s, rm);

>> +    tcg_op3 = read_fp_hreg(s, ra);

>> +

>> +    /* These are fused multiply-add, and must be done as one

>> +     * floating point operation with no rounding between the

>> +     * multiplication and addition steps.

> 

> I got confused first time reading this as we cover F[N]M[ADD|SUB].

> Perhaps that is better enumerated at the top of the function?


*shrug* It's an exact copy of the beginnings of the single and double
functions.  Is it really that confusing?


r~
diff mbox series

Patch

diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 11b90b7eb0..0cb1fc4d67 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -5154,6 +5154,44 @@  static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1,
     tcg_temp_free_i64(tcg_res);
 }
 
+/* Floating-point data-processing (3 source) - half precision */
+static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1,
+                                int rd, int rn, int rm, int ra)
+{
+    TCGv_i32 tcg_op1, tcg_op2, tcg_op3;
+    TCGv_i32 tcg_res = tcg_temp_new_i32();
+    TCGv_ptr fpst = get_fpstatus_ptr(true);
+
+    tcg_op1 = read_fp_hreg(s, rn);
+    tcg_op2 = read_fp_hreg(s, rm);
+    tcg_op3 = read_fp_hreg(s, ra);
+
+    /* These are fused multiply-add, and must be done as one
+     * floating point operation with no rounding between the
+     * multiplication and addition steps.
+     * NB that doing the negations here as separate steps is
+     * correct : an input NaN should come out with its sign bit
+     * flipped if it is a negated-input.
+     */
+    if (o1 == true) {
+        tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000);
+    }
+
+    if (o0 != o1) {
+        tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000);
+    }
+
+    gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst);
+
+    write_fp_sreg(s, rd, tcg_res);
+
+    tcg_temp_free_ptr(fpst);
+    tcg_temp_free_i32(tcg_op1);
+    tcg_temp_free_i32(tcg_op2);
+    tcg_temp_free_i32(tcg_op3);
+    tcg_temp_free_i32(tcg_res);
+}
+
 /* Floating point data-processing (3 source)
  *   31  30  29 28       24 23  22  21  20  16  15  14  10 9    5 4    0
  * +---+---+---+-----------+------+----+------+----+------+------+------+
@@ -5183,6 +5221,16 @@  static void disas_fp_3src(DisasContext *s, uint32_t insn)
         }
         handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra);
         break;
+    case 3:
+        if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) {
+            unallocated_encoding(s);
+            return;
+        }
+        if (!fp_access_check(s)) {
+            return;
+        }
+        handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra);
+        break;
     default:
         unallocated_encoding(s);
     }